├── .gitignore
├── LICENSE.txt
├── Makefile
├── README.md
├── ext-net
    ├── README.md
    ├── example
    │   ├── Makefile
    │   ├── nccl
    │   │   ├── common.h
    │   │   ├── err.h
    │   │   ├── net.h
    │   │   ├── net_device.h
    │   │   ├── net_v10.h
    │   │   ├── net_v2.h
    │   │   ├── net_v3.h
    │   │   ├── net_v4.h
    │   │   ├── net_v5.h
    │   │   ├── net_v6.h
    │   │   ├── net_v7.h
    │   │   ├── net_v8.h
    │   │   ├── net_v9.h
    │   │   └── types.h
    │   └── plugin.c
    └── google-fastsocket
    │   └── Makefile
├── ext-profiler
    ├── README.md
    └── example
    │   ├── Makefile
    │   ├── README.md
    │   ├── event.c
    │   ├── event.h
    │   ├── nccl
    │       ├── common.h
    │       ├── err.h
    │       ├── net_ib_v1.h
    │       ├── net_socket_v1.h
    │       ├── profiler.h
    │       ├── profiler_net.h
    │       ├── profiler_v1.h
    │       ├── profiler_v2.h
    │       ├── profiler_v3.h
    │       ├── profiler_v4.h
    │       └── types.h
    │   ├── plugin.c
    │   ├── plugin.h
    │   ├── print_event.c
    │   └── print_event.h
├── ext-tuner
    └── example
    │   ├── Makefile
    │   ├── nccl
    │       ├── common.h
    │       ├── err.h
    │       └── tuner.h
    │   └── plugin.c
├── makefiles
    ├── common.mk
    ├── formatting.mk
    └── version.mk
├── pkg
    ├── Makefile
    ├── debian
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── changelog.in
    │   ├── compat
    │   ├── control.in
    │   ├── copyright
    │   ├── gbp.conf
    │   ├── libnccl-dev.install.in
    │   ├── libnccl2.install.in
    │   ├── rules
    │   └── source
    │   │   └── format
    ├── redhat
    │   ├── Makefile
    │   └── nccl.spec.in
    ├── srctxz
    │   ├── Makefile
    │   └── create_srctxz.sh.in
    └── txz
    │   ├── Makefile
    │   └── create_txz.sh.in
└── src
    ├── Makefile
    ├── allocator.cc
    ├── bootstrap.cc
    ├── channel.cc
    ├── collectives.cc
    ├── debug.cc
    ├── device
        ├── Makefile
        ├── all_gather.h
        ├── all_reduce.h
        ├── broadcast.h
        ├── common.cu
        ├── common.h
        ├── common_kernel.h
        ├── generate.py
        ├── network
        │   └── unpack
        │   │   ├── unpack.h
        │   │   └── unpack_defs.h
        ├── onerank.cu
        ├── op128.h
        ├── primitives.h
        ├── prims_ll.h
        ├── prims_ll128.h
        ├── prims_simple.h
        ├── reduce.h
        ├── reduce_kernel.h
        ├── reduce_scatter.h
        ├── sendrecv.h
        └── symmetric
        │   ├── all_gather.cuh
        │   ├── all_reduce.cuh
        │   ├── generate.py
        │   ├── kernel.cuh
        │   ├── primitives.cuh
        │   └── reduce_scatter.cuh
    ├── enhcompat.cc
    ├── enqueue.cc
    ├── graph
        ├── connect.cc
        ├── paths.cc
        ├── rings.cc
        ├── rings.h
        ├── search.cc
        ├── topo.cc
        ├── topo.h
        ├── trees.cc
        ├── tuning.cc
        ├── xml.cc
        └── xml.h
    ├── group.cc
    ├── include
        ├── alloc.h
        ├── allocator.h
        ├── argcheck.h
        ├── bitops.h
        ├── bootstrap.h
        ├── channel.h
        ├── checks.h
        ├── coll_net.h
        ├── collectives.h
        ├── comm.h
        ├── core.h
        ├── cpuset.h
        ├── cudawrap.h
        ├── debug.h
        ├── device.h
        ├── enqueue.h
        ├── gdrwrap.h
        ├── graph.h
        ├── group.h
        ├── ibvcore.h
        ├── ibvsymbols.h
        ├── ibvwrap.h
        ├── info.h
        ├── ipcsocket.h
        ├── mlx5
        │   ├── mlx5dvcore.h
        │   ├── mlx5dvsymbols.h
        │   └── mlx5dvwrap.h
        ├── mnnvl.h
        ├── nccl_common.h
        ├── net.h
        ├── net_device.h
        ├── nvmlwrap.h
        ├── nvtx.h
        ├── nvtx3
        │   ├── nvToolsExt.h
        │   ├── nvToolsExtCounters.h
        │   ├── nvToolsExtCuda.h
        │   ├── nvToolsExtCudaRt.h
        │   ├── nvToolsExtMem.h
        │   ├── nvToolsExtMemCudaRt.h
        │   ├── nvToolsExtOpenCL.h
        │   ├── nvToolsExtPayload.h
        │   ├── nvToolsExtPayloadHelper.h
        │   ├── nvToolsExtSemanticsCounters.h
        │   ├── nvToolsExtSemanticsScope.h
        │   ├── nvToolsExtSync.h
        │   ├── nvtx3.hpp
        │   └── nvtxDetail
        │   │   ├── nvtxExtHelperMacros.h
        │   │   ├── nvtxExtImpl.h
        │   │   ├── nvtxExtImplCounters_v1.h
        │   │   ├── nvtxExtImplMemCudaRt_v1.h
        │   │   ├── nvtxExtImplMem_v1.h
        │   │   ├── nvtxExtImplPayload_v1.h
        │   │   ├── nvtxExtInit.h
        │   │   ├── nvtxExtPayloadHelperInternal.h
        │   │   ├── nvtxExtPayloadTypeInfo.h
        │   │   ├── nvtxExtTypes.h
        │   │   ├── nvtxImpl.h
        │   │   ├── nvtxImplCore.h
        │   │   ├── nvtxImplCudaRt_v3.h
        │   │   ├── nvtxImplCuda_v3.h
        │   │   ├── nvtxImplOpenCL_v3.h
        │   │   ├── nvtxImplSync_v3.h
        │   │   ├── nvtxInit.h
        │   │   ├── nvtxInitDecls.h
        │   │   ├── nvtxInitDefs.h
        │   │   ├── nvtxLinkOnce.h
        │   │   └── nvtxTypes.h
        ├── nvtx_payload_schemas.h
        ├── p2p.h
        ├── param.h
        ├── plugin
        │   ├── nccl_net.h
        │   ├── nccl_profiler.h
        │   ├── nccl_tuner.h
        │   ├── net
        │   │   ├── net_v10.h
        │   │   ├── net_v6.h
        │   │   ├── net_v7.h
        │   │   ├── net_v8.h
        │   │   └── net_v9.h
        │   ├── plugin.h
        │   ├── profiler
        │   │   ├── net_ib.h
        │   │   ├── net_ib_v1.h
        │   │   ├── net_socket.h
        │   │   ├── net_socket_v1.h
        │   │   ├── profiler_v1.h
        │   │   ├── profiler_v2.h
        │   │   ├── profiler_v3.h
        │   │   └── profiler_v4.h
        │   └── tuner
        │   │   ├── tuner_v2.h
        │   │   ├── tuner_v3.h
        │   │   └── tuner_v4.h
        ├── profiler.h
        ├── proxy.h
        ├── ras.h
        ├── register.h
        ├── register_inline.h
        ├── shm.h
        ├── shmutils.h
        ├── socket.h
        ├── strongstream.h
        ├── symmetric.h
        ├── timer.h
        ├── transport.h
        ├── trees.h
        ├── tuner.h
        └── utils.h
    ├── init.cc
    ├── init_nvtx.cc
    ├── misc
        ├── argcheck.cc
        ├── cudawrap.cc
        ├── gdrwrap.cc
        ├── ibvsymbols.cc
        ├── ibvwrap.cc
        ├── ipcsocket.cc
        ├── mlx5dvsymbols.cc
        ├── mlx5dvwrap.cc
        ├── nvmlwrap.cc
        ├── param.cc
        ├── shmutils.cc
        ├── socket.cc
        ├── strongstream.cc
        └── utils.cc
    ├── mnnvl.cc
    ├── nccl.h.in
    ├── nccl.pc.in
    ├── plugin
        ├── net.cc
        ├── net
        │   ├── net_v10.cc
        │   ├── net_v6.cc
        │   ├── net_v7.cc
        │   ├── net_v8.cc
        │   └── net_v9.cc
        ├── plugin_open.cc
        ├── profiler.cc
        ├── profiler
        │   ├── profiler_v1.cc
        │   ├── profiler_v2.cc
        │   ├── profiler_v3.cc
        │   └── profiler_v4.cc
        ├── tuner.cc
        └── tuner
        │   ├── tuner_v2.cc
        │   ├── tuner_v3.cc
        │   └── tuner_v4.cc
    ├── proxy.cc
    ├── ras
        ├── client.cc
        ├── client_support.cc
        ├── collectives.cc
        ├── peers.cc
        ├── ras.cc
        ├── ras_internal.h
        └── rasnet.cc
    ├── register
        ├── coll_reg.cc
        ├── register.cc
        └── sendrecv_reg.cc
    ├── symmetric.cc
    ├── transport.cc
    └── transport
        ├── coll_net.cc
        ├── generic.cc
        ├── net.cc
        ├── net_ib.cc
        ├── net_socket.cc
        ├── nvls.cc
        ├── p2p.cc
        ├── profiler.cc
        └── shm.cc


/.gitignore:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
2 | /build
3 | *.gcov
4 | /coverage/
5 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 |  Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 | 
 4 |  Redistribution and use in source and binary forms, with or without
 5 |  modification, are permitted provided that the following conditions
 6 |  are met:
 7 |   * Redistributions of source code must retain the above copyright
 8 |     notice, this list of conditions and the following disclaimer.
 9 |   * Redistributions in binary form must reproduce the above copyright
10 |     notice, this list of conditions and the following disclaimer in the
11 |     documentation and/or other materials provided with the distribution.
12 |   * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
13 |     Laboratory, the U.S. Department of Energy, nor the names of their
14 |     contributors may be used to endorse or promote products derived
15 |     from this software without specific prior written permission.
16 | 
17 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
18 |  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 |  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 |  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 |  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 |  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25 |  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 |  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 |  The U.S. Department of Energy funded the development of this software
30 |  under subcontract 7078610 with Lawrence Berkeley National Laboratory.
31 | 
32 | 
33 | This code also includes files from the NVIDIA Tools Extension SDK project.
34 | 
35 | See:
36 | 
37 |    https://github.com/NVIDIA/NVTX
38 | 
39 | for more information and license details.
40 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | .PHONY : all clean
 7 | 
 8 | default : src.build
 9 | install : src.install
10 | BUILDDIR ?= $(abspath ./build)
11 | ABSBUILDDIR := $(abspath $(BUILDDIR))
12 | TARGETS := src pkg
13 | clean: ${TARGETS:%=%.clean}
14 | test.build: src.build
15 | LICENSE_FILES := LICENSE.txt
16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
17 | lic: $(LICENSE_TARGETS)
18 | 
19 | ${BUILDDIR}/%.txt: %.txt
20 | 	@printf "Copying    %-35s > %s\n" $< $@
21 | 	mkdir -p ${BUILDDIR}
22 | 	cp $< $@
23 | 
24 | src.%:
25 | 	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
26 | 
27 | pkg.%:
28 | 	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
29 | 
30 | pkg.debian.prep: lic
31 | pkg.txz.prep: lic
32 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NCCL
 2 | 
 3 | Optimized primitives for inter-GPU communication.
 4 | 
 5 | ## Introduction
 6 | 
 7 | NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
 8 | 
 9 | For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
10 | 
11 | ## Build
12 | 
13 | Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
14 | 
15 | To build the library :
16 | 
17 | ```shell
18 | $ cd nccl
19 | $ make -j src.build
20 | ```
21 | 
22 | If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :
23 | 
24 | ```shell
25 | $ make src.build CUDA_HOME=<path to cuda install>
26 | ```
27 | 
28 | NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
29 | 
30 | By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
31 | ```shell
32 | $ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
33 | ```
34 | 
35 | ## Install
36 | 
37 | To install NCCL on the system, create a package then install it as root.
38 | 
39 | Debian/Ubuntu :
40 | ```shell
41 | $ # Install tools to create debian packages
42 | $ sudo apt install build-essential devscripts debhelper fakeroot
43 | $ # Build NCCL deb package
44 | $ make pkg.debian.build
45 | $ ls build/pkg/deb/
46 | ```
47 | 
48 | RedHat/CentOS :
49 | ```shell
50 | $ # Install tools to create rpm packages
51 | $ sudo yum install rpm-build rpmdevtools
52 | $ # Build NCCL rpm package
53 | $ make pkg.redhat.build
54 | $ ls build/pkg/rpm/
55 | ```
56 | 
57 | OS-agnostic tarball :
58 | ```shell
59 | $ make pkg.txz.build
60 | $ ls build/pkg/txz/
61 | ```
62 | 
63 | ## Tests
64 | 
65 | Tests for NCCL are maintained separately at https://github.com/NVIDIA/nccl-tests.
66 | 
67 | ```shell
68 | $ git clone https://github.com/NVIDIA/nccl-tests.git
69 | $ cd nccl-tests
70 | $ make
71 | $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
72 | ```
73 | 
74 | ## Copyright
75 | 
76 | All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
77 | 


--------------------------------------------------------------------------------
/ext-net/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME:=../../build/
 7 | CUDA_HOME:=/usr/local/cuda
 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 9 | PLUGIN_SO:=libnccl-net.so
10 | 
11 | default: $(PLUGIN_SO)
12 | 
13 | $(PLUGIN_SO): plugin.c
14 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
15 | 
16 | clean:
17 | 	rm -f $(PLUGIN_SO)
18 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | #include <stdint.h>
11 | 
12 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; // Type of the `level` argument of ncclDebugLogger_t
13 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; // One distinct bit per subsystem, so values can be OR-ed into a mask; NCCL_ALL (~0) has every bit set
14 | // Logging callback type taking a format string plus varargs (presumably printf-style); plugin init() receives one as logFunction.
15 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
16 | // Net profiler event kinds; presumably the values passed as `type` to ncclProfilerCallback_t - TODO confirm.
17 | enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop };
18 | // NOTE: ncclResult_t is not declared in this header; err.h must be included before it (net.h does so).
19 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_ERR_H_
 6 | #define NCCL_ERR_H_
 7 | 
 8 | /* Error type for plugins: the result code returned by the plugin interface functions. ncclSuccess is 0; every other enumerator is non-zero. */
 9 | typedef enum { ncclSuccess                 =  0,
10 |                ncclUnhandledCudaError      =  1,
11 |                ncclSystemError             =  2,
12 |                ncclInternalError           =  3,
13 |                ncclInvalidArgument         =  4,
14 |                ncclInvalidUsage            =  5,
15 |                ncclRemoteError             =  6 } ncclResult_t;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NET_H_
 6 | #define NET_H_
 7 | 
 8 | #include <stdint.h>
 9 | #include <stdlib.h>
10 | 
11 | #include "err.h"
12 | #include "net_device.h"
13 | #include "common.h"
14 | 
15 | #define NCCL_NET_HANDLE_MAXSIZE 128 // Upper bound, in bytes, on the connection handle filled in by listen()
16 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) // 2^40 bytes (1 TiB); the L suffix makes the final multiplication a long
17 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
18 | 
19 | #define NCCL_PTR_HOST 0x1 // NCCL_PTR_* are single-bit flags; they are OR-ed together in the ptrSupport property
20 | #define NCCL_PTR_CUDA 0x2
21 | #define NCCL_PTR_DMABUF 0x4
22 | 
23 | // Maximum number of requests per comm object
24 | #define NCCL_NET_MAX_REQUESTS 32
25 | 
26 | #include "net_v10.h"
27 | #include "net_v9.h"
28 | #include "net_v8.h"
29 | #include "net_v7.h"
30 | #include "net_v6.h"
31 | #include "net_v5.h"
32 | #include "net_v4.h"
33 | #include "net_v3.h"
34 | #include "net_v2.h"
35 | 
36 | typedef ncclNet_v10_t ncclNet_t; // The unversioned names below alias the v10 definitions from net_v10.h
37 | typedef ncclNetProperties_v10_t ncclNetProperties_t;
38 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
39 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
40 | 
41 | #endif // end include guard
42 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_DEVICE_H_
 8 | #define NET_DEVICE_H_
 9 | 
10 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
11 | #define NCCL_NET_MTU_SIZE                    4096
12 | 
13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
16 | 
17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
18 | 
19 | typedef struct {
20 |   ncclNetDeviceType netDeviceType; // Network offload type
21 |   int netDeviceVersion;            // Version number for network offload
22 |   void* handle;                    // Untyped pointer; its meaning presumably depends on netDeviceType - TODO confirm
23 |   size_t size;                     // Presumably the byte size of what handle refers to - TODO confirm. This header includes nothing itself, so size_t must come from the includer (net.h includes <stdlib.h> first)
24 |   int needsProxyProgress;          // Presumably a boolean flag - TODO confirm
25 | } ncclNetDeviceHandle_v7_t;
26 | 
27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
30 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v2.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NET_V2_H_
 6 | #define NET_V2_H_
 7 | 
 8 | typedef struct {
 9 |   // Name of the network (mainly for logs)
10 |   const char* name;
11 |   // Initialize the network.
12 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
13 |   // Return the number of adapters.
14 |   ncclResult_t (*devices)(int* ndev);
15 |   // Return the device path in /sys. NCCL will call free on this path.
16 |   ncclResult_t (*pciPath)(int dev, char** path);
17 |   // Return whether this device supports host pointers and/or CUDA pointers
18 |   // as data from the current GPU. Supported types should be composed with
19 |   // NCCL_PTR_HOST and NCCL_PTR_CUDA.
20 |   ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
21 |   // Create a receiving object and provide a handle to connect to it. The
22 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
23 |   // between ranks to create a connection.
24 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
25 |   // Connect to a handle and return a sending comm object for that peer.
26 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
27 |   // Finalize connection establishment after remote peer has called connect
28 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
29 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
30 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
31 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
32 |   // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
33 |   // May return request == NULL if the call cannot be performed (or would block)
34 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
35 |   // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
36 |   // May return request == NULL if the call cannot be performed (or would block)
37 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
38 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
39 |   // visible to the GPU
40 |   ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
41 |   // Test whether a request is complete. If size is not NULL, it returns the
42 |   // number of bytes sent/received.
43 |   ncclResult_t (*test)(void* request, int* done, int* size);
44 |   // Close and free send/recv comm objects
45 |   ncclResult_t (*closeSend)(void* sendComm);
46 |   ncclResult_t (*closeRecv)(void* recvComm);
47 |   ncclResult_t (*closeListen)(void* listenComm);
48 | } ncclNet_v2_t;
49 | 
50 | #endif // end include guard
51 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NET_V3_H_
 6 | #define NET_V3_H_
 7 | 
 8 | #define NCCL_NET_MAX_REQUESTS_V3 16
 9 | 
10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; // v3 reuses the v4 properties layout, so net_v4.h must be included before this header (net.h does so)
11 | typedef struct {
12 |   // Name of the network (mainly for logs)
13 |   const char* name;
14 |   // Initialize the network.
15 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
16 |   // Return the number of adapters.
17 |   ncclResult_t (*devices)(int* ndev);
18 |   // Get various device properties.
19 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
20 |   // Create a receiving object and provide a handle to connect to it. The
21 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
22 |   // between ranks to create a connection.
23 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
24 |   // Connect to a handle and return a sending comm object for that peer.
25 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
26 |   // Finalize connection establishment after remote peer has called connect
27 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
28 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
29 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
30 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
31 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
32 |   // Asynchronous send to a peer.
33 |   // May return request == NULL if the call cannot be performed (or would block)
34 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
35 |   // Asynchronous recv from a peer.
36 |   // May return request == NULL if the call cannot be performed (or would block)
37 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
38 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
39 |   // visible to the GPU
40 |   ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
41 |   // Test whether a request is complete. If size is not NULL, it returns the
42 |   // number of bytes sent/received.
43 |   ncclResult_t (*test)(void* request, int* done, int* size);
44 |   // Close and free send/recv comm objects
45 |   ncclResult_t (*closeSend)(void* sendComm);
46 |   ncclResult_t (*closeRecv)(void* recvComm);
47 |   ncclResult_t (*closeListen)(void* listenComm);
48 | } ncclNet_v3_t;
49 | 
50 | #endif // end include guard
51 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v4.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NET_V4_H_
 6 | #define NET_V4_H_
 7 | 
 8 | #define NCCL_NET_HANDLE_MAXSIZE_V4 64
 9 | 
10 | typedef struct {
11 |   char* name;     // Used mostly for logging.
12 |   char* pciPath;  // Path to the PCI device in /sys.
13 |   uint64_t guid;  // Unique identifier for the NIC chip. Important for
14 |                   // cards with multiple PCI functions (Physical or virtual).
15 |   int ptrSupport; // Bitmask: NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
16 |   int speed;      // Port speed in Mbps.
17 |   int port;       // Port number.
18 |   int maxComms;   // Maximum number of comms we can create
19 | } ncclNetProperties_v4_t; // Also aliased as ncclNetProperties_v3_t in net_v3.h
20 | 
21 | // v4 struct for backwards compatibility
22 | typedef struct {
23 |   // Name of the network (mainly for logs)
24 |   const char* name;
25 |   // Initialize the network.
26 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
27 |   // Return the number of adapters.
28 |   ncclResult_t (*devices)(int* ndev);
29 |   // Get various device properties.
30 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
31 |   // Create a receiving object and provide a handle to connect to it. The
32 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
33 |   // between ranks to create a connection.
34 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
35 |   // Connect to a handle and return a sending comm object for that peer.
36 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
37 |   // Finalize connection establishment after remote peer has called connect
38 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
39 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
40 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
41 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
42 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
43 |   // Asynchronous send to a peer.
44 |   // May return request == NULL if the call cannot be performed (or would block)
45 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
46 |   // Asynchronous recv from a peer.
47 |   // May return request == NULL if the call cannot be performed (or would block)
48 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
49 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
50 |   // visible to the GPU. Takes a request out-parameter, like isend/irecv.
51 |   ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
52 |   // Test whether a request is complete. If size is not NULL, it returns the
53 |   // number of bytes sent/received.
54 |   ncclResult_t (*test)(void* request, int* done, int* size);
55 |   // Close and free send/recv comm objects
56 |   ncclResult_t (*closeSend)(void* sendComm);
57 |   ncclResult_t (*closeRecv)(void* recvComm);
58 |   ncclResult_t (*closeListen)(void* listenComm);
59 | } ncclNet_v4_t;
60 | 
61 | #endif // end include guard
62 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v5.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NET_V5_H_
 6 | #define NET_V5_H_
 7 | 
 8 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; // v5 reuses the v6 properties layout, so net_v6.h must be included before this header (net.h does so)
 9 | typedef struct {
10 |   // Name of the network (mainly for logs)
11 |   const char* name;
12 |   // Initialize the network.
13 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
14 |   // Return the number of adapters.
15 |   ncclResult_t (*devices)(int* ndev);
16 |   // Get various device properties.
17 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
18 |   // Create a receiving object and provide a handle to connect to it. The
19 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
20 |   // between ranks to create a connection.
21 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
22 |   // Connect to a handle and return a sending comm object for that peer.
23 |   // This call must not block for the connection to be established, and instead
24 |   // should return successfully with sendComm == NULL with the expectation that
25 |   // it will be called again until sendComm != NULL.
26 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
27 |   // Finalize connection establishment after remote peer has called connect.
28 |   // This call must not block for the connection to be established, and instead
29 |   // should return successfully with recvComm == NULL with the expectation that
30 |   // it will be called again until recvComm != NULL.
31 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
32 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
33 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
34 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
35 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
36 |   // Asynchronous send to a peer.
37 |   // May return request == NULL if the call cannot be performed (or would block)
38 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
39 |   // Asynchronous recv from a peer.
40 |   // May return request == NULL if the call cannot be performed (or would block)
41 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
42 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
43 |   // visible to the GPU
44 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
45 |   // Test whether a request is complete. If sizes is not NULL, it returns the
46 |   // number of bytes sent/received.
47 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
48 |   // Close and free send/recv comm objects
49 |   ncclResult_t (*closeSend)(void* sendComm);
50 |   ncclResult_t (*closeRecv)(void* recvComm);
51 |   ncclResult_t (*closeListen)(void* listenComm);
52 | } ncclNet_v5_t;
53 | 
54 | #endif // end include guard
55 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v6.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NET_V6_H_
 6 | #define NET_V6_H_
 7 | 
 8 | typedef struct { // Device properties filled in by the v6 getProperties callback.
 9 |   char* name;     // Used mostly for logging.
10 |   char* pciPath;  // Path to the PCI device in /sys.
11 |   uint64_t guid;  // Unique identifier for the NIC chip. Important for
12 |                   // cards with multiple PCI functions (Physical or virtual).
13 |   int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
14 |   int speed;      // Port speed in Mbps.
15 |   int port;       // Port number.
16 |   float latency;  // Network latency. NOTE(review): unit is not stated in this header.
17 |   int maxComms;   // Maximum number of comms we can create
18 |   int maxRecvs;   // Maximum number of grouped receives.
19 | }ncclNetProperties_v6_t;
20 | 
21 | typedef struct { // Table of function pointers making up the v6 network API.
22 |   // Name of the network (mainly for logs)
23 |   const char* name;
24 |   // Initialize the network.
25 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
26 |   // Return the number of adapters.
27 |   ncclResult_t (*devices)(int* ndev);
28 |   // Get various device properties.
29 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
30 |   // Create a receiving object and provide a handle to connect to it. The
31 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
32 |   // between ranks to create a connection.
33 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
34 |   // Connect to a handle and return a sending comm object for that peer.
35 |   // This call must not block for the connection to be established, and instead
36 |   // should return successfully with sendComm == NULL with the expectation that
37 |   // it will be called again until sendComm != NULL.
38 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
39 |   // Finalize connection establishment after remote peer has called connect.
40 |   // This call must not block for the connection to be established, and instead
41 |   // should return successfully with recvComm == NULL with the expectation that
42 |   // it will be called again until recvComm != NULL.
43 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
44 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
45 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
46 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
47 |   /* DMA-BUF support: like regMr, but takes a size_t size plus an offset and an fd. */
48 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
49 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
50 |   // Asynchronous send to a peer.
51 |   // May return request == NULL if the call cannot be performed (or would block)
52 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
53 |   // Asynchronous recv from a peer.
54 |   // May return request == NULL if the call cannot be performed (or would block)
55 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
56 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
57 |   // visible to the GPU
58 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
59 |   // Test whether a request is complete. If sizes is not NULL, it returns the
60 |   // number of bytes sent/received.
61 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
62 |   // Close and free send/recv/listen comm objects
63 |   ncclResult_t (*closeSend)(void* sendComm);
64 |   ncclResult_t (*closeRecv)(void* recvComm);
65 |   ncclResult_t (*closeListen)(void* listenComm);
66 | } ncclNet_v6_t;
67 | 
68 | #endif // end include guard
69 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_TYPES_H_
 6 | #define NCCL_TYPES_H_
 7 | 
 8 | /* Data types. Names listed on the same line are aliases sharing one value (e.g. ncclInt8 == ncclChar == 0). */
 9 | typedef enum { ncclInt8       = 0, ncclChar       = 0,
10 |                ncclUint8      = 1,
11 |                ncclInt32      = 2, ncclInt        = 2,
12 |                ncclUint32     = 3,
13 |                ncclInt64      = 4,
14 |                ncclUint64     = 5,
15 |                ncclFloat16    = 6, ncclHalf       = 6,
16 |                ncclFloat32    = 7, ncclFloat      = 7,
17 |                ncclFloat64    = 8, ncclDouble     = 8,
18 |                ncclBfloat16   = 9,
19 | } ncclDataType_t;
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/ext-net/google-fastsocket/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME?=/usr/local/cuda
 2 | INC:=-I$(CUDA_HOME)/include
 3 | PLUGIN_SO:=libnccl-net.so
 4 | 
 5 | default: $(PLUGIN_SO)
 6 | 
 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc # sources are C++ (.cc), so compile and link with the C++ driver
 8 | 	$(CXX) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
 9 | 
10 | nccl-fastsocket/*.cc: # sources are fetched on demand by cloning the upstream repository
11 | 	git clone https://github.com/google/nccl-fastsocket.git
12 | 
13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) # BUILDDIR is not set in this Makefile; the caller must provide it
14 | 
15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
16 | 	@printf "Grabbing %-35s > %s\n" $< $@
17 | 	mkdir -p $(BUILDDIR)/lib
18 | 	install -m 644 $< $@
19 | 
20 | clean:
21 | 	rm -f $(PLUGIN_SO)
22 | 	rm -Rf nccl-fastsocket
23 | 


--------------------------------------------------------------------------------
/ext-profiler/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME := ../../build
 7 | INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 8 | PLUGIN_SO := libnccl-profiler.so
 9 | 
10 | default: $(PLUGIN_SO) # NOTE(review): CUDA_HOME is not defined in this Makefile; it must come from the environment or command line
11 | 
12 | $(PLUGIN_SO): plugin.c event.c print_event.c # NOTE(review): the .c sources are passed to $(CXX), not $(CC) - confirm this is intended
13 | 	$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
14 | 
15 | clean: # removes only the built shared object
16 | 	rm -f $(PLUGIN_SO)
17 | 


--------------------------------------------------------------------------------
/ext-profiler/example/event.c:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <stdio.h>
 8 | #include "event.h"
 9 | 
10 | int taskEventQueueEmpty(struct group* g) {  // 1 when no event is queued, 0 otherwise
11 |   return !g->eventHead;
12 | }
13 | 
14 | void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {  // append event at the tail
15 |   event->next = NULL;                              // the new element is always last
16 |   if (g->eventHead == NULL) g->eventHead = event;  // empty queue: event is also the head
17 |   else g->eventTail->next = event;                 // otherwise link it after the current tail
18 |   g->eventTail = event;
19 | }
20 | 
21 | struct taskEventBase* taskEventQueueHead(struct group* g) {  // peek at the first event; the queue is not modified
22 |   struct taskEventBase* first = g->eventHead; return first;
23 | }
24 | 
25 | struct taskEventBase* taskEventQueueDequeue(struct group* g) {  // unlink and return the head event; NULL if the queue is empty
26 |   struct taskEventBase* tmp = g->eventHead;
27 |   if (tmp == NULL) return NULL;  // empty queue: nothing to unlink (previously a NULL dereference)
28 |   g->eventHead = tmp->next;
29 |   if (g->eventHead == NULL) g->eventTail = NULL;  // last element removed: clear the stale tail
30 |   return tmp;
31 | }


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; // level argument of ncclDebugLogger_t
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; // one bit per subsystem; NCCL_ALL (~0) sets every bit
12 | 
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); // variadic (fmt, ...) log callback; NOTE(review): flags presumably carries ncclDebugLogSubSys bits - confirm
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ERR_H_
 8 | #define NCCL_ERR_H_
 9 | 
10 | /* Error type for plugins: the return type of the plugin entry points. ncclSuccess is 0; every other value is non-zero. */
11 | typedef enum { ncclSuccess                 =  0,
12 |                ncclUnhandledCudaError      =  1,
13 |                ncclSystemError             =  2,
14 |                ncclInternalError           =  3,
15 |                ncclInvalidArgument         =  4,
16 |                ncclInvalidUsage            =  5,
17 |                ncclRemoteError             =  6 } ncclResult_t;
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/net_ib_v1.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_IB_V1_H_
 8 | #define NET_IB_V1_H_
 9 | 
10 | #define NCCL_PROFILER_NET_IB_VER 1  // version number matching the _v1 descriptor declared in this header
11 | 
12 | enum {
13 |   ncclProfileQp = (1 << 0),  // NOTE(review): presumably the `type` value selecting the `qp` member of ncclProfilerNetIbDescr_v1_t - confirm
14 | };
15 | 
16 | // The data structure version is encoded in the plugin identifier bitmask and
17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin
18 | // identifier into the event descriptor before calling the profiler startEvent
19 | // function. The profiler should inspect the plugin id to find out the source
20 | // plugin as well as the version of the event struct.
21 | typedef struct {
22 |   uint8_t type;        // event type (plugin defined)
23 |   union {
24 |     struct {
25 |       int device;      // network device id
26 |       uint64_t wr_id;  // work request id
27 |       int opcode;      // ibv opcode
28 |       int qpNum;       // QP number
29 |       size_t length;   // work request data length
30 |     } qp;
31 |   };
32 | } ncclProfilerNetIbDescr_v1_t;
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/net_socket_v1.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_SOCKET_V1_H_
 8 | #define NET_SOCKET_V1_H_
 9 | 
10 | #define NCCL_PROFILER_NET_SOCKET_VER 1  // version number matching the _v1 descriptor declared in this header
11 | 
12 | enum {
13 |   ncclProfileSocket = (1 << 0),  // NOTE(review): presumably the `type` value selecting the `sock` member of ncclProfilerNetSockDescr_v1_t - confirm
14 | };
15 | 
16 | // The data structure version is encoded in the plugin identifier bitmask and
17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin
18 | // identifier into the event descriptor before calling the profiler startEvent
19 | // function. The profiler should inspect the plugin id to find out the source
20 | // plugin as well as the version of the event struct.
21 | typedef struct {
22 |   uint8_t type;        // event type (plugin defined)
23 |   union {
24 |     struct {
25 |       int fd;          // NOTE(review): presumably the socket file descriptor - confirm
26 |       int op;          // NOTE(review): operation code; the set of values is not visible here
27 |       size_t length;   // NOTE(review): presumably the data length of the operation - confirm
28 |     } sock;
29 |   };
30 | } ncclProfilerNetSockDescr_v1_t;
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PROFILER_H_
 8 | #define PROFILER_H_
 9 | 
10 | #include <stdint.h>
11 | #include <stdlib.h>
12 | 
13 | #include "common.h"
14 | #include "err.h"
15 | 
16 | enum {  // event type flags, one bit each
17 |   ncclProfileGroup     = (1 << 0),  // group event type
18 |   ncclProfileColl      = (1 << 1),  // host collective call event type
19 |   ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
20 |   ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
21 |   ncclProfileProxyStep = (1 << 4),  // proxy step event type
22 |   ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
23 |   ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
24 |   ncclProfileNetPlugin = (1 << 7),  // network plugin-defined events
25 | };
26 | 
27 | typedef enum {  // event states; note the values do not follow declaration order (19 and 20 sit among lower values)
28 |   ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
29 |   ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
30 |   ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
31 |   ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
32 |   ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
33 |   ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
34 |   ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
35 |   ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
36 |   ncclProfilerProxyOpInProgress_v4     = 19, // _v4-suffixed state; value is out of declaration order
37 | 
38 |   /* Legacy proxy profiler states */
39 |   ncclProfilerProxyStepSendGPUWait     = 8,
40 |   ncclProfilerProxyStepSendPeerWait_v4 = 20, // _v4-suffixed state; value is out of declaration order
41 |   ncclProfilerProxyStepSendWait        = 9,
42 |   ncclProfilerProxyStepRecvWait        = 10,
43 |   ncclProfilerProxyStepRecvFlushWait   = 11,
44 |   ncclProfilerProxyStepRecvGPUWait     = 12,
45 | 
46 |   /* Legacy proxy control states */
47 |   ncclProfilerProxyCtrlIdle            = 13,
48 |   ncclProfilerProxyCtrlActive          = 14,
49 |   ncclProfilerProxyCtrlSleep           = 15,
50 |   ncclProfilerProxyCtrlWakeup          = 16,
51 |   ncclProfilerProxyCtrlAppend          = 17,
52 |   ncclProfilerProxyCtrlAppendEnd       = 18,
53 | 
54 |   /* Network defined events states */
55 |   ncclProfilerNetPluginUpdate          = 21,
56 | 
57 |   /* Kernel event states */
58 |   ncclProfilerKernelChStop             = 22,
59 | } ncclProfilerEventState_t;
60 | 
61 | typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;  // all four versioned names alias the same enum
62 | typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
63 | typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
64 | typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
65 | 
66 | #include "profiler_v4.h"
67 | #include "profiler_v3.h"
68 | #include "profiler_v2.h"
69 | #include "profiler_v1.h"
70 | #include "profiler_net.h"
71 | 
72 | typedef ncclProfiler_v4_t ncclProfiler_t;  // unversioned names below all map to the v4 types
73 | typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
74 | typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
75 | 
76 | #endif // end include guard
77 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler_net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PROFILER_NET_H_
 8 | #define PROFILER_NET_H_
 9 | 
10 | #define NCCL_PROFILER_NET_VER_BITS  (16)  // shift amount separating the version bits from the type bits
11 | #define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)  // low bits: version (0x0000FFFF when unsigned int is 32 bits wide)
12 | #define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)  // remaining high bits: plugin type
13 | 
14 | typedef enum {
15 |   NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),  // both values lie inside NCCL_PROFILER_NET_TYPE_MASK
16 |   NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
17 | } ncclProfilerNetType;
18 | 
19 | #include "net_ib_v1.h"
20 | #include "net_socket_v1.h"
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler_v1.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #ifndef PROFILER_V1_H_
  8 | #define PROFILER_V1_H_
  9 | 
 10 | #include <stdint.h>
 11 | 
 12 | typedef struct {                // v1 event descriptor, passed to the v1 startEvent callback
 13 |   uint8_t type;                 // event type descriptor: ncclProfileColl, ...
 14 |   void* parentObj;              // pointer to the profiler parent object (for coll is the group)
 15 |   int rank;                     // originating rank
 16 |   union {                       // NOTE(review): presumably `type` selects which member is valid - confirm
 17 |     struct {
 18 |       const char* name;
 19 |       uint64_t commHash;
 20 |       uint64_t seqNumber;
 21 |       uint8_t func;
 22 |       void const* sendBuff;
 23 |       void* recvBuff;
 24 |       size_t count;
 25 |       int root;
 26 |       uint8_t datatype;
 27 |       uint32_t op;
 28 |       size_t trafficBytes;
 29 |       uint8_t nMaxChannels;
 30 |       uint8_t nWarps;
 31 |       uint8_t algo;
 32 |       uint8_t proto;
 33 |       int isCollnet;
 34 |       int isNvls;
 35 |     } coll;
 36 | 
 37 |     struct {
 38 |       const char* name;
 39 |       uint64_t commHash;
 40 |       uint8_t func;
 41 |       void* buff;
 42 |       uint8_t datatype;
 43 |       size_t count;
 44 |       int peer;
 45 |     } p2p;
 46 | 
 47 |     struct {
 48 |       pid_t pid;                // pid of the originating process (NOTE(review): pid_t/size_t are not declared by <stdint.h>, the only include here)
 49 |       uint8_t channelId;        // channel id for this proxy operation
 50 |       int peer;                 // remote rank for send/recv
 51 |       int nSteps;               // number of steps for this proxy operation
 52 |       int chunkSize;            // amount of data transferred by this proxy operation
 53 |       int isSend;
 54 |     } proxyOp;
 55 | 
 56 |     struct {
 57 |       int step;
 58 |     } proxyStep;
 59 |   };
 60 | } ncclProfilerEventDescr_v1_t;
 61 | 
 62 | typedef union {  // optional argument of the v1 recordEventState callback
 63 |   struct {
 64 |     size_t transSize;
 65 |     int steps;
 66 |   } proxyOp;
 67 | 
 68 |   struct {
 69 |     int appendedProxyOps;
 70 |   } proxyCtrl;
 71 | } ncclProfilerEventStateArgs_v1_t;
 72 | 
 73 | typedef struct {  // v1 profiler plugin interface: a name plus a table of callbacks
 74 |   const char* name;
 75 | 
 76 |   // init - initialize the profiler plugin
 77 |   // Input
 78 |   //  - context        : opaque profiler context object for separating profiler behavior across comms
 79 |   // Output
 80 |   //  - eActivationMask: bitmask of active events set by the plugin
 81 |   ncclResult_t (*init)(void** context, int* eActivationMask);
 82 | 
 83 |   // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
 84 |   // Input
 85 |   //  - context: opaque profiler context object
 86 |   //  - eDescr : pointer to ncclProfilerEventDescr_v1_t object
 87 |   // Output
 88 |   //  - eHandle: return event handle for supplied event descriptor object
 89 |   ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
 90 | 
 91 |   // stopEvent - stop/finalize an event inside an event set
 92 |   // Input
 93 |   //  - eHandle: handle to event object
 94 |   ncclResult_t (*stopEvent)(void* eHandle);
 95 | 
 96 |   // recordEventState - record event state transitions and event attribute updates
 97 |   // Input
 98 |   //  - eHandle   : handle to event object created through startEvent
 99 |   //  - eState    : event state transition
100 |   //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
101 |   ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
102 | 
103 |   // finalize - finalize the profiler plugin
104 |   // Input
105 |   //  - context: opaque profiler context object
106 |   ncclResult_t (*finalize)(void* context);
107 | } ncclProfiler_v1_t;
108 | 
109 | #endif
110 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler_v2.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #ifndef PROFILER_V2_H_
  8 | #define PROFILER_V2_H_
  9 | 
 10 | #include <stdint.h>
 11 | 
 12 | typedef struct {                // v2 event descriptor, passed to the v2 startEvent callback
 13 |   uint8_t type;                 // event type descriptor: ncclProfileColl, ...
 14 |   void* parentObj;              // pointer to the profiler parent object (for coll is the group)
 15 |   int rank;                     // originating rank
 16 |   union {                       // NOTE(review): presumably `type` selects which member is valid - confirm
 17 |     struct {
 18 |       const char* name;
 19 |       uint64_t commHash;
 20 |       uint64_t seqNumber;
 21 |       const char* func;         // a string in v2 (v1 declared this field as uint8_t)
 22 |       void const* sendBuff;
 23 |       void* recvBuff;
 24 |       size_t count;
 25 |       int root;
 26 |       const char* datatype;     // a string in v2 (v1 declared this field as uint8_t)
 27 |       size_t trafficBytes;
 28 |       uint8_t nMaxChannels;
 29 |       uint8_t nWarps;
 30 |       const char* algo;         // a string in v2 (v1 declared this field as uint8_t)
 31 |       const char* proto;        // a string in v2 (v1 declared this field as uint8_t)
 32 |     } coll;
 33 | 
 34 |     struct {
 35 |       const char* name;
 36 |       uint64_t commHash;
 37 |       const char* func;
 38 |       void* buff;
 39 |       const char* datatype;
 40 |       size_t count;
 41 |       int peer;
 42 |     } p2p;
 43 | 
 44 |     struct {
 45 |       pid_t pid;                // pid of the originating process (NOTE(review): pid_t/size_t are not declared by <stdint.h>, the only include here)
 46 |       uint8_t channelId;        // channel id for this proxy operation
 47 |       int peer;                 // remote rank for send/recv
 48 |       int nSteps;               // number of steps for this proxy operation
 49 |       int chunkSize;            // amount of data transferred by this proxy operation
 50 |       int isSend;
 51 |     } proxyOp;
 52 | 
 53 |     struct {
 54 |       int step;
 55 |     } proxyStep;
 56 |   };
 57 | } ncclProfilerEventDescr_v2_t;
 58 | 
 59 | typedef union {  // optional argument of the v2 recordEventState callback
 60 |   struct {
 61 |     size_t transSize;
 62 |     int steps;
 63 |   } proxyOp;
 64 | 
 65 |   struct {
 66 |     int appendedProxyOps;
 67 |   } proxyCtrl;
 68 | } ncclProfilerEventStateArgs_v2_t;
 69 | 
 70 | typedef struct {  // v2 profiler plugin interface: a name plus a table of callbacks
 71 |   const char* name;
 72 | 
 73 |   // init - initialize the profiler plugin
 74 |   // Input
 75 |   //  - context        : opaque profiler context object for separating profiler behavior across comms
 76 |   // Output
 77 |   //  - eActivationMask: bitmask of active events set by the plugin
 78 |   ncclResult_t (*init)(void** context, int* eActivationMask);
 79 | 
 80 |   // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
 81 |   // Input
 82 |   //  - context: opaque profiler context object
 83 |   //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
 84 |   // Output
 85 |   //  - eHandle: return event handle for supplied event descriptor object
 86 |   ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
 87 | 
 88 |   // stopEvent - stop/finalize an event inside an event set
 89 |   // Input
 90 |   //  - eHandle: handle to event object
 91 |   ncclResult_t (*stopEvent)(void* eHandle);
 92 | 
 93 |   // recordEventState - record event state transitions and event attribute updates
 94 |   // Input
 95 |   //  - eHandle   : handle to event object created through startEvent
 96 |   //  - eState    : event state transition
 97 |   //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
 98 |   ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
 99 | 
100 |   // finalize - finalize the profiler plugin
101 |   // Input
102 |   //  - context: opaque profiler context object
103 |   ncclResult_t (*finalize)(void* context);
104 | } ncclProfiler_v2_t;
105 | 
106 | #endif
107 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_TYPES_H_
 6 | #define NCCL_TYPES_H_
 7 | 
 8 | /* Data types. Names listed on the same line are aliases sharing one value (e.g. ncclInt8 == ncclChar == 0). */
 9 | typedef enum { ncclInt8       = 0, ncclChar       = 0,
10 |                ncclUint8      = 1,
11 |                ncclInt32      = 2, ncclInt        = 2,
12 |                ncclUint32     = 3,
13 |                ncclInt64      = 4,
14 |                ncclUint64     = 5,
15 |                ncclFloat16    = 6, ncclHalf       = 6,
16 |                ncclFloat32    = 7, ncclFloat      = 7,
17 |                ncclFloat64    = 8, ncclDouble     = 8,
18 |                ncclBfloat16   = 9,
19 | } ncclDataType_t;
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/ext-profiler/example/plugin.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PLUGIN_H_
 8 | #define PLUGIN_H_
 9 | 
10 | int exampleProfilerStart(int eActivationMask);  // NOTE(review): defined outside this header; meaning of the int return value is not visible here
11 | int exampleProfilerStop(void);                  // NOTE(review): presumably the counterpart of exampleProfilerStart - confirm
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/ext-profiler/example/print_event.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PRINT_EVENT_H_
 8 | #define PRINT_EVENT_H_
 9 | #include <stdio.h>  // FILE, used by printEvent() below; keeps this header self-contained
10 | #include "nccl/common.h"
11 | extern ncclDebugLogger_t logFn;  // logger callback; declared here, defined in another translation unit
12 | 
13 | void debugEvent(void* eHandle, const char* tag);
14 | void printEvent(FILE* fh, void* handle);
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/ext-tuner/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME:=../../build/
 7 | CUDA_HOME?=/usr/local/cuda
 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 9 | PLUGIN_SO:=libnccl-tuner.so
10 | 
11 | default: $(PLUGIN_SO) # CUDA_HOME above is only a default (?=), so a value from the environment is honoured
12 | 
13 | $(PLUGIN_SO): plugin.c
14 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
15 | 
16 | clean: # removes only the built shared object
17 | 	rm -f $(PLUGIN_SO)
18 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; // level argument of ncclDebugLogger_t
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; // one bit per subsystem; NCCL_ALL (~0) sets every bit
12 | 
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); // variadic (fmt, ...) log callback; NOTE(review): flags presumably carries ncclDebugLogSubSys bits - confirm
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_ERR_H_
 6 | #define NCCL_ERR_H_
 7 | 
 8 | /* Error type for plugins: the return type of the plugin entry points. ncclSuccess is 0; every other value is non-zero. */
 9 | typedef enum { ncclSuccess                 =  0,
10 |                ncclUnhandledCudaError      =  1,
11 |                ncclSystemError             =  2,
12 |                ncclInternalError           =  3,
13 |                ncclInvalidArgument         =  4,
14 |                ncclInvalidUsage            =  5,
15 |                ncclRemoteError             =  6 } ncclResult_t;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/tuner.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_TUNER_H_
 9 | #define NCCL_TUNER_H_
10 | 
11 | #include <stdint.h>
12 | #include <stdlib.h>
13 | 
14 | #include "common.h"
15 | #include "err.h"
16 | 
17 | #define NCCL_NUM_FUNCTIONS 5 // Covers ncclFuncBroadcast..ncclFuncAllReduce (values 0-4); Send/Recv not included for now
18 | typedef enum {
19 |   ncclFuncBroadcast = 0,
20 |   ncclFuncReduce = 1,
21 |   ncclFuncAllGather = 2,
22 |   ncclFuncReduceScatter = 3,
23 |   ncclFuncAllReduce = 4,
24 |   ncclFuncSendRecv = 5,
25 |   ncclFuncSend = 6,
26 |   ncclFuncRecv = 7,
27 |   ncclNumFuncs = 8 // count of the enumerators above (0-7); differs from NCCL_NUM_FUNCTIONS
28 | } ncclFunc_t;
29 | 
30 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/NVLS*/PAT
31 | #define NCCL_ALGO_UNDEF -1
32 | #define NCCL_ALGO_TREE 0
33 | #define NCCL_ALGO_RING 1
34 | #define NCCL_ALGO_COLLNET_DIRECT 2
35 | #define NCCL_ALGO_COLLNET_CHAIN 3
36 | #define NCCL_ALGO_NVLS 4
37 | #define NCCL_ALGO_NVLS_TREE 5
38 | #define NCCL_ALGO_PAT 6
39 | 
40 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
41 | #define NCCL_PROTO_UNDEF -1
42 | #define NCCL_PROTO_LL 0
43 | #define NCCL_PROTO_LL128 1
44 | #define NCCL_PROTO_SIMPLE 2
45 | 
46 | #define NCCL_ALGO_PROTO_IGNORE -1.0
47 | 
48 | // API to be implemented by external tuner
49 | typedef struct {
50 |   // Name of the tuner
51 |   const char* name;
52 | 
53 |   // Initializes tuner states.
54 |   // Inputs:
55 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
56 |   //   - nNodes: number of nodes in current communicator.
57 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
58 |   // Outputs:
59 |   //   - context: tuner context object
60 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
61 | 
62 |   // Gets info (algo, protocol, number of ctas and threads) for a given collective.
63 |   // Inputs:
64 |   //   - context: tuner context object
65 |   //   - collType: collective type, e.g., allreduce, allgather, ...
66 |   //   - nBytes: collective size in bytes
67 |   //   - numPipeOps: number of operations in the group
68 |   //   - numAlgo: number of algorithms in collCostTable
69 |   //   - numProto: number of protocols in collCostTable
70 |   //   - regBuff: whether the user buffer can be registered
71 |   //
72 |   // Outputs:
73 |   //   - nChannels: number of channels (hence SMs) to be used.
74 |   //
75 |   // InOut:
76 |   //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
77 |   //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
78 |   //
79 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
80 |   // default tuning for the given collective.
81 |   // Also, the plugin is allowed to not set any output, or set only the
82 |   // algorithm and protocol, but not only the algorithm or only the protocol.
83 |   // Unset fields will be set automatically by NCCL.
84 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
85 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
86 |                               int regBuff, int* nChannels);
87 | 
88 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
89 |   // context: tuner context object
90 |   ncclResult_t (*destroy)(void* context);
91 | } ncclTuner_v4_t;
92 | 
93 | typedef ncclTuner_v4_t ncclTuner_t;
94 | 
95 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
96 | 
97 | #endif
98 | 


--------------------------------------------------------------------------------
/ext-tuner/example/plugin.c:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "tuner.h"
 8 | 
 9 | #define __hidden __attribute__ ((visibility("hidden")))
10 | 
11 | __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
12 | 
13 | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
14 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
15 |                               int regBuff, int* nChannels) {
16 |   // Favor Ring/Simple by zeroing its cost in the NCCL-generated table; NCCL evaluates the updated table to pick the best algo/proto combo.
17 |   float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; // view as rows of NCCL_NUM_PROTOCOLS costs
18 |   if (NCCL_ALGO_RING < numAlgo && numProto == NCCL_NUM_PROTOCOLS && table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
19 |     table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; // only reached when the entry is inside the table, the row width matches the cast, and NCCL has not marked it ignored
20 |   }
21 |   *nChannels = 1; // always request a single channel
22 |   return ncclSuccess;
23 | }
24 | 
25 | __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
26 | 
27 | #define PLUGIN_NAME "Example"
28 | 
29 | const ncclTuner_v4_t ncclTunerPlugin_v4 = {
30 |   .name = PLUGIN_NAME,
31 |   .init = pluginInit,
32 |   .getCollInfo = pluginGetCollInfo,
33 |   .destroy = pluginDestroy
34 | };
35 | 


--------------------------------------------------------------------------------
/makefiles/formatting.mk:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
 8 | # As this file defines a new target (format), it should be included at least after the definition of the
 9 | # default target.
10 | 
11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
12 | ASTYLEDIR := $(BUILDDIR)/contrib
13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
16 | ASTYLEVER := 3.1
17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
18 | 
19 | $(ASTYLEDIR) :
20 | 	@mkdir -p $(ASTYLEDIR)
21 | 
22 | $(ASTYLETAR) : $(ASTYLEDIR)
23 | 	@wget -q -O $(ASTYLETAR) $(ASTYLEURL)
24 | 
25 | $(ASTYLEBLD) : $(ASTYLETAR)
26 | 	@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
27 | 
28 | $(ASTYLEBIN) : $(ASTYLEBLD)
29 | 	${MAKE} -C $(ASTYLEBLD)
30 | 
31 | .PHONY : format
32 | format : $(ASTYLEBIN)
33 | 	@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
34 | 


--------------------------------------------------------------------------------
/makefiles/version.mk:
--------------------------------------------------------------------------------
1 | ##### version
2 | NCCL_MAJOR   := 2
3 | NCCL_MINOR   := 27
4 | NCCL_PATCH   := 3
5 | NCCL_SUFFIX  :=
6 | PKG_REVISION := 1
7 | 


--------------------------------------------------------------------------------
/pkg/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | .PHONY : all clean
 7 | 
 8 | default : build
 9 | build : debian.build txz.build
10 | 
11 | BUILDDIR ?= $(abspath ../build)
12 | ABSBUILDDIR := $(abspath $(BUILDDIR))
13 | TARGETS := debian txz
14 | all:   ${TARGETS:%=%.build}
15 | prep:  ${TARGETS:%=%.prep}
16 | build: ${TARGETS:%=%.build}
17 | clean: ${TARGETS:%=%.clean}
18 | 
19 | %.prep:
20 | 	${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
21 | 
22 | %.build:
23 | 	${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
24 | 
25 | %.clean:
26 | 	${MAKE} -C $* clean
27 | 


--------------------------------------------------------------------------------
/pkg/debian/.gitignore:
--------------------------------------------------------------------------------
1 | /*.debhelper.log
2 | /*.debhelper
3 | /*.substvars
4 | /tmp/
5 | /files
6 | /libnccl1/
7 | /libnccl-dev/
8 | 


--------------------------------------------------------------------------------
/pkg/debian/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | DEBPREPDIR := $(BUILDDIR)/debian
11 | PKGDIR  := $(BUILDDIR)/pkg/deb/
12 | 
13 | DEBGEN_IN  := $(wildcard *.in)
14 | DEBGEN     := $(DEBGEN_IN:.in=)
15 | DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
17 | 
18 | PKG_TIMESTAMP  := $(shell date -R)
19 | PKG_ARCH       ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
20 | PKG_MULTIARCH  ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
21 | 
22 | prep : $(DEBTARGETS)
23 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
24 | 
25 | build : prep
26 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
27 | 	@printf "Building Debian package\n"
28 | 	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
29 | 	mkdir -p $(PKGDIR)
30 | 	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
31 | 
32 | clean:
33 | 	rm -Rf $(DEBPREPDIR) $(PKGDIR)
34 | 
35 | $(DEBPREPDIR)/% : %.in
36 | 	@printf "Generating %-35s > %s\n" $< $@
37 | 	mkdir -p $(DEBPREPDIR)
38 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
39 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
40 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
41 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
42 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
43 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
44 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
45 | 	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
46 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
47 | 	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
48 | 	    $< > $@
49 | 
50 | $(DEBPREPDIR)/% : %
51 | 	@printf "Grabbing   %-35s > %s\n" $< $@
52 | 	mkdir -p $(DEBPREPDIR)
53 | 	cp -f $< $@
54 | 


--------------------------------------------------------------------------------
/pkg/debian/changelog.in:
--------------------------------------------------------------------------------
1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
2 | 
3 |   * Automatic Debian package from build
4 | 
5 |  -- cudatools <cudatools@nvidia.com>  ${pkg:Timestamp}
6 | 


--------------------------------------------------------------------------------
/pkg/debian/compat:
--------------------------------------------------------------------------------
1 | 9
2 | 


--------------------------------------------------------------------------------
/pkg/debian/control.in:
--------------------------------------------------------------------------------
 1 | Source: nccl
 2 | Section: libs
 3 | Maintainer: cudatools <cudatools@nvidia.com>
 4 | Priority: optional
 5 | Build-depends: debhelper(>=9)
 6 | Standards-Version: 3.9.5
 7 | 
 8 | Package: libnccl${nccl:Major}
 9 | Section: libs
10 | Architecture: ${pkg:Arch}
11 | Depends: ${misc:Depends}, ${shlibs:Depends}
12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime
13 |  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
14 |  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
15 |  broadcast, and reduce-scatter.
16 |  It has been optimized to achieve high bandwidth on any platform using PCIe,
17 |  NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
18 |  sockets.
19 | 
20 | Package: libnccl-dev
21 | Section: libdevel
22 | Architecture: ${pkg:Arch}
23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files
25 |  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
26 |  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
27 |  broadcast, and reduce-scatter.
28 |  It has been optimized to achieve high bandwidth on any platform using PCIe,
29 |  NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
30 |  sockets.
31 | 


--------------------------------------------------------------------------------
/pkg/debian/copyright:
--------------------------------------------------------------------------------
1 | ../../LICENSE.txt


--------------------------------------------------------------------------------
/pkg/debian/gbp.conf:
--------------------------------------------------------------------------------
 1 | [DEFAULT]
 2 | debian-branch   = master
 3 | upstream-branch = master
 4 | 
 5 | ignore-new = True
 6 | 
 7 | [git-buildpackage]
 8 | 
 9 | no-purge = True
10 | 


--------------------------------------------------------------------------------
/pkg/debian/libnccl-dev.install.in:
--------------------------------------------------------------------------------
1 | bin/ncclras /usr/bin
2 | include/nccl.h /usr/include
3 | lib/libnccl.so /usr/lib/${pkg:MultiArch}
4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
5 | 


--------------------------------------------------------------------------------
/pkg/debian/libnccl2.install.in:
--------------------------------------------------------------------------------
1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
3 | 


--------------------------------------------------------------------------------
/pkg/debian/rules:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | 
 3 | %:
 4 | 	dh $@ --parallel
 5 | 
 6 | override_dh_auto_install:
 7 | 	PREFIX=debian/tmp dh_auto_install
 8 | 
 9 | override_dh_auto_test:
10 | 	# Do not make test
11 | 
12 | override_dh_auto_clean:
13 | 	# Do not make clean
14 | 
15 | override_dh_builddeb:
16 | 	dh_builddeb -- -Zxz
17 | 


--------------------------------------------------------------------------------
/pkg/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (native)
2 | 


--------------------------------------------------------------------------------
/pkg/redhat/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | RPMPREPDIR := $(BUILDDIR)/redhat
11 | PKGDIR  := $(BUILDDIR)/pkg/rpm/
12 | 
13 | RPMGEN_IN  := $(wildcard *.in)
14 | RPMGEN     := $(RPMGEN_IN:.in=)
15 | RPMFILES   := $(RPMGEN)
16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
17 | 
18 | PKG_TIMESTAMP  := $(shell date -R)
19 | ARCH           := $(shell uname -m)
20 | PKG_ARCH       ?= $(shell uname -m)
21 | PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
22 | ifeq ($(PKG_MULTIARCH),)
23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
24 | PKG_MULTIARCH  := $(ARCH)-linux-gnu
25 | endif
26 | 
27 | prep : $(RPMTARGETS)
28 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
29 | 
30 | build : prep
31 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
32 | 	$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
33 | 	@printf "Building Redhat package\n"
34 | 	mkdir -p $(PKGDIR)
35 | 	rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
36 |                  --define "_rpmdir $(PKGDIR)" \
37 |                  --define "_builddir $(PKGDIR)/build/" \
38 |                  --define "_buildrootdir $(PKGDIR)/buildroot/" \
39 |                  -bb $(BUILDDIR)/redhat/nccl.spec
40 | 
41 | clean:
42 | 	rm -Rf $(RPMPREPDIR) $(PKGDIR)
43 | 
44 | $(RPMPREPDIR)/% : %.in
45 | 	@printf "Generating %-35s > %s\n" $< $@
46 | 	mkdir -p $(RPMPREPDIR)
47 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
48 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
49 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
50 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
51 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
52 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
53 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
54 | 	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
55 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
56 | 	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
57 | 	    $< > $@
58 | 
59 | $(RPMPREPDIR)/% : %
60 | 	@printf "Grabbing   %-35s > %s\n" $< $@
61 | 	mkdir -p $(RPMPREPDIR)
62 | 	cp -f $< $@
63 | 


--------------------------------------------------------------------------------
/pkg/redhat/nccl.spec.in:
--------------------------------------------------------------------------------
 1 | Name:           libnccl
 2 | Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
 3 | Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
 4 | Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 5 | 
 6 | Group:          Development/Libraries
 7 | License:        BSD
 8 | URL:            http://developer.nvidia.com/nccl
 9 | Source0:        nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
10 | Requires(pre,preun): /sbin/ldconfig
11 | 
12 | %description
13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective
14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce,
15 | broadcast, and reduce-scatter.
16 | It has been optimized to achieve high bandwidth on any platform using PCIe,
17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
18 | sockets.
19 | 
20 | %package devel
21 | Summary:        NVIDIA Collective Communication Library (NCCL) Development Files
22 | Group:          Development/Libraries
23 | Requires:       libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
24 | %description devel
25 | NCCL development files
26 | 
27 | %package static
28 | Summary:        NVIDIA Collective Communication Library (NCCL) Static Library
29 | Group:          Development/Libraries
30 | %description static
31 | NCCL static library
32 | 
33 | %define debug_package %{nil}
34 | 
35 | %prep
36 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
37 | 
38 | %build
39 | 
40 | %install
41 | rm -rf $RPM_BUILD_ROOT
42 | install -m 755 -d $RPM_BUILD_ROOT
43 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
44 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
45 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
46 | 
47 | # devel
48 | install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
49 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
50 | install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
51 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
52 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
53 | 
54 | # static
55 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
56 | 
57 | %post -p /sbin/ldconfig
58 | %postun -p /sbin/ldconfig
59 | 
60 | %post devel -p /sbin/ldconfig
61 | %postun devel -p /sbin/ldconfig
62 | 
63 | %clean
64 | rm -rf $RPM_BUILD_ROOT
65 | 
66 | %files devel
67 | %doc LICENSE.txt
68 | %defattr(-,root,root,-)
69 | %{_bindir}/ncclras
70 | %{_includedir}/nccl.h
71 | %{_libdir}/libnccl.so
72 | 
73 | %files static
74 | %doc LICENSE.txt
75 | %defattr(-,root,root,-)
76 | %{_libdir}/libnccl_static.a
77 | 
78 | %files
79 | %doc LICENSE.txt
80 | %defattr(-,root,root,-)
81 | %{_libdir}/libnccl.so.${nccl:Major}
82 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
83 | 
84 | %changelog
85 | 


--------------------------------------------------------------------------------
/pkg/srctxz/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | TXZPREPDIR  := $(BUILDDIR)/srctxz
11 | PKGDIR  := $(BUILDDIR)/pkg/srctxz/
12 | 
13 | TXZGEN_IN  := $(wildcard *.in)
14 | TXZGEN     := $(TXZGEN_IN:.in=)
15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
16 | 
17 | PKG_REVISION   ?= 3
18 | PKG_ARCH       := $(shell uname -m)
19 | 
20 | prep: $(TXZTARGETS)
21 | 
22 | build: prep
23 | 	$(MAKE) -C ../../src clean
24 | 	@printf "Building source tar.xz package\n"
25 | 	(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
26 | 	mkdir -p $(PKGDIR)
27 | 	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
28 | 
29 | clean:
30 | 	rm -Rf $(TXZPREPDIR) $(PKGDIR)
31 | 
32 | $(TXZPREPDIR)/% : %.in
33 | 	@printf "Generating %-35s > %s\n" $< $@
34 | 	mkdir -p $(TXZPREPDIR)
35 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
36 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
37 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
38 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
39 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
40 | 	    $< > $@
41 | 


--------------------------------------------------------------------------------
/pkg/srctxz/create_srctxz.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # See LICENSE.txt for license information
 6 | #
 7 | 
 8 | # To run from $BUILDDIR/
 9 | 
10 | cd ..
11 | NCCLDIR=`basename $PWD`
12 | 
13 | echo "Checking for unclean directory ..."
14 | git clean -x -i  # interactive clean; -x also includes files ignored by git
15 | echo "Clean done."
16 | echo "Checking for uncommitted files ..."
17 | if [ "`git status -s | wc -l`" != "0" ]; then  # any short-status line means pending changes
18 |   git status -s
19 |   echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
20 |   read  # wait for the user to confirm before packaging
21 | fi
22 | 
23 | cd ..
24 | NCCL_MAJOR=${nccl:Major}
25 | NCCL_MINOR=${nccl:Minor}
26 | NCCL_PATCH=${nccl:Patch}
27 | NCCL_SUFFIX=${nccl:Suffix}
28 | NCCL_BUILD=${pkg:Revision}
29 | 
30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
31 | 
32 | tar --exclude build \
33 |     --exclude ".git*" \
34 |     --exclude pkg/srctxz \
35 |     --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
36 | 


--------------------------------------------------------------------------------
/pkg/txz/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | TXZPREPDIR  := $(BUILDDIR)/txz
11 | PKGDIR  := $(BUILDDIR)/pkg/txz/
12 | 
13 | TXZGEN_IN  := $(wildcard *.in)
14 | TXZGEN     := $(TXZGEN_IN:.in=)
15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
16 | 
17 | PKG_ARCH   := $(shell uname -m)
18 | 
19 | prep: $(TXZTARGETS)
20 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
21 | 
22 | build: prep
23 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
24 | 	@printf "Building tar.xz package\n"
25 | 	(cd $(BUILDDIR); bash txz/create_txz.sh)
26 | 	mkdir -p $(PKGDIR)
27 | 	mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
28 | 
29 | clean:
30 | 	rm -Rf $(TXZPREPDIR) $(PKGDIR)
31 | 
32 | $(TXZPREPDIR)/% : %.in
33 | 	@printf "Generating %-35s > %s\n" $< $@
34 | 	mkdir -p $(TXZPREPDIR)
35 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
36 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
37 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
38 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
39 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
40 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
41 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
42 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
43 | 	    $< > $@
44 | 


--------------------------------------------------------------------------------
/pkg/txz/create_txz.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # See LICENSE.txt for license information
 6 | #
 7 | 
 8 | # To run from $BUILDDIR/
 9 | 
10 | BUILDDIR=`basename $PWD`
11 | 
12 | cd ..
13 | NCCL_MAJOR=${nccl:Major}
14 | NCCL_MINOR=${nccl:Minor}
15 | NCCL_PATCH=${nccl:Patch}
16 | NCCL_SUFFIX=${nccl:Suffix}
17 | CUDA_MAJOR=${cuda:Major}
18 | CUDA_MINOR=${cuda:Minor}
19 | PKG_REVISION=${pkg:Revision}
20 | PKG_ARCH=${pkg:Arch}
21 | 
22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
23 | 
24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
25 | 


--------------------------------------------------------------------------------
/src/device/broadcast.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "primitives.h"
10 | 
11 | namespace {
12 |   template<typename T, typename RedOp, typename Proto> // Ring broadcast body shared by the RunWorkColl<ncclFuncBroadcast, ...> specializations below
13 |   __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
14 |     ncclRing *ring = &ncclShmem.channel.ring;
15 |     const int rank = ring->userRanks[0]; // userRanks[0] is treated as the local rank
16 |     const int nextRank = ring->userRanks[1]; // userRanks[1] is treated as the next rank in the ring
17 |     const int root = work->root;
18 |     ssize_t chunkCount;
19 |     ssize_t channelCount;
20 |     ssize_t gridOffset;
21 |     ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); // outputs gridOffset, channelCount and chunkCount for ncclShmem.channelId
22 |     size_t offset;
23 |     int nelem;
24 |     int workNthreads;
25 |     bool isNetOffload = work->isOneRPN && work->netRegUsed; // net offload needs both flags set
26 | 
27 |     T *inputBuf = (T*)work->sendbuff;
28 |     T *outputBuf = (T*)work->recvbuff;
29 |     workNthreads = isNetOffload ? WARP_SIZE : nthreads; // with net offload only WARP_SIZE threads drive the primitives
30 | 
31 |     if (tid < workNthreads) {
32 |       // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
33 |       // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
34 |       // coverity[callee_ptr_arith:FALSE]
35 |       Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
36 |         prims(tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
37 | 
38 |       for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { // step through channelCount elements, chunkCount at a time
39 |         offset = gridOffset + elemOffset;
40 |         nelem = min(chunkCount, channelCount - elemOffset); // the final chunk may be shorter
41 | 
42 |         if (rank == root) { // root only sends
43 |           if (inputBuf == outputBuf || isNetOffload) { // in-place or net offload: send without the copy variant
44 |             prims.directSend(offset, offset, nelem);
45 |           } else {
46 |             prims.directCopySend(offset, offset, nelem);
47 |           }
48 |         } else if (nextRank == root) { // the rank whose successor is root only receives
49 |           prims.directRecv(offset, nelem);
50 |         } else { // every other rank receives and forwards
51 |           prims.directRecvCopyDirectSend(offset, offset, nelem);
52 |         }
53 |       }
54 |     } else if (inputBuf != outputBuf && rank == root) { // leftover threads (tid >= workNthreads) on root when the buffers differ
55 |       inputBuf = inputBuf + gridOffset; // advance both buffers by gridOffset elements
56 |       outputBuf = outputBuf + gridOffset;
57 |       reduceCopy<COLL_UNROLL, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs=*/0>
58 |         (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, channelCount); // one source, one destination, channelCount elements, thread ids rebased to the leftover group
59 |     }
60 |     if (isNetOffload) barrier_sync(14, nthreads); // with net offload, barrier_sync is called on barrier 14 with the full nthreads count
61 |   }
62 | }
63 | 
64 | template<typename T, typename RedOp>
65 | struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
66 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
67 |     using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
68 |     runRing<T, RedOp, Proto>(tid, nthreads, work);
69 |   }
70 | };
71 | 
72 | template<typename T, typename RedOp>
73 | struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
74 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
75 |     runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
76 |   }
77 | };
78 | 
79 | template<typename T, typename RedOp>
80 | struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
81 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
82 |     runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
83 |   }
84 | };
85 | 


--------------------------------------------------------------------------------
/src/device/common.cu:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "common.h"
10 | 
11 | __shared__ ncclShmemData ncclShmem;
12 | #if __CUDA_ARCH__ < 700
13 |   __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
14 | #endif
15 | 
16 | struct RunWorkNop {
17 |   __device__ void run() {}
18 | };
19 | 
20 | __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
21 |   ncclKernelMain<-1, RunWorkNop>(&args4K.args);
22 | }
23 | 
24 | __device__ void ncclDevFunc_Nop() {}
25 | 


--------------------------------------------------------------------------------
/src/device/network/unpack/unpack_defs.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, Google LLC.  All rights reserved.
 3 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | #ifndef NET_DEVICE_UNPACK_DEFS_H
 8 | #define NET_DEVICE_UNPACK_DEFS_H
 9 | 
10 | #include <stdint.h>
11 | 
12 | #include "device.h"
13 | 
14 | #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16
15 | 
16 | union alignas(16) loadMeta { // 16-byte record viewable as two raw 64-bit words or as (src_off, len, dst_off)
17 |   uint64_t r64[2];
18 |   struct {
19 |     uint32_t src_off;
20 |     uint32_t len;
21 |     uint64_t dst_off;
22 |   };
23 | };
24 | static_assert(sizeof(union loadMeta) == 16, "loadMeta must be exactly 16 bytes"); // message now names the size condition that is actually checked
25 | 
26 | /****** global memory ******/
27 | 
28 | #define NET_UNPACK_MAX_QUEUE_DEPTH 16  // MAX_REQUESTS
29 | #define NET_UNPACK_MAX_SLICE_SIZE 4194304  // 4MB per Irecv call
30 | #define SLICE_PAGE_SIZE 4096
31 | #define NET_UNPACK_MAX_SLICE_PAGES \
32 |   (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2)  // * 2 for slack, wasteful..
33 | 
34 | struct netUnpackMeta {
35 |   loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES];
36 |   uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH];
37 | };
38 | 
// Handle grouping the metadata pointer, a buffer pointer and a 64-bit head
// value used by the unpack path.
struct unpackNetDeviceHandle {
  struct netUnpackMeta *meta;  // mapped
  void* bounce_buf;            // NOTE(review): presumably the buffer that loadMeta::src_off indexes — confirm
  uint64_t head;               // NOTE(review): presumably the current queue position — confirm
};
44 | 
45 | /****** shared memory ******/
46 | 
47 | #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h
48 | #define NET_UNPACK_MAX_NPEERS 2  // The most you should have is 2 network peers per-group (indexed by index)
49 | #define WARP_SHM_PAGE_CNT 4
50 | #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta))
// Shared-memory state for unpack: currently a single buffer pointer.
struct unpackShmem {
  void* bounce_buf;  // NOTE(review): presumably a copy of unpackNetDeviceHandle::bounce_buf — confirm
};
54 | 
// Per-group shared-memory state for unpack. The two arrays have one entry per
// network peer (NET_UNPACK_MAX_NPEERS).
struct unpackGroupShmem {
  // NOTE(review): the field is named as a mask while the original comment
  // below speaks of a single index — confirm which description is current.
  int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
  uint64_t head[NET_UNPACK_MAX_NPEERS];
  struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
};
60 | 
61 | #endif // NET_DEVICE_UNPACK_DEFS_H_
62 | 


--------------------------------------------------------------------------------
/src/device/onerank.cu:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "alloc.h"
 8 | #include "collectives.h"
 9 | #include "common_kernel.h"
10 | #include "common.h"
11 | #include <cuda_runtime.h>
12 | 
namespace {
  // Single-rank reduction kernel.
  //
  // Splits the nElts elements of `src` across the blocks of the grid and hands
  // each block's segment to reduceCopy, with redOpArg supplied as the pre-op
  // argument (PreOpSrcs=1), writing the result to `dst`.
  //
  //   dst, src        output / input buffers holding nElts elements of RedOp::EltType
  //   nElts           total number of elements
  //   redOpArg        either the scalar argument itself or the address it is stored at
  //   redOpArgIsPtr   true when redOpArg is an address that must be dereferenced
  template<typename RedOp>
  __global__ __launch_bounds__(512, 1)
  void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) {
    using T = typename RedOp::EltType;
    int tid = threadIdx.x;
    int tn = blockDim.x;
    int bid = blockIdx.x;
    int bn = gridDim.x;

    // each block/channel gets a roughly equal segment of 16 byte packs
    constexpr int EltPerPack = 16/sizeof(T);
    // The per-block share is rounded UP before being aligned to a pack. With a
    // floor division, bn*share can fall short of nElts whenever nElts/bn is
    // already a multiple of EltPerPack but nElts is not divisible by bn, and
    // the trailing elements would never be processed.
    intptr_t i0 = (bid+0)*alignUp((nElts + bn - 1)/bn, EltPerPack);
    intptr_t i1 = (bid+1)*alignUp((nElts + bn - 1)/bn, EltPerPack);
    // Clamp so that surplus blocks get an empty segment.
    i0 = min(i0, nElts);
    i1 = min(i1, nElts);
    src = (T*)src + i0;
    dst = (T*)dst + i0;

    if (redOpArgIsPtr) {
      // The alignment of the address selects the width of the load: the
      // widest of 1/2/4/8 bytes that the address is aligned to.
      if (redOpArg%2 != 0) {
        redOpArg = *reinterpret_cast<uint8_t*>(redOpArg);
      } else if (redOpArg%4 != 0) {
        redOpArg = *reinterpret_cast<uint16_t*>(redOpArg);
      } else if (redOpArg%8 != 0) {
        redOpArg = *reinterpret_cast<uint32_t*>(redOpArg);
      } else {
        redOpArg = *reinterpret_cast<uint64_t*>(redOpArg);
      }
    }
    reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
      (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0);
  }
}
47 | 
// Host-side launcher for a collective on a single rank.
//
// For every reduction op other than ncclDevPreMulSum the result equals the
// input, so the data is copied with ncclCudaMemcpyAsync (skipped entirely
// when dst == src). For ncclDevPreMulSum the oneRankReduce kernel matching
// eltType is launched on `stream`.
//
// Returns ncclInvalidArgument for an element type with no kernel, otherwise
// ncclSuccess or the error propagated by NCCLCHECK / CUDACHECK.
ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) {
  size_t eltSize = ncclTypeSize(eltType);
  if (redOp.op != ncclDevPreMulSum) {
    if (dst != src) {
      NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream));
    }
    return ncclSuccess;
  }

  void const* kernel;
  switch (eltType) {
  case ncclInt8:     kernel = (void const*)&oneRankReduce<FuncPreMulSum<int8_t>>; break;
  case ncclUint8:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint8_t>>; break;
  case ncclInt32:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int32_t>>; break;
  case ncclUint32:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
  case ncclInt64:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
  case ncclUint64:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
  // NOTE(review): __CUDA_ARCH__ is normally undefined while compiling host
  // code, which would make this condition false here — confirm the FP8 cases
  // are meant to be excluded from this host-side switch.
  #if defined(__CUDA_FP8_TYPES_EXIST__) && __CUDA_ARCH__ >= 900
  case ncclFloat8e4m3: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e4m3>>; break;
  case ncclFloat8e5m2: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e5m2>>; break;
  #endif
  case ncclFloat16:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
  #if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
  #endif
  case ncclFloat32:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<float>>; break;
  case ncclFloat64:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<double>>; break;
  default: return ncclInvalidArgument;
  }
  // Nothing to scale. Without this guard grid.x below would be 0, which is
  // not a valid launch configuration. Placed after the switch so that an
  // unsupported element type is still reported.
  if (nElts == 0) return ncclSuccess;

  // One block per 16 KiB of payload, capped at 32 blocks.
  dim3 grid = {0, 1, 1};
  grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10));
  dim3 block = {512, 1, 1};
  void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr};
  CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream));
  return ncclSuccess;
}
84 | 


--------------------------------------------------------------------------------
/src/device/reduce.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "primitives.h"
10 | 
namespace {
  // Ring implementation of Reduce for one channel.
  //
  // The channel's share of the data (gridOffset / channelCount, as computed by
  // ncclCollCbdPart) is processed in pieces of at most chunkCount elements.
  // What a rank does with each piece depends on its position:
  //   - the rank whose prevRank is the root only sends;
  //   - the root receives, reduces and stores the result (with postOp);
  //   - every other rank receives, reduces and forwards.
  template<typename T, typename RedOp, typename Proto>
  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    const int nranks = ncclShmem.comm.nRanks;
    const int rank = ncclShmem.comm.rank;
    const int root = work->root;
    // prevRank is the last entry of the ring's userRanks array.
    const int prevRank = ring->userRanks[nranks-1];

    size_t gridOffset;
    size_t channelCount;
    size_t chunkCount;
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);

    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
    // coverity[callee_ptr_arith:FALSE]
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
      prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);

    if (prevRank == root) {
      // Our predecessor is the root: only send.
      for (size_t done = 0; done < channelCount; done += chunkCount) {
        size_t offset = gridOffset + done;
        int nelem = min(chunkCount, channelCount - done);
        prims.send(offset, nelem);
      }
    } else if (rank == root) {
      // We are the root: receive, reduce and write the final result.
      for (size_t done = 0; done < channelCount; done += chunkCount) {
        size_t offset = gridOffset + done;
        int nelem = min(chunkCount, channelCount - done);
        prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
      }
    } else {
      // Intermediate rank: receive, reduce and forward.
      for (size_t done = 0; done < channelCount; done += chunkCount) {
        size_t offset = gridOffset + done;
        int nelem = min(chunkCount, channelCount - done);
        prims.recvReduceSend(offset, nelem);
      }
    }
  }
}
55 | 
// Reduce / ring algorithm / Simple protocol: runs runRing with a ProtoSimple
// configured from REDUCE_CHUNKSTEPS and REDUCE_SLICESTEPS.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
    runRing<T, RedOp, Proto>(tid, nthreads, work);
  }
};
63 | 
// Reduce / ring algorithm / LL protocol: runs runRing with ProtoLL.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
};
70 | 
// Reduce / ring algorithm / LL128 protocol: runs runRing with ProtoLL128.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
};
77 | 


--------------------------------------------------------------------------------
/src/device/symmetric/kernel.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_
 2 | #define NCCL_DEVICE_SYMMETRIC_KERNEL_H_
 3 | 
 4 | #include "symmetric.h"
 5 | 
 6 | template<template<typename> typename Red, typename T>
 7 | __device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args);
 8 | template<template<typename> typename Red, typename T>
 9 | __device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args);
10 | 
11 | template<template<typename> typename Red, typename T>
12 | __device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args);
13 | template<template<typename> typename Red, typename T>
14 | __device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args);
15 | 
16 | __device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args);
17 | __device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args);
18 | __device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args);
19 | __device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args);
20 | 
21 | template<template<typename> typename Red, typename T>
22 | __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args);
23 | template<template<typename> typename Red, typename T>
24 | __device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args);
25 | template<template<typename> typename Red, typename T>
26 | __device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args);
27 | #endif
28 | 


--------------------------------------------------------------------------------
/src/enhcompat.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
 8 | 
 9 | enum cudaError_t { cudaErrorStubLibrary = 34 };
10 | 
11 | extern "C" {
12 | 
13 | cudaError_t cudaStreamGetCaptureInfo_v2(...)         __attribute__((visibility("hidden"))) __attribute((weak));
14 | cudaError_t cudaStreamGetCaptureInfo_v2(...)         { return cudaErrorStubLibrary; }
15 | 
16 | cudaError_t cudaUserObjectCreate(...)                __attribute__((visibility("hidden"))) __attribute((weak));
17 | cudaError_t cudaUserObjectCreate(...)                { return cudaErrorStubLibrary; }
18 | 
19 | cudaError_t cudaGraphRetainUserObject(...)           __attribute__((visibility("hidden"))) __attribute((weak));
20 | cudaError_t cudaGraphRetainUserObject(...)           { return cudaErrorStubLibrary; }
21 | 
22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
24 | 
25 | cudaError_t cudaGetDriverEntryPoint(...)             __attribute__((visibility("hidden"))) __attribute((weak));
26 | cudaError_t cudaGetDriverEntryPoint(...)             { return cudaErrorStubLibrary; }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/src/graph/rings.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "core.h"
 8 | 
 9 | void dumpLine(int* values, int nranks, const char* prefix) {
10 |   constexpr int line_length = 128;
11 |   char line[line_length];
12 |   int num_width = snprintf(nullptr, 0, "%d", nranks-1);  // safe as per "man snprintf"
13 |   int n = snprintf(line, line_length, "%s", prefix);
14 |   for (int i = 0; i < nranks && n < line_length-1; i++) {
15 |     n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]);
16 |     // At this point n may be more than line_length-1, so don't use it
17 |     // for indexing into "line".
18 |   }
19 |   if (n >= line_length) {
20 |     // Sprintf wanted to write more than would fit in the buffer. Assume
21 |     // line_length is at least 4 and replace the end with "..." to
22 |     // indicate that it was truncated.
23 |     snprintf(line+line_length-4, 4, "...");
24 |   }
25 |   INFO(NCCL_INIT, "%s", line);
26 | }
27 | 
28 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
29 |   for (int r=0; r<nrings; r++) {
30 |     char prefix[40];
31 |     /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
32 |     dumpLine(prev+r*nranks, nranks, prefix);
33 |     sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
34 |     dumpLine(next+r*nranks, nranks, prefix);*/
35 | 
36 |     int current = rank;
37 |     for (int i=0; i<nranks; i++) {
38 |       rings[r*nranks+i] = current;
39 |       current = next[r*nranks+current];
40 |     }
41 |     snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings);
42 |     if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
43 |     if (current != rank) {
44 |       WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
45 |       return ncclInternalError;
46 |     }
47 |     // Check that all ranks are there
48 |     for (int i=0; i<nranks; i++) {
49 |       int found = 0;
50 |       for (int j=0; j<nranks; j++) {
51 |         if (rings[r*nranks+j] == i) {
52 |           found = 1;
53 |           break;
54 |         }
55 |       }
56 |       if (found == 0) {
57 |         WARN("Error : ring %d does not contain rank %d", r, i);
58 |         return ncclInternalError;
59 |       }
60 |     }
61 |   }
62 |   return ncclSuccess;
63 | }
64 | 


--------------------------------------------------------------------------------
/src/graph/rings.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * See LICENSE.txt for license information
5 |  ************************************************************************/
6 | 
7 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
8 | 


--------------------------------------------------------------------------------
/src/include/allocator.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ALLOCATOR_H_
 8 | #define NCCL_ALLOCATOR_H_
 9 | 
10 | ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
11 | ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/include/argcheck.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ARGCHECK_H_
 8 | #define NCCL_ARGCHECK_H_
 9 | 
10 | #include "core.h"
11 | #include "info.h"
12 | 
13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
14 | ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname);
15 | ncclResult_t ArgsCheck(struct ncclInfo* info);
16 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/include/bootstrap.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_BOOTSTRAP_H_
 8 | #define NCCL_BOOTSTRAP_H_
 9 | 
10 | #include "nccl.h"
11 | #include "comm.h"
12 | 
// Bootstrap handle: a 64-bit magic value together with a socket address.
// The static_assert below guarantees that the handle fits inside an
// ncclUniqueId.
struct ncclBootstrapHandle {
  uint64_t magic;
  union ncclSocketAddress addr;
};
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
18 | 
19 | ncclResult_t bootstrapNetInit();
20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
22 | ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
23 | ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
25 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
27 | ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag);
28 | ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size);
29 | ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
30 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
31 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
32 | ncclResult_t bootstrapClose(void* commState);
33 | ncclResult_t bootstrapAbort(void* commState);
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/include/channel.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CHANNEL_H_
 8 | #define NCCL_CHANNEL_H_
 9 | #include "comm.h"
10 | #include "utils.h"
11 | 
12 | #include <algorithm>
13 | 
14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid);
15 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
16 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
17 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
18 | 
19 | inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) {
20 |   if (comm->nNodes > 1) {
21 |     int nodeDelta = p2pRound/comm->maxLocalRanks;
22 |     int localDelta = p2pRound%comm->maxLocalRanks;
23 |     int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
24 |     base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
25 |     return base & 0xff;
26 |   } else {
27 |     return p2pRound & 0xff;
28 |   }
29 | }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/include/coll_net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COLL_NET_H_
 8 | #define COLL_NET_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_net.h"
12 | 
13 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
14 | 
15 | // Translation to external API
// Thin wrappers around the function table in comm->ncclCollNet. Each one
// forwards its arguments unchanged to the member of the same purpose and, for
// those returning ncclResult_t, propagates a failure through NCCLCHECK.
// None of them checks comm->ncclCollNet for null; see collNetSupport() below.

// Name and device enumeration.
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
// Connection setup.
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
// Capability query and memory registration.
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
// Asynchronous operations: each hands back a request through `request`,
// which collNetTest() takes to report `done` and `size`.
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
  NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
// Teardown.
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }

// Returns 1 when the communicator has a collNet function table, 0 otherwise.
static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/src/include/core.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CORE_H_
 8 | #define NCCL_CORE_H_
 9 | 
10 | #include <pthread.h>
11 | #include <unistd.h>
12 | #include <stdlib.h>
13 | #include <stdint.h>
14 | #include <algorithm> // For std::min/std::max
15 | #include "nccl.h"
16 | 
17 | #ifdef PROFAPI
18 | #define NCCL_API(ret, func, args...)        \
19 |     __attribute__ ((visibility("default"))) \
20 |     __attribute__ ((alias(#func)))          \
21 |     ret p##func (args);                     \
22 |     extern "C"                              \
23 |     __attribute__ ((visibility("default"))) \
24 |     __attribute__ ((weak))                  \
25 |     ret func(args)
26 | #else
27 | #define NCCL_API(ret, func, args...)        \
28 |     extern "C"                              \
29 |     __attribute__ ((visibility("default"))) \
30 |     ret func(args)
31 | #endif // end PROFAPI
32 | 
33 | #include "debug.h"
34 | #include "checks.h"
35 | #include "cudawrap.h"
36 | #include "alloc.h"
37 | #include "utils.h"
38 | #include "param.h"
39 | #include "nvtx.h"
40 | 
41 | #endif // end include guard
42 | 


--------------------------------------------------------------------------------
/src/include/cpuset.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CPUSET_H_
 8 | #define NCCL_CPUSET_H_
 9 | 
10 | // Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
11 | 
// Converts one hexadecimal digit to its value.
//
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Returns the value in [0, 15], or -1
// when `c` is not a hexadecimal digit.
static int hexToInt(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
  // Upper-case digits are valid hex too; rejecting them would make the
  // caller stop parsing in the middle of a mask.
  if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
  return -1;
}
19 | 
20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
21 | 
22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
23 |   uint32_t cpumasks[CPU_SET_N_U32];
24 |   int m = CPU_SET_N_U32-1;
25 |   cpumasks[m] = 0;
26 |   for (int o=0; o<strlen(str); o++) {
27 |     char c = str[o];
28 |     if (c == ',') {
29 |       m--;
30 |       cpumasks[m] = 0;
31 |     } else {
32 |       int v = hexToInt(c);
33 |       if (v == -1) break;
34 |       cpumasks[m] <<= 4;
35 |       cpumasks[m] += v;
36 |     }
37 |   }
38 |   // Copy cpumasks to mask
39 |   for (int a=0; m<CPU_SET_N_U32; a++,m++) {
40 |     memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
41 |   }
42 |   return ncclSuccess;
43 | }
44 | 
45 | static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
46 |   int c = 0;
47 |   uint8_t* m8 = (uint8_t*)mask;
48 |   for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
49 |     if (c == 0 && m8[o] == 0) continue;
50 |     sprintf(str+c, "%02x", m8[o]);
51 |     c+=2;
52 |     if (o && o%4 == 0) {
53 |       sprintf(str+c, ",");
54 |       c++;
55 |     }
56 |   }
57 |   str[c] = '\0';
58 |   return ncclSuccess;
59 | }
60 | 
61 | static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
62 |   int c = 0;
63 |   int start = -1;
64 |   // Iterate through all possible CPU bits plus one extra position
65 |   for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
66 |     int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
67 |     // Start of a new range
68 |     if (isSet && start == -1) {
69 |       start = cpu;
70 |     }
71 |     // End of a range, add comma between ranges
72 |     if (!isSet && start != -1) {
73 |       if (cpu-1 == start) {
74 |         c += snprintf(str+c, len-c, "%s%d", c ? "," : "", start);
75 |       } else {
76 |         c += snprintf(str+c, len-c, "%s%d-%d", c ? "," : "", start, cpu-1);
77 |       }
78 |       if (c >= len-1) break;
79 |       start = -1;
80 |     }
81 |   }
82 |   if (c == 0) str[0] = '\0';
83 |   return str;
84 | }
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/src/include/debug.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INT_DEBUG_H_
 8 | #define NCCL_INT_DEBUG_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_common.h"
12 | #include <stdio.h>
13 | 
14 | #include <pthread.h>
15 | 
16 | // Conform to pthread and NVTX standard
17 | #define NCCL_THREAD_NAMELEN 16
18 | 
19 | extern int ncclDebugLevel;
20 | extern FILE *ncclDebugFile;
21 | 
22 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
23 | 
24 | // Let code temporarily downgrade WARN into INFO
25 | extern thread_local int ncclDebugNoWarn;
26 | extern char ncclLastError[];
27 | 
28 | #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
29 | #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
30 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
31 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
32 | 
33 | #ifdef ENABLE_TRACE
34 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
35 | #else
36 | #define TRACE(...)
37 | #endif
38 | 
39 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
40 | 
41 | void ncclResetDebugInit();
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/src/include/enqueue.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ENQUEUE_H_
 8 | #define NCCL_ENQUEUE_H_
 9 | 
10 | #include "comm.h"
11 | #include "group.h"
12 | #include "collectives.h"
13 | #include "utils.h"
14 | 
15 | #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t)
16 | #define NCCL_LL128_ALIGNMENT_PER_WARP 480
17 | #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
18 | #define NCCL_BYTES_ALIGNMENT 16
19 | 
20 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize);
21 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
22 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
23 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
24 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
25 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
26 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
27 | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
28 | ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm);
29 | 
30 | static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) {
31 |   return (func != ncclFuncReduceScatter) ? count : nRanks*count; // only ReduceScatter scales the send count by nRanks
32 | }
33 | static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) {
34 |   return (func != ncclFuncAllGather) ? count : nRanks*count; // only AllGather scales the receive count by nRanks
35 | }
36 | static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
37 |   return (func != ncclFuncAllGather && func != ncclFuncReduceScatter) ? count : nRanks*count; // larger of the send and receive counts
38 | }
39 | 
40 | #endif // End include guard
41 | 


--------------------------------------------------------------------------------
/src/include/ibvsymbols.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_IBV_SYMBOLS_H_
 2 | #define NCCL_IBV_SYMBOLS_H_
 3 | 
 4 | #ifdef NCCL_BUILD_RDMA_CORE
 5 | #include <infiniband/verbs.h>
 6 | #else
 7 | #include "ibvcore.h"
 8 | #endif
 9 | 
10 | #include "nccl.h"
11 | 
12 | /* Table of IB verbs function pointers, one member per ibv_* entry point; filled in through buildIbvSymbols(). */
13 | struct ncclIbvSymbols {
14 |   int (*ibv_internal_fork_init)(void);
15 |   struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
16 |   void (*ibv_internal_free_device_list)(struct ibv_device **list);
17 |   const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
18 |   struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
19 |   int (*ibv_internal_close_device)(struct ibv_context *context);
20 |   int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
21 |   void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
22 |   int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
23 |   int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
24 |   int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
25 |   int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
26 |   struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
27 |   int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
28 |   struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
29 |   struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
30 |   /* DMA-BUF support */
31 |   struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
32 |   int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
33 |   struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
34 |   int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
35 |   struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
36 |   int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
37 |   int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
38 |   const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
39 |   int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
40 |   int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
41 | };
42 | 
43 | /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
44 | ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
45 | 
46 | #endif  // NCCL_IBV_SYMBOLS_H_
47 | 


--------------------------------------------------------------------------------
/src/include/info.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INFO_H_
 8 | #define NCCL_INFO_H_
 9 | 
10 | #include "nccl.h"
11 | #include "collectives.h"
12 | #include "core.h"
13 | #include "utils.h"
14 | 
15 | // Carries the arguments of one NCCL call (operation, buffers, count, datatype, reduction op, root/peer, communicator, stream) between functions
16 | struct ncclInfo {
17 |   ncclFunc_t coll;
18 |   const char* opName;
19 |   // NCCL Coll Args
20 |   const void* sendbuff;
21 |   void* recvbuff;
22 |   size_t count;
23 |   ncclDataType_t datatype;
24 |   ncclRedOp_t op;
25 |   int root; // peer for p2p operations
26 |   ncclComm_t comm;
27 |   cudaStream_t stream;
28 |   // Algorithm details
29 |   int chunkSteps;
30 |   int sliceSteps;
31 | };
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/src/include/ipcsocket.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See COPYRIGHT for license information
 5 |  */
 6 | 
 7 | #ifndef NCCL_IPCSOCKET_H
 8 | #define NCCL_IPCSOCKET_H
 9 | 
10 | #include "nccl.h"
11 | #include <stdio.h>
12 | #include <fcntl.h>
13 | #include <sys/mman.h>
14 | #include <unistd.h>
15 | #include <errno.h>
16 | #include <sys/wait.h>
17 | #include <sys/types.h>
18 | #include <sys/socket.h>
19 | #include <memory.h>
20 | #include <sys/un.h>
21 | #include <inttypes.h>
22 | 
23 | #define NCCL_IPC_SOCKNAME_LEN 64
24 | // Handle used by the ncclIpcSocket* functions: a file descriptor, a name buffer of NCCL_IPC_SOCKNAME_LEN chars, and a pointer to a caller-supplied abort flag.
25 | struct ncclIpcSocket {
26 |   int fd;
27 |   char socketName[NCCL_IPC_SOCKNAME_LEN];
28 |   volatile uint32_t* abortFlag;
29 | };
30 | 
31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
33 | ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
34 | 
35 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
36 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
37 | 
38 | ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash);
39 | ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd);
40 | 
41 | #endif /* NCCL_IPCSOCKET_H */
42 | 


--------------------------------------------------------------------------------
/src/include/mlx5/mlx5dvcore.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_MLX5DV_CORE_H_
 2 | #define NCCL_MLX5DV_CORE_H_
 3 | 
 4 | /* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without
 5 |  * explicitly including the MLX5 direct verbs header.
 6 |  */
 7 | 
 8 | #include <stddef.h>
 9 | #include <stdint.h>
10 | #include <sys/types.h>
11 | #include <unistd.h>
12 | #include "ibvwrap.h"
13 | /* Access flag bits for mlx5dv DMA-BUF registration (presumably the mlx5_access argument of mlx5dv_reg_dmabuf_mr -- TODO confirm). */
14 | enum mlx5dv_reg_dmabuf_access  {
15 | 	MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT		= (1<<0),
16 | };
17 | 
18 | #endif  // NCCL_MLX5DV_CORE_H_
19 | 


--------------------------------------------------------------------------------
/src/include/mlx5/mlx5dvsymbols.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_MLX5DV_SYMBOLS_H_
 2 | #define NCCL_MLX5DV_SYMBOLS_H_
 3 | 
 4 | #ifdef NCCL_BUILD_MLX5DV
 5 | #include <infiniband/mlx5dv.h>
 6 | #else
 7 | #include "mlx5/mlx5dvcore.h"
 8 | #endif
 9 | 
10 | #include "nccl.h"
11 | 
12 | /* Table of MLX5 direct verbs function pointers, one member per mlx5dv_* entry point; filled in through buildMlx5dvSymbols(). */
13 | struct ncclMlx5dvSymbols {
14 |   bool (*mlx5dv_internal_is_supported)(struct ibv_device *device);
15 |   int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len);
16 |   /* DMA-BUF support */
17 |   struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
18 |   };
19 | 
20 | /* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */
21 | ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols);
22 | 
23 | #endif  // NCCL_MLX5DV_SYMBOLS_H_
24 | 


--------------------------------------------------------------------------------
/src/include/mlx5/mlx5dvwrap.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 3 |  * Copyright (c) 2004, 2011-2012 Intel Corporation.  All rights reserved.
 4 |  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
 5 |  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
 6 |  *
 7 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 8 |  *
 9 |  * See LICENSE.txt for license information
10 |  ************************************************************************/
11 | 
12 | #ifndef NCCL_MLX5DVWRAP_H_
13 | #define NCCL_MLX5DVWRAP_H_
14 | 
15 | #include <arpa/inet.h>
16 | #include <netinet/in.h>
17 | #ifdef NCCL_BUILD_MLX5DV
18 | #include <infiniband/mlx5dv.h>
19 | #else
20 | #include "mlx5/mlx5dvcore.h"
21 | #endif
22 | 
23 | #include "core.h"
24 | #include "ibvwrap.h"
25 | #include <sys/types.h>
26 | #include <unistd.h>
27 | // Return-code enumeration for MLX5 direct verbs; only the success value (0) is defined here.
28 | typedef enum mlx5dv_return_enum
29 | {
30 |     MLX5DV_SUCCESS = 0,                   //!< The operation was successful
31 | } mlx5dv_return_t;
32 | 
33 | ncclResult_t wrap_mlx5dv_symbols(void);
34 | /* NCCL wrappers of MLX5 direct verbs functions */
35 | bool wrap_mlx5dv_is_supported(struct ibv_device *device);
36 | ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len);
37 | /* DMA-BUF support */
38 | ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
39 | struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
40 | 
41 | #endif // NCCL_MLX5DVWRAP_H_
42 | 


--------------------------------------------------------------------------------
/src/include/mnnvl.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_MNNVL_H_
 8 | #define NCCL_MNNVL_H_
 9 | 
10 | #include "nccl.h"
11 | #include "comm.h"
12 | 
13 | ncclResult_t ncclMnnvlCheck(struct ncclComm* comm);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/src/include/nccl_common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_DEBUG_H_
 8 | #define NCCL_DEBUG_H_
 9 | 
10 | #include <cstdint>
11 | // Log levels; this is the type of the first argument of ncclDebugLogger_t.
12 | typedef enum {
13 |   NCCL_LOG_NONE = 0,
14 |   NCCL_LOG_VERSION = 1,
15 |   NCCL_LOG_WARN = 2,
16 |   NCCL_LOG_INFO = 3,
17 |   NCCL_LOG_ABORT = 4,
18 |   NCCL_LOG_TRACE = 5
19 | } ncclDebugLogLevel;
20 | // Subsystem identifiers: each named value is a distinct single bit, so values can be OR-ed into a mask; NCCL_ALL (~0) has every bit set.
21 | typedef enum {
22 |   NCCL_INIT = 0x1,
23 |   NCCL_COLL = 0x2,
24 |   NCCL_P2P = 0x4,
25 |   NCCL_SHM = 0x8,
26 |   NCCL_NET = 0x10,
27 |   NCCL_GRAPH = 0x20,
28 |   NCCL_TUNING = 0x40,
29 |   NCCL_ENV = 0x80,
30 |   NCCL_ALLOC = 0x100,
31 |   NCCL_CALL = 0x200,
32 |   NCCL_PROXY = 0x400,
33 |   NCCL_NVLS = 0x800,
34 |   NCCL_BOOTSTRAP = 0x1000,
35 |   NCCL_REG = 0x2000,
36 |   NCCL_PROFILE = 0x4000,
37 |   NCCL_RAS = 0x8000,
38 |   NCCL_ALL = ~0
39 | } ncclDebugLogSubSys;
40 | 
41 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
42 | 
43 | // NCCL core profiler callback for network defined events instrumentation
44 | enum {
45 |   ncclProfilerNetEventStart = 0,
46 |   ncclProfilerNetEventStop,
47 |   ncclProfilerNetEventUpdate,
48 |   ncclProfilerNetEventUpdateAndStop,
49 | };
50 | 
51 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
52 | 
53 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now: counts only ncclFuncBroadcast..ncclFuncAllReduce (values 0-4)
54 | typedef enum {
55 |   ncclFuncBroadcast = 0,
56 |   ncclFuncReduce = 1,
57 |   ncclFuncAllGather = 2,
58 |   ncclFuncReduceScatter = 3,
59 |   ncclFuncAllReduce = 4,
60 |   ncclFuncSendRecv = 5,
61 |   ncclFuncSend = 6,
62 |   ncclFuncRecv = 7,
63 |   ncclNumFuncs = 8 // number of enumerators above, including the Send/Recv entries that NCCL_NUM_FUNCTIONS leaves out
64 | } ncclFunc_t;
65 | 
66 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/NVLS*/PAT
67 | #define NCCL_ALGO_UNDEF -1
68 | #define NCCL_ALGO_TREE 0
69 | #define NCCL_ALGO_RING 1
70 | #define NCCL_ALGO_COLLNET_DIRECT 2
71 | #define NCCL_ALGO_COLLNET_CHAIN 3
72 | #define NCCL_ALGO_NVLS 4
73 | #define NCCL_ALGO_NVLS_TREE 5
74 | #define NCCL_ALGO_PAT 6
75 | 
76 | #define NCCL_NUM_PROTOCOLS 3 // LL/LL128/Simple (listed in index order)
77 | #define NCCL_PROTO_UNDEF -1
78 | #define NCCL_PROTO_LL 0
79 | #define NCCL_PROTO_LL128 1
80 | #define NCCL_PROTO_SIMPLE 2
81 | 
82 | #define NCCL_ALGO_PROTO_IGNORE -1.0
83 | #endif
84 | 


--------------------------------------------------------------------------------
/src/include/net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INT_NET_H_
 8 | #define NCCL_INT_NET_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_net.h"
12 | #include "comm.h"
13 | #include "checks.h"
14 | 
15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
16 | 
17 | ncclResult_t ncclNetInit(struct ncclComm* comm);
18 | ncclResult_t ncclNetFinalize(struct ncclComm* comm);
19 | 
20 | // Test whether the current GPU supports GPU Direct RDMA.
21 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
22 | 
23 | extern ncclNet_t ncclNetIb;
24 | extern ncclNet_t ncclNetSocket;
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/include/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_NET_DEVICE_H_
 8 | #define NCCL_NET_DEVICE_H_
 9 | 
10 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
11 | #define NCCL_NET_MTU_SIZE                    4096
12 | 
13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
16 | 
17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
18 | 
19 | typedef struct {
20 |   ncclNetDeviceType netDeviceType; // Network offload type
21 |   int netDeviceVersion;            // Version number for network offload
22 |   void* handle;
23 |   size_t size;
24 |   int needsProxyProgress;
25 | } ncclNetDeviceHandle_v7_t;
26 | // The v8, v9 and v10 handle types, and the unversioned ncclNetDeviceHandle_t, are all aliases of the v7 struct.
27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
30 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtSemanticsCounters.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2024  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /**
10 |  * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
11 |  */
12 | 
13 | #ifndef NVTX_SEMANTIC_ID_COUNTERS_V1
14 | #define NVTX_SEMANTIC_ID_COUNTERS_V1 2
15 | 
16 | /**
17 |  * Flags to extend the semantics of counters.
18 |  */
19 | #define NVTX_COUNTERS_FLAGS_NONE  0
20 | 
21 | /**
22 |  * Convert the fixed point value to a normalized floating point value.
23 |  * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type
24 |  * this flag is applied to.
25 |  */
26 | #define NVTX_COUNTERS_FLAG_NORMALIZE    (1 << 1)
27 | 
28 | /**
29 |  *  Visual tools should apply scale and limits when graphing.
30 |  */
31 | #define NVTX_COUNTERS_FLAG_LIMIT_MIN    (1 << 2)
32 | #define NVTX_COUNTERS_FLAG_LIMIT_MAX    (1 << 3)
33 | #define NVTX_COUNTERS_FLAG_LIMITS \
34 |     (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
35 | 
36 | /**
37 |  * Counter time scopes.
38 |  */
39 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT        (1 << 5)
40 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST   (2 << 5)
41 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT   (3 << 5)
42 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START  (4 << 5)
43 | 
44 | /**
45 |  * Counter value types.
46 |  */
47 | #define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10)
48 | /** Delta to previous value of same counter type. */
49 | #define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA    (2 << 10)
50 | 
51 | /**
52 |  * Datatypes for the `limits` union.
53 |  */
54 | #define NVTX_COUNTERS_LIMIT_I64 0
55 | #define NVTX_COUNTERS_LIMIT_U64 1
56 | #define NVTX_COUNTERS_LIMIT_F64 2
57 | 
58 | /**
59 |  *\brief Specify counter semantics.
60 |  */
61 | typedef struct nvtxSemanticsCounter_v1 {
62 |     /** Header of the semantic extensions (with identifier, version, etc.). */
63 |     struct nvtxSemanticsHeader_v1 header;
64 | 
65 |     /** Flags to provide more context about the counter value (`NVTX_COUNTERS_FLAG_*`). */
66 |     uint64_t flags;
67 | 
68 |     /** Unit of the counter value (case-insensitive). */
69 |     const char*  unit;
70 | 
71 |     /** Should be 1 if not used. */
72 |     uint64_t unitScaleNumerator;
73 | 
74 |     /** Should be 1 if not used. */
75 |     uint64_t unitScaleDenominator;
76 | 
77 |     /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
78 |     int64_t limitType;
79 | 
80 |     /** Graph limits {minimum, maximum}. */
81 |     union limits_t {
82 |         int64_t  i64[2];
83 |         uint64_t u64[2];
84 |         double   d[2];
85 |     } limits;
86 | } nvtxSemanticsCounter_t;
87 | 
88 | #endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtSemanticsScope.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2024  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /**
10 |  * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
11 |  */
12 | 
13 | #ifndef NVTX_SEMANTIC_ID_SCOPE_V1
14 | #define NVTX_SEMANTIC_ID_SCOPE_V1 1
15 | 
16 | /**
17 |  * \brief Specify the NVTX scope for a payload entry.
18 |  *
19 |  * This allows the scope to be set for a specific value or counter in a payload.
20 |  * The scope must be known at schema registration time.
21 |  */
22 | typedef struct nvtxSemanticsScope_v1
23 | {
24 |     struct nvtxSemanticsHeader_v1 header;
25 | 
26 |     /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */
27 |     uint64_t scopeId;
28 | } nvtxSemanticsScope_t;
29 | 
30 | #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2023  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_HELPER_MACROS_H
10 | #define NVTX_EXT_HELPER_MACROS_H
11 | 
12 | /* Combine tokens */
13 | #define _NVTX_EXT_CONCAT(a, b) a##b
14 | #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b)
15 | 
16 | /* Resolves to the number of arguments passed; correct for 1 to 15 arguments (it selects the 16th entry of the padded list). */
17 | #define NVTX_EXT_NUM_ARGS(...) \
18 |     NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway)
19 | #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16
20 | 
21 | /* Cast argument(s) to void to prevent unused variable warnings. */
22 | #define _NVTX_EXT_VOIDIFY1(a1) (void)a1;
23 | #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2;
24 | #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3;
25 | #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4;
26 | 
27 | /* Mark function arguments as unused. Accepts 1 to 4 arguments: only _NVTX_EXT_VOIDIFY1 .. _NVTX_EXT_VOIDIFY4 are defined. */
28 | #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \
29 |     NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
30 | 
31 | #endif /* NVTX_EXT_HELPER_MACROS_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_IMPL_GUARD
10 | #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | #ifndef NVTX_EXT_IMPL_H
14 | #define NVTX_EXT_IMPL_H
15 | /* ---- Include required platform headers ---- */
16 | 
17 | #include <stdlib.h>
18 | #include <stdio.h>
19 | #include <string.h>
20 | #include <wchar.h>
21 | 
22 | #if defined(_WIN32)
23 | 
24 | #include <Windows.h>
25 | 
26 | #else
27 | #include <unistd.h>
28 | 
29 | #if defined(__ANDROID__)
30 | #include <android/api-level.h>
31 | #endif
32 | 
33 | #if defined(__linux__) || defined(__CYGWIN__)
34 | #include <sched.h>
35 | #endif
36 | 
37 | #include <sys/types.h>
38 | #include <limits.h>
39 | #include <dlfcn.h>
40 | #include <fcntl.h>
41 | #include <errno.h>
42 | #include <pthread.h>
43 | 
44 | #endif
45 | 
46 | /* ---- Define macros used in this file ---- */
47 | /* NVTX_ERR / NVTX_INFO emit diagnostics only when NVTX_DEBUG_PRINT is defined, otherwise they expand to nothing. NOTE(review): the Android variants end in ';' while the stderr variants do not. */
48 | #ifdef NVTX_DEBUG_PRINT
49 | #ifdef __ANDROID__
50 | #include <android/log.h>
51 | #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
52 | #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
53 | #else
54 | #include <stdio.h>
55 | #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
56 | #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
57 | #endif
58 | #else /* !defined(NVTX_DEBUG_PRINT) */
59 | #define NVTX_ERR(...)
60 | #define NVTX_INFO(...)
61 | #endif
62 | 
63 | #ifdef __cplusplus
64 | extern "C" {
65 | #endif /* __cplusplus */
66 | /*
67 | #ifdef __GNUC__
68 | #pragma GCC visibility push(hidden)
69 | #endif
70 | */
71 | #define NVTX_EXTENSION_FRESH 0
72 | #define NVTX_EXTENSION_DISABLED 1
73 | #define NVTX_EXTENSION_STARTING 2
74 | #define NVTX_EXTENSION_LOADED 3
75 | 
76 | /* Function slots are local to each extension */
77 | typedef struct nvtxExtGlobals1_t
78 | {
79 |     NvtxExtInitializeInjectionFunc_t injectionFnPtr;
80 | } nvtxExtGlobals1_t;
81 | 
82 | NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
83 | {
84 |     (NvtxExtInitializeInjectionFunc_t)0
85 | };
86 | 
87 | #define NVTX_EXT_INIT_GUARD
88 | #include "nvtxExtInit.h"
89 | #undef NVTX_EXT_INIT_GUARD
90 | /*
91 | #ifdef __GNUC__
92 | #pragma GCC visibility pop
93 | #endif
94 | */
95 | #ifdef __cplusplus
96 | } /* extern "C" */
97 | #endif /* __cplusplus */
98 | 
99 | #endif /* NVTX_EXT_IMPL_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD
10 | #error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif /* __cplusplus */
16 | 
17 | #ifdef NVTX_DISABLE
18 | 
19 | #include "nvtxExtHelperMacros.h"
20 | /* NVTX_DISABLE variant: the generated function only voids its arguments and returns (ret_val)(intptr_t)-1. */
21 | #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
22 | ret_val fn_name signature { \
23 |     NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
24 |     return ((ret_val)(intptr_t)-1); \
25 | }
26 | 
27 | #else  /* NVTX_DISABLE */
28 | /* Enabled variant: read the function's slot in nvtxExtMemSlots; call through it if it holds a pointer; if it is still FRESH, run nvtxExtMemInitOnce() and re-read; otherwise fall through to NVTX_EXT_FN_RETURN_INVALID. */
29 | #define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
30 | typedef ret_type ( * fn_name##_impl_fntype )signature; \
31 |     NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
32 |     intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
33 |     if (slot != NVTX_EXTENSION_DISABLED) { \
34 |         if (slot != NVTX_EXTENSION_FRESH) { \
35 |             return (*(fn_name##_impl_fntype)slot) arg_names; \
36 |         } else { \
37 |             NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
38 |             /* Re-read function slot after extension initialization. */ \
39 |             slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
40 |             if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
41 |                 return (*(fn_name##_impl_fntype)slot) arg_names; \
42 |             } \
43 |         } \
44 |     } \
45 |     NVTX_EXT_FN_RETURN_INVALID(ret_type) \
46 | }
47 | 
48 | #endif /*NVTX_DISABLE*/
49 | 
50 | /* Non-void functions. */
51 | #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
52 | 
53 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain))
54 | 
55 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device))
56 | 
57 | #undef NVTX_EXT_FN_RETURN_INVALID
58 | /* END: Non-void functions. */
59 | 
60 | /* void functions: `return` is #defined to nothing and NVTX_EXT_FN_RETURN_INVALID is empty, so the NVTX_EXT_FN_IMPL body is valid for a void return type. */
61 | #define NVTX_EXT_FN_RETURN_INVALID(rtype)
62 | #define return
63 | 
64 | NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags))
65 | 
66 | #undef return
67 | #undef NVTX_EXT_FN_RETURN_INVALID
68 | /* END: void functions. */
69 | 
70 | #undef NVTX_EXT_FN_IMPL
71 | 
72 | #ifdef __cplusplus
73 | } /* extern "C" */
74 | #endif /* __cplusplus */
75 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2021  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /* This header defines types which are used by the internal implementation
10 | *  of NVTX and callback subscribers.  API clients do not use these types,
11 | *  so they are defined here instead of in nvToolsExt.h to clarify they are
12 | *  not part of the NVTX client API. */
13 | 
14 | #ifndef NVTXEXTTYPES_H
15 | #define NVTXEXTTYPES_H
16 | 
17 | #ifndef NVTX_EXT_TYPES_GUARD
18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
19 | #endif
20 | 
21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
22 | /* One segment of an extension module: an id and a pointer to its function slots (slotCount is presumably the number of entries -- TODO confirm). */
23 | typedef struct nvtxExtModuleSegment_t
24 | {
25 |     size_t segmentId;
26 |     size_t slotCount;
27 |     intptr_t* functionSlots;
28 | } nvtxExtModuleSegment_t;
29 | /* Module descriptor passed to NvtxExtInitializeInjectionFunc_t: version/size/id fields, the segment array, an export-function lookup callback and an opaque extInfo pointer. */
30 | typedef struct nvtxExtModuleInfo_t
31 | {
32 |     uint16_t nvtxVer;
33 |     uint16_t structSize;
34 |     uint16_t moduleId;
35 |     uint16_t compatId;
36 |     size_t segmentsCount;
37 |     nvtxExtModuleSegment_t* segments;
38 |     NvtxExtGetExportFunction_t getExportFunction;
39 |     const void* extInfo;
40 | } nvtxExtModuleInfo_t;
41 | 
42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
43 | 
44 | #endif /* NVTXEXTTYPES_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_IMPL_GUARD_CUDART
10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif /* __cplusplus */
16 | /* Pointer types for the implementation callbacks; the wrappers in this file cast the nvtxGlobals *_impl_fnptr members to these. */
17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
23 | /* Calls nvtxGlobals' nvtxNameCudaDeviceA_impl_fnptr with (device, name) when it is non-null; otherwise does nothing. The body is empty when NVTX_DISABLE is defined. */
24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
25 | {
26 | #ifndef NVTX_DISABLE
27 |     nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
28 |     if(local!=0)
29 |         (*local)(device, name);
30 | #endif /*NVTX_DISABLE*/
31 | }
32 | /* Wide-character variant: calls nvtxNameCudaDeviceW_impl_fnptr with (device, name) when it is non-null; otherwise does nothing. */
33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
34 | {
35 | #ifndef NVTX_DISABLE
36 |     nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
37 |     if(local!=0)
38 |         (*local)(device, name);
39 | #endif /*NVTX_DISABLE*/
40 | }
41 | /* Calls nvtxGlobals' nvtxNameCudaStreamA_impl_fnptr with (stream, name) when it is non-null; otherwise does nothing. The body is empty when NVTX_DISABLE is defined. */
42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
43 | {
44 | #ifndef NVTX_DISABLE
45 |     nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
46 |     if(local!=0)
47 |         (*local)(stream, name);
48 | #endif /*NVTX_DISABLE*/
49 | }
50 | /* Wide-character variant: calls nvtxNameCudaStreamW_impl_fnptr with (stream, name) when it is non-null; otherwise does nothing. */
51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
52 | {
53 | #ifndef NVTX_DISABLE
54 |     nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
55 |     if(local!=0)
56 |         (*local)(stream, name);
57 | #endif /*NVTX_DISABLE*/
58 | }
59 | /* Calls nvtxGlobals' nvtxNameCudaEventA_impl_fnptr with (event, name) when it is non-null; otherwise does nothing. The body is empty when NVTX_DISABLE is defined. */
60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
61 | {
62 | #ifndef NVTX_DISABLE
63 |     nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
64 |     if(local!=0)
65 |         (*local)(event, name);
66 | #endif /*NVTX_DISABLE*/
67 | }
68 | /* Wide-character variant: calls nvtxNameCudaEventW_impl_fnptr with (event, name) when it is non-null; otherwise does nothing. */
69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
70 | {
71 | #ifndef NVTX_DISABLE
72 |     nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
73 |     if(local!=0)
74 |         (*local)(event, name);
75 | #endif /*NVTX_DISABLE*/
76 | }
77 | 
78 | #ifdef __cplusplus
79 | } /* extern "C" */
80 | #endif /* __cplusplus */
81 | 
82 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_IMPL_GUARD_SYNC
10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | 
14 | #ifdef __cplusplus
15 | extern "C" {
16 | #endif /* __cplusplus */
17 | /* Pointer types for the implementation callbacks; the wrappers in this file cast the nvtxGlobals *_impl_fnptr members to these. */
18 | typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
19 | typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
20 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
21 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
22 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
23 | typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
24 | /* Returns the result of nvtxDomainSyncUserCreate_impl_fnptr(domain, attribs) when that pointer is non-null; returns (nvtxSyncUser_t)0 otherwise or when NVTX_DISABLE is defined. */
25 | NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
26 | {
27 | #ifndef NVTX_DISABLE
28 |     nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
29 |     if(local!=0)
30 |         return (*local)(domain, attribs);
31 |     else /* binds to the return after the #endif, which is the only statement left when NVTX_DISABLE is defined */
32 | #endif  /*NVTX_DISABLE*/
33 |         return (nvtxSyncUser_t)0;
34 | }
35 | /* Calls nvtxGlobals' nvtxDomainSyncUserDestroy_impl_fnptr with handle when it is non-null; otherwise does nothing. The body is empty when NVTX_DISABLE is defined. */
36 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
37 | {
38 | #ifndef NVTX_DISABLE
39 |     nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
40 |     if(local!=0)
41 |         (*local)(handle);
42 | #endif /*NVTX_DISABLE*/
43 | }
44 | /* Calls nvtxGlobals' nvtxDomainSyncUserAcquireStart_impl_fnptr with handle when it is non-null; otherwise does nothing. */
45 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
46 | {
47 | #ifndef NVTX_DISABLE
48 |     nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
49 |     if(local!=0)
50 |         (*local)(handle);
51 | #endif /*NVTX_DISABLE*/
52 | }
53 | /* Calls nvtxGlobals' nvtxDomainSyncUserAcquireFailed_impl_fnptr with handle when it is non-null; otherwise does nothing. */
54 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
55 | {
56 | #ifndef NVTX_DISABLE
57 |     nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
58 |     if(local!=0)
59 |         (*local)(handle);
60 | #endif /*NVTX_DISABLE*/
61 | }
62 | /* Calls nvtxGlobals' nvtxDomainSyncUserAcquireSuccess_impl_fnptr with handle when it is non-null; otherwise does nothing. */
63 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
64 | {
65 | #ifndef NVTX_DISABLE
66 |     nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
67 |     if(local!=0)
68 |         (*local)(handle);
69 | #endif /*NVTX_DISABLE*/
70 | }
71 | /* Calls nvtxGlobals' nvtxDomainSyncUserReleasing_impl_fnptr with handle when it is non-null; otherwise does nothing. */
72 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
73 | {
74 | #ifndef NVTX_DISABLE
75 |     nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
76 |     if(local!=0)
77 |         (*local)(handle);
78 | #endif /*NVTX_DISABLE*/
79 | }
80 | 
81 | #ifdef __cplusplus
82 | } /* extern "C" */
83 | #endif /* __cplusplus */
84 | 


--------------------------------------------------------------------------------
/src/include/p2p.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <stdlib.h>
 8 | 
 9 | #ifndef NCCL_P2P_H_
10 | #define NCCL_P2P_H_
11 | 
12 | #include <cuda.h>
13 | #include <cuda_runtime.h>
14 | 
15 | #include "core.h"
16 | // Fallback definitions used only when CUDART_VERSION < 12030; they supply the fabric-handle names used by ncclCuDesc.
17 | #if CUDART_VERSION < 12030
18 | // MNNVL: FABRIC handle support lifted from CUDA 12.3
19 | #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128)
20 | #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
21 | #define CU_IPC_HANDLE_SIZE 64
22 | typedef struct CUmemFabricHandle_st {
23 |     unsigned char data[CU_IPC_HANDLE_SIZE];
24 | } CUmemFabricHandle_v1;
25 | typedef CUmemFabricHandle_v1 CUmemFabricHandle;
26 | #endif
27 | // cuMem handle descriptor: a 64-bit value and a CUmemFabricHandle sharing the same storage.
28 | typedef union {
29 |   uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
30 |   CUmemFabricHandle handle;
31 | } ncclCuDesc;
32 | // IPC descriptor: the legacy CUDA IPC handle overlaid with the cuMem pair (descriptor + allocation handle).
33 | typedef union {
34 |   // Legacy CUDA IPC
35 |   cudaIpcMemHandle_t devIpc;
36 |   // cuMem API support
37 |   struct {
38 |     ncclCuDesc cuDesc;
39 |     CUmemGenericAllocationHandle memHandle;
40 |   };
41 | } ncclIpcDesc;
42 | // Registration kind passed as `type` to ncclIpcLocalRegisterBuffer / ncclIpcGraphRegisterBuffer.
43 | enum ncclIpcRegType {
44 |   NCCL_IPC_SENDRECV = 0,
45 |   NCCL_IPC_COLLECTIVE = 1
46 | };
47 | // Embedded in ncclIpcRegInfo as impInfo. NOTE(review): field semantics are not visible in this header -- confirm against the implementation.
48 | struct ncclIpcImpInfo {
49 |   void* rmtRegAddr;
50 |   bool legacyIpcCap;
51 |   uintptr_t offset;
52 | };
53 | // IPC registration record for one peer rank; this is the type ncclIpcDeregBuffer takes.
54 | struct ncclIpcRegInfo {
55 |   int peerRank;
56 |   void* baseAddr;
57 |   struct ncclProxyConnector* ipcProxyconn;
58 |   struct ncclIpcImpInfo impInfo;
59 | };
60 | // Shareable-buffer allocate/free/import and IPC buffer (de)registration entry points; declarations only, all return ncclResult_t.
61 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
62 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
63 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
64 | ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
65 | ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
66 | 
67 | ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/src/include/param.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_PARAM_H_
 8 | #define NCCL_PARAM_H_
 9 | 
10 | #include <stdint.h>
11 | // Environment and parameter helpers; declarations only.
12 | const char* userHomeDir();
13 | void setEnvFile(const char* fileName);
14 | void initEnv();
15 | const char *ncclGetEnv(const char *name);
16 | // NCCL_PARAM calls this with the "NCCL_"-prefixed name, the default, the sentinel value and the address of its static cache.
17 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
18 | // NCCL_PARAM(name, env, deftVal): defines int64_t ncclParam<name>(). While its static cache still holds INT64_MIN it calls ncclLoadParam("NCCL_" env, ...) with &cache, then returns the cache.
19 | #define NCCL_PARAM(name, env, deftVal) \
20 |   int64_t ncclParam##name() { \
21 |     constexpr int64_t uninitialized = INT64_MIN; \
22 |     static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
23 |     static int64_t cache = uninitialized; \
24 |     if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
25 |       ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \
26 |     } \
27 |     return __atomic_load_n(&cache, __ATOMIC_RELAXED); /* atomic like the check above, so the read cannot race with a store made through &cache */ \
28 |   }
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/include/plugin/nccl_net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_NET_H_
 8 | #define NCCL_NET_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_common.h"
12 | #include "net_device.h"
13 | #include <stdint.h>
14 | 
15 | #define NCCL_NET_HANDLE_MAXSIZE 128
16 | //Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties (1 TiB; only the last factor is long, so this relies on a 64-bit long)
17 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
18 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
19 | 
20 | #define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
21 | #define MAX_COLLNET_SIZE (512*1024*1024L) // Set for initial collnet plugins when size was not dynamically queried
22 | // Pointer type flags; each is a distinct bit.
23 | #define NCCL_PTR_HOST 0x1
24 | #define NCCL_PTR_CUDA 0x2
25 | #define NCCL_PTR_DMABUF 0x4
26 | 
27 | // Maximum number of requests per comm object
28 | #define NCCL_NET_MAX_REQUESTS 32
29 | 
30 | // Max number of ncclNet objects which can live in the same process (can be overridden by defining it before this point)
31 | #ifndef NCCL_NET_MAX_PLUGINS
32 | #define NCCL_NET_MAX_PLUGINS 16
33 | #endif
34 | 
35 | #include "net/net_v10.h"
36 | #include "net/net_v9.h"
37 | #include "net/net_v8.h"
38 | #include "net/net_v7.h"
39 | #include "net/net_v6.h"
40 | // Unversioned names: every alias and macro here maps to the v10 net plugin API.
41 | typedef ncclNet_v10_t ncclNet_t;
42 | typedef ncclCollNet_v10_t ncclCollNet_t;
43 | typedef ncclNetSGE_v10_t ncclNetSGE_t;
44 | typedef ncclNetProperties_v10_t ncclNetProperties_t;
45 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
46 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
47 | 
48 | #define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
49 | 
50 | #define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10
51 | #define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10
52 | 
53 | #endif // end include guard
54 | 


--------------------------------------------------------------------------------
/src/include/plugin/nccl_profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_PROFILER_H_
 8 | #define NCCL_PROFILER_H_
 9 | // Event types: each enumerator is a distinct single bit, so several can be combined in one mask.
10 | enum {
11 |   ncclProfileGroup     = (1 << 0),  // group event type
12 |   ncclProfileColl      = (1 << 1),  // host collective call event type
13 |   ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
14 |   ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
15 |   ncclProfileProxyStep = (1 << 4),  // proxy step event type
16 |   ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
17 |   ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
18 |   ncclProfileNetPlugin = (1 << 7),  // network plugin-defined events
19 | };
20 | // Event states. Values are assigned explicitly and do not follow declaration order (19 and 20 are the _v4 entries).
21 | typedef enum {
22 |   ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
23 |   ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
24 |   ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
25 |   ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
26 |   ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
27 |   ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
28 |   ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
29 |   ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
30 |   ncclProfilerProxyOpInProgress_v4     = 19,
31 | 
32 |   /* Legacy proxy profiler states */
33 |   ncclProfilerProxyStepSendGPUWait     = 8,
34 |   ncclProfilerProxyStepSendPeerWait_v4 = 20,
35 |   ncclProfilerProxyStepSendWait        = 9,
36 |   ncclProfilerProxyStepRecvWait        = 10,
37 |   ncclProfilerProxyStepRecvFlushWait   = 11,
38 |   ncclProfilerProxyStepRecvGPUWait     = 12,
39 | 
40 |   /* Legacy proxy control states */
41 |   ncclProfilerProxyCtrlIdle            = 13,
42 |   ncclProfilerProxyCtrlActive          = 14,
43 |   ncclProfilerProxyCtrlSleep           = 15,
44 |   ncclProfilerProxyCtrlWakeup          = 16,
45 |   ncclProfilerProxyCtrlAppend          = 17,
46 |   ncclProfilerProxyCtrlAppendEnd       = 18,
47 | 
48 |   /* Network defined event states */
49 |   ncclProfilerNetPluginUpdate          = 21,
50 | 
51 |   /* Kernel event states */
52 |   ncclProfilerKernelChStop             = 22,
53 | } ncclProfilerEventState_t;
54 | // Every versioned state type is an alias of the same enum.
55 | typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
56 | typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
57 | typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
58 | typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
59 | 
60 | #include <cstdint>
61 | #include "profiler/profiler_v4.h"
62 | #include "profiler/profiler_v3.h"
63 | #include "profiler/profiler_v2.h"
64 | #include "profiler/profiler_v1.h"
65 | // Unversioned names: all three alias the v4 profiler types.
66 | typedef ncclProfiler_v4_t ncclProfiler_t;
67 | typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
68 | typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
69 | // Identifier layout: the low NCCL_PROFILER_NET_VER_BITS bits are covered by the version mask, the bits above by the type mask.
70 | #define NCCL_PROFILER_NET_VER_BITS  (16)
71 | #define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)  // 0x0000FFFF when unsigned is 32 bits
72 | #define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)  // 0xFFFF0000 when unsigned is 32 bits
73 | // Type identifiers are placed in the bits above NCCL_PROFILER_NET_VER_BITS.
74 | typedef enum {
75 |   NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),
76 |   NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
77 | } ncclProfilerNetType;
78 | 
79 | #endif
80 | 


--------------------------------------------------------------------------------
/src/include/plugin/nccl_tuner.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_TUNER_H_
 9 | #define NCCL_TUNER_H_
10 | 
11 | #include "nccl.h"
12 | #include "nccl_common.h"
13 | 
14 | #include "tuner/tuner_v4.h"
15 | #include "tuner/tuner_v3.h"
16 | #include "tuner/tuner_v2.h"
17 | // Unversioned name: aliases the v4 tuner type.
18 | typedef ncclTuner_v4_t ncclTuner_t;
19 | // Note: this symbol macro expands to a string literal, whereas the net plugin symbol macros in nccl_net.h expand to bare identifiers.
20 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/src/include/plugin/plugin.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_PLUGIN_H_
 8 | #define NCCL_PLUGIN_H_
 9 | 
10 | #include "nccl.h"
11 | // Plugin library helpers: the open/get functions return a void* that ncclClosePluginLib accepts as `handle` (presumably a dynamic-library handle -- confirm).
12 | void* ncclOpenNetPluginLib(const char* name);
13 | void* ncclOpenTunerPluginLib(const char* name);
14 | void* ncclOpenProfilerPluginLib(const char* name);
15 | void* ncclGetNetPluginLib(void);
16 | ncclResult_t ncclClosePluginLib(void* handle);
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/include/plugin/profiler/net_ib.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_IB_H_
 8 | #define NET_IB_H_
 9 | 
10 | #include "nccl_profiler.h"
11 | #include "net_ib_v1.h"
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/include/plugin/profiler/net_ib_v1.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_IB_V1_H_
 8 | #define NET_IB_V1_H_
 9 | 
10 | #define NCCL_PROFILER_NET_IB_VER 1
11 | // Plugin-defined event type bit; presumably the `type` value that selects the `qp` member of ncclProfilerNetIbDescr_v1_t -- confirm.
12 | enum {
13 |   ncclProfileQp = (1 << 0),
14 | };
15 | 
16 | // The data structure version is encoded in the plugin identifier bitmask and
17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin
18 | // identifier into the event descriptor before calling the profiler startEvent
19 | // function. The profiler should inspect the plugin id to find out the source
20 | // plugin as well as the version of the event struct.
21 | typedef struct {
22 |   uint8_t type;        // event type (plugin defined)
23 |   union {
24 |     struct {
25 |       int device;      // network device id
26 |       uint64_t wr_id;  // work request id
27 |       int opcode;      // ibv opcode
28 |       int qpNum;       // QP number
29 |       size_t length;   // work request data length
30 |     } qp;
31 |   };
32 | } ncclProfilerNetIbDescr_v1_t;
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/include/plugin/profiler/net_socket.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_SOCKET_H_
 8 | #define NET_SOCKET_H_
 9 | 
10 | #include "nccl_profiler.h"
11 | #include "net_socket_v1.h"
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/include/plugin/profiler/net_socket_v1.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_SOCKET_V1_H_
 8 | #define NET_SOCKET_V1_H_
 9 | 
10 | #define NCCL_PROFILER_NET_SOCKET_VER 1
11 | // Plugin-defined event type bit; presumably the `type` value that selects the `sock` member of ncclProfilerNetSockDescr_v1_t -- confirm.
12 | enum {
13 |   ncclProfileSocket = (1 << 0),
14 | };
15 | 
16 | // The data structure version is encoded in the plugin identifier bitmask and
17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin
18 | // identifier into the event descriptor before calling the profiler startEvent
19 | // function. The profiler should inspect the plugin id to find out the source
20 | // plugin as well as the version of the event struct.
21 | typedef struct {
22 |   uint8_t type;        // event type (plugin defined)
23 |   union {
24 |     struct {
25 |       int fd;          // presumably the socket file descriptor -- confirm
26 |       int op;          // operation identifier; its encoding is not defined in this header
27 |       size_t length;   // presumably the data length of the operation -- confirm
28 |     } sock;
29 |   };
30 | } ncclProfilerNetSockDescr_v1_t;
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/src/include/plugin/profiler/profiler_v1.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #ifndef PROFILER_V1_H_
  8 | #define PROFILER_V1_H_
  9 | // v1 event descriptor: common fields (type, parentObj, rank) followed by an anonymous union with one member per event kind.
 10 | typedef struct {
 11 |   uint8_t type;                 // event type descriptor: ncclProfileColl, ...
 12 |   void* parentObj;              // pointer to the profiler parent object (for coll is the group)
 13 |   int rank;                     // originating rank
 14 |   union {
 15 |     struct {
 16 |       const char* name;
 17 |       uint64_t commHash;
 18 |       uint64_t seqNumber;
 19 |       uint8_t func;
 20 |       void const* sendBuff;
 21 |       void* recvBuff;
 22 |       size_t count;
 23 |       int root;
 24 |       uint8_t datatype;
 25 |       uint32_t op;
 26 |       size_t trafficBytes;
 27 |       uint8_t nMaxChannels;
 28 |       uint8_t nWarps;
 29 |       uint8_t algo;
 30 |       uint8_t proto;
 31 |       int isCollnet;
 32 |       int isNvls;
 33 |     } coll;
 34 | 
 35 |     struct {
 36 |       const char* name;
 37 |       uint64_t commHash;
 38 |       uint8_t func;
 39 |       void* buff;
 40 |       uint8_t datatype;
 41 |       size_t count;
 42 |       int peer;
 43 |     } p2p;
 44 | 
 45 |     struct {
 46 |       pid_t pid;                // pid of the originating process
 47 |       uint8_t channelId;        // channel id for this proxy operation
 48 |       int peer;                 // remote rank for send/recv
 49 |       int nSteps;               // number of steps for this proxy operation
 50 |       int chunkSize;            // amount of data transferred by this proxy operation
 51 |       int isSend;
 52 |     } proxyOp;
 53 | 
 54 |     struct {
 55 |       int step;
 56 |     } proxyStep;
 57 |   };
 58 | } ncclProfilerEventDescr_v1_t;
 59 | // v1 argument type for recordEventState (eStateArgs): a union with a proxyOp member and a proxyCtrl member.
 60 | typedef union {
 61 |   struct {
 62 |     size_t transSize;
 63 |     int steps;
 64 |   } proxyOp;
 65 | 
 66 |   struct {
 67 |     int appendedProxyOps;
 68 |   } proxyCtrl;
 69 | } ncclProfilerEventStateArgs_v1_t;
 70 | // v1 profiler plugin interface: a name plus five callbacks, each returning ncclResult_t.
 71 | typedef struct {
 72 |   const char* name;
 73 | 
 74 |   // init - initialize the profiler plugin
 75 |   // Input
 76 |   //  - context        : opaque profiler context object for separating profiler behavior across comms
 77 |   // Output
 78 |   //  - eActivationMask: bitmask of active events set by the plugin
 79 |   ncclResult_t (*init)(void** context, int* eActivationMask);
 80 | 
 81 |   // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
 82 |   // Input
 83 |   //  - context: opaque profiler context object
 84 |   //  - eDescr : pointer to ncclProfilerEventDescr_v1_t object
 85 |   // Output
 86 |   //  - eHandle: return event handle for supplied event descriptor object
 87 |   ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
 88 | 
 89 |   // stopEvent - stop/finalize an event inside an event set
 90 |   // Input
 91 |   //  - eHandle: handle to event object
 92 |   ncclResult_t (*stopEvent)(void* eHandle);
 93 | 
 94 |   // recordEventState - record event state transitions and event attribute updates
 95 |   // Input
 96 |   //  - eHandle   : handle to event object created through startEvent
 97 |   //  - eState    : event state transition
 98 |   //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
 99 |   ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
100 | 
101 |   // finalize - finalize the profiler plugin
102 |   // Input
103 |   //  - context: opaque profiler context object
104 |   ncclResult_t (*finalize)(void* context);
105 | } ncclProfiler_v1_t;
106 | 
107 | #endif
108 | 


--------------------------------------------------------------------------------
/src/include/plugin/profiler/profiler_v2.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #ifndef PROFILER_V2_H_
  8 | #define PROFILER_V2_H_
  9 | // v2 event descriptor: same layout idea as v1, but func, datatype, algo and proto are C strings in this version.
 10 | typedef struct {
 11 |   uint8_t type;                 // event type descriptor: ncclProfileColl, ...
 12 |   void* parentObj;              // pointer to the profiler parent object (for coll is the group)
 13 |   int rank;                     // originating rank
 14 |   union {
 15 |     struct {
 16 |       const char* name;
 17 |       uint64_t commHash;
 18 |       uint64_t seqNumber;
 19 |       const char* func;
 20 |       void const* sendBuff;
 21 |       void* recvBuff;
 22 |       size_t count;
 23 |       int root;
 24 |       const char* datatype;
 25 |       size_t trafficBytes;
 26 |       uint8_t nMaxChannels;
 27 |       uint8_t nWarps;
 28 |       const char* algo;
 29 |       const char* proto;
 30 |     } coll;
 31 | 
 32 |     struct {
 33 |       const char* name;
 34 |       uint64_t commHash;
 35 |       const char* func;
 36 |       void* buff;
 37 |       const char* datatype;
 38 |       size_t count;
 39 |       int peer;
 40 |     } p2p;
 41 | 
 42 |     struct {
 43 |       pid_t pid;                // pid of the originating process
 44 |       uint8_t channelId;        // channel id for this proxy operation
 45 |       int peer;                 // remote rank for send/recv
 46 |       int nSteps;               // number of steps for this proxy operation
 47 |       int chunkSize;            // amount of data transferred by this proxy operation
 48 |       int isSend;
 49 |     } proxyOp;
 50 | 
 51 |     struct {
 52 |       int step;
 53 |     } proxyStep;
 54 |   };
 55 | } ncclProfilerEventDescr_v2_t;
 56 | // v2 argument type for recordEventState (eStateArgs): a union with a proxyOp member and a proxyCtrl member.
 57 | typedef union {
 58 |   struct {
 59 |     size_t transSize;
 60 |     int steps;
 61 |   } proxyOp;
 62 | 
 63 |   struct {
 64 |     int appendedProxyOps;
 65 |   } proxyCtrl;
 66 | } ncclProfilerEventStateArgs_v2_t;
 67 | // v2 profiler plugin interface: a name plus five callbacks, each returning ncclResult_t.
 68 | typedef struct {
 69 |   const char* name;
 70 | 
 71 |   // init - initialize the profiler plugin
 72 |   // Input
 73 |   //  - context        : opaque profiler context object for separating profiler behavior across comms
 74 |   // Output
 75 |   //  - eActivationMask: bitmask of active events set by the plugin
 76 |   ncclResult_t (*init)(void** context, int* eActivationMask);
 77 | 
 78 |   // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
 79 |   // Input
 80 |   //  - context: opaque profiler context object
 81 |   //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
 82 |   // Output
 83 |   //  - eHandle: return event handle for supplied event descriptor object
 84 |   ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
 85 | 
 86 |   // stopEvent - stop/finalize an event inside an event set
 87 |   // Input
 88 |   //  - eHandle: handle to event object
 89 |   ncclResult_t (*stopEvent)(void* eHandle);
 90 | 
 91 |   // recordEventState - record event state transitions and event attribute updates
 92 |   // Input
 93 |   //  - eHandle   : handle to event object created through startEvent
 94 |   //  - eState    : event state transition
 95 |   //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
 96 |   ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
 97 | 
 98 |   // finalize - finalize the profiler plugin
 99 |   // Input
100 |   //  - context: opaque profiler context object
101 |   ncclResult_t (*finalize)(void* context);
102 | } ncclProfiler_v2_t;
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/src/include/plugin/tuner/tuner_v2.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef TUNER_V2_H_
 9 | #define TUNER_V2_H_
10 | 
11 | // API to be implemented by external tuner
12 | typedef struct {
13 |   // Name of the tuner
14 |   const char* name;
15 | 
16 |   // Initializes tuner states.
17 |   // Inputs:
18 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
19 |   //   - nNodes: number of nodes in current communicator.
20 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
21 |   // Outputs:
22 |   //   - context: tuner context object
23 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
24 | 
25 |   // Gets info (algorithm, protocol, number of channels) for a given collective.
26 |   // Inputs:
27 |   //   - context: tuner context object
28 |   //   - collType: collective type, e.g., allreduce, allgather, ...
29 |   //   - nBytes: collective size in bytes
30 |   //   - collNetSupport: whether collnet supports this type
31 |   //   - nvlsSupport: whether NVLink SHARP supports this type
32 |   //   - numPipeOps: number of operations in the group
33 |   //
34 |   // Outputs:
35 |   //   - algorithm: selected algorithm to be used for the given collective
36 |   //   - protocol: selected protocol to be used for the given collective
37 |   //   - nChannels: number of channels (hence SMs) to be used.
38 |   //
39 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
40 |   // default tuning for the given collective.
41 |   // Also, the plugin is allowed to not set any output, or set only the
42 |   // algorithm and protocol, but not only the algorithm or only the protocol.
43 |   // Unset fields will be set automatically by NCCL.
44 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
45 |                               int collNetSupport, int nvlsSupport, int numPipeOps,
46 |                               int* algorithm, int* protocol, int* nChannels);
47 | 
48 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
49 |   // context: tuner context object
50 |   ncclResult_t (*destroy)(void* context);
51 | } ncclTuner_v2_t;
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/src/include/plugin/tuner/tuner_v3.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef TUNER_V3_H_
 9 | #define TUNER_V3_H_
10 | 
11 | // API to be implemented by external tuner
12 | typedef struct {
13 |   // Name of the tuner
14 |   const char* name;
15 | 
16 |   // Initializes tuner states.
17 |   // Inputs:
18 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
19 |   //   - nNodes: number of nodes in current communicator.
20 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
21 |   // Outputs:
22 |   //   - context: tuner context object
23 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
24 | 
25 |   // Gets info (algo, protocol, number of ctas and threads) for a given collective.
26 |   // Inputs:
27 |   //   - context: tuner context object
28 |   //   - collType: collective type, e.g., allreduce, allgather…
29 |   //   - nBytes: collective size in bytes
30 |   //   - numPipeOps: number of operations in the group
31 |   //   - numAlgo: number of algorithms in collCostTable
32 |   //   - numProto: number of protocols in collCostTable
33 |   //
34 |   // Outputs:
35 |   //   - nChannels: number of channels (hence SMs) to be used.
36 |   //
37 |   // InOut:
38 |   //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
39 |   //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
40 |   //
41 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
42 |   // default tuning for the given collective.
43 |   // Also, the plugin is allowed to not set any output, or set only the
44 |   // algorithm and protocol, but not only the algorithm or only the protocol.
45 |   // Unset fields will be set automatically by NCCL. NOTE(review): this version has no separate algorithm/protocol outputs (see the signature below); confirm whether the sentence above still applies.
46 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
47 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
48 |                               int* nChannels);
49 | 
50 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
51 |   // context: tuner context object
52 |   ncclResult_t (*destroy)(void* context);
53 | } ncclTuner_v3_t;
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/src/include/plugin/tuner/tuner_v4.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef TUNER_V4_H_
 9 | #define TUNER_V4_H_
10 | 
11 | // API to be implemented by external tuner
12 | typedef struct {
13 |   // Name of the tuner
14 |   const char* name;
15 | 
16 |   // Initializes tuner states.
17 |   // Inputs:
18 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
19 |   //   - nNodes: number of nodes in current communicator.
20 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
21 |   // Outputs:
22 |   //   - context: tuner context object
23 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
24 | 
25 |   // Gets info (algo, protocol, number of ctas and threads) for a given collective.
26 |   // Inputs:
27 |   //   - context: tuner context object
28 |   //   - collType: collective type, e.g., allreduce, allgather…
29 |   //   - nBytes: collective size in bytes
30 |   //   - numPipeOps: number of operations in the group
31 |   //   - numAlgo: number of algorithms in collCostTable
32 |   //   - numProto: number of protocols in collCostTable
33 |   //   - regBuff: can register user buffer
34 |   //
35 |   // Outputs:
36 |   //   - nChannels: number of channels (hence SMs) to be used.
37 |   //
38 |   // InOut:
39 |   //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
40 |   //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
41 |   //
42 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
43 |   // default tuning for the given collective.
44 |   // Also, the plugin is allowed to not set any output, or set only the
45 |   // algorithm and protocol, but not only the algorithm or only the protocol.
46 |   // Unset fields will be set automatically by NCCL. NOTE(review): this version has no separate algorithm/protocol outputs (see the signature below); confirm whether the sentence above still applies.
47 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
48 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
49 |                               int regBuff, int* nChannels);
50 | 
51 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
52 |   // context: tuner context object
53 |   ncclResult_t (*destroy)(void* context);
54 | } ncclTuner_v4_t;
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/src/include/profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PROFILER_H_
 8 | #define PROFILER_H_
 9 | 
10 | #include <cuda_runtime.h>
11 | #include "nccl_profiler.h"
12 | 
13 | struct ncclProxyArgs;
14 | struct ncclKernelPlan;
15 | struct ncclTaskColl;
16 | struct ncclTaskP2p;
17 | struct ncclInfo;
18 | struct ncclComm;
19 | struct ncclProxyOp;
20 | struct ncclProxyConnector;
21 | 
22 | struct ncclProfilerProxy {
23 |   bool initialized;
24 |   struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
25 |   struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
26 |   uint64_t workCounter[MAXCHANNELS]; // host work counter
27 |   struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
28 |   struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
29 | };
30 | 
31 | extern int ncclProfilerEventMask;
32 | 
33 | // Plugin Init/Finalize Wrappers
34 | ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
35 | ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
36 | 
37 | // Profiler Start/Stop Group Wrappers
38 | ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
39 | ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
40 | 
41 | // Profiler Start/Stop Task Events Wrappers
42 | ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
43 | ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
44 | 
45 | // Proxy Op Start/Stop Event Wrappers
46 | ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args);
47 | ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
48 | 
49 | // Proxy Step Start/Stop Event Wrappers
50 | ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
51 | ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
52 | ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
53 | 
54 | // Proxy Control Start/Stop Events Wrappers
55 | ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
56 | ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
57 | 
58 | // Kernel Channel Start/Stop Event Wrappers
59 | ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start);
60 | ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop);
61 | 
62 | // Record Event Wrappers
63 | ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState);
64 | ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
65 | ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
66 | 
67 | // Profiler utility functions
68 | ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
69 | bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op);
70 | bool ncclProfilerPluginLoaded(void);
71 | 
72 | // Profiler callback for network plugin
73 | ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/src/include/ras.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_RAS_H_
 8 | #define NCCL_RAS_H_
 9 | 
10 | #include "socket.h"
11 | 
12 | // Structure used to communicate data about NCCL ranks from NCCL threads to RAS.
13 | struct rasRankInit {
14 |   union ncclSocketAddress addr;
15 |   pid_t pid;
16 |   int cudaDev;
17 |   int nvmlDev;
18 |   uint64_t hostHash;
19 |   uint64_t pidHash;
20 | };
21 | 
22 | ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank);
23 | ncclResult_t ncclRasCommFini(const struct ncclComm* comm);
24 | ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks);
25 | 
26 | #endif // !NCCL_RAS_H_
27 | 


--------------------------------------------------------------------------------
/src/include/register.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_REGISTER_H_
 2 | #define NCCL_REGISTER_H_
 3 | 
 4 | #include "device.h"
 5 | 
 6 | #include <cuda.h>
 7 | #include <stdint.h>
 8 | 
 9 | int64_t ncclParamLocalRegister();
10 | int64_t ncclParamGraphRegister();
11 | 
12 | enum {
13 |   NET_REG_COMPLETE = 0x01,
14 |   NVLS_REG_COMPLETE = 0x02,
15 |   NVLS_REG_POSSIBLE = 0x04,
16 |   NVLS_REG_NO_SUPPORT = 0x08,
17 |   COLLNET_REG_COMPLETE = 0x10,
18 |   IPC_REG_COMPLETE = 0x20
19 | };
20 | 
21 | struct ncclPeerRegIpcAddr {
22 |   uintptr_t* devPeerRmtAddrs;
23 |   uintptr_t* hostPeerRmtAddrs;
24 | };
25 | 
26 | struct ncclRegNetHandles {
27 |   void* handle;
28 |   struct ncclProxyConnector* proxyConn;
29 |   struct ncclRegNetHandles* next;
30 | };
31 | 
32 | struct ncclSymRegTask {
33 |   struct ncclSymRegTask *next;
34 |   void* buff;
35 |   size_t baseSize;
36 |   CUmemGenericAllocationHandle memHandle;
37 |   struct ncclReg* regHandle;
38 |   size_t alignment;
39 | };
40 | 
41 | struct ncclReg {
42 |   // common attributes
43 |   uintptr_t begAddr, endAddr; // page aligned; ncclRegFind() matches a buffer when begAddr <= addr and addr+size <= endAddr
44 |   int localRefs;
45 |   int graphRefs;
46 |   uint32_t state; // presumably a bitmask of the *_REG_* flags from the enum above (TODO confirm)
47 |   // net reg
48 |   struct ncclRegNetHandles* netHandleHead; // head of a list chained through ncclRegNetHandles::next
49 |   // nvls reg
50 |   CUdeviceptr regAddr;
51 |   size_t regUCSize, regMCSize;
52 |   int dev;
53 |   CUmemGenericAllocationHandle mcHandle;
54 |   uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* used to check if NVLS buffers match among intra-node ranks */
55 |   // collnet reg
56 |   void* collnetHandle;
57 |   struct ncclProxyConnector* collnetProxyconn;
58 |   // general ipc reg
59 |   struct ncclPeerRegIpcAddr regIpcAddrs;
60 |   struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
61 |   // symmetric reg
62 |   void* baseSymPtr; // ncclRegFindSymmetric() treats NULL as "no symmetric mapping" and otherwise offsets from this base
63 |   size_t symSize;
64 |   int winFlags;
65 | };
66 | 
67 | struct ncclRegCache {
68 |   struct ncclReg **slots;
69 |   int capacity, population;
70 |   uintptr_t pageSize;
71 | };
72 | 
73 | struct ncclWindow {
74 |   struct ncclReg* handle;
75 | };
76 | 
77 | ncclResult_t ncclRegCleanup(struct ncclComm* comm);
78 | ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
79 | ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
80 | ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
81 | ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle);
82 | 
83 | #endif
84 | 


--------------------------------------------------------------------------------
/src/include/register_inline.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_REGISTER_INLINE_H_
 2 | #define NCCL_REGISTER_INLINE_H_
 3 | 
 4 | #include "comm.h"
 5 | #include "register.h"
 6 | 
 7 | static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) {
 8 |   // Scans comm->regCache in slot order for a record that fully contains [data, data+size).
 9 |   // The scan ends at the first record starting above `data`; *outReg stays NULL when nothing matches.
10 |   struct ncclRegCache* regCache = &comm->regCache;
11 |   const uintptr_t addr = (uintptr_t)data;
12 |   *outReg = NULL;
13 |   for (int i = 0; i != regCache->population; i++) {
14 |     struct ncclReg* candidate = regCache->slots[i];
15 |     if (addr < candidate->begAddr) break;
16 |     if (addr + size <= candidate->endAddr) { *outReg = candidate; break; }
17 |   }
18 |   return ncclSuccess;
19 | }
20 | 
21 | static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) {
22 |   // Finds the registration covering [data, data+size); if it has a symmetric base, returns `data` rebased onto it.
23 |   struct ncclReg* found = NULL;
24 |   *symPtr = NULL;
25 |   *outReg = NULL;
26 |   NCCLCHECK(ncclRegFind(comm, data, size, &found));
27 |   if (found == NULL || found->baseSymPtr == NULL) return ncclSuccess;
28 |   *symPtr = (void*)((uintptr_t)found->baseSymPtr + ((uintptr_t)data - (uintptr_t)found->begAddr));
29 |   *outReg = found;
30 |   return ncclSuccess;
31 | }
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/src/include/shm.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_SHM_H_
 2 | #define NCCL_SHM_H_
 3 | 
 4 | #include "comm.h"
 5 | 
 6 | struct shmLegacyIpc {
 7 |   char shmSuffix[7];
 8 |   ncclShmHandle_t handle;
 9 |   size_t shmSize;
10 | };
11 | 
12 | struct shmCuIpc {
13 |   union {
14 |     CUmemFabricHandle handle;
15 |     CUmemGenericAllocationHandle data;
16 |   };
17 |   void *ptr;
18 |   size_t size;
19 | };
20 | 
21 | struct shmIpcDesc {
22 |   union
23 |   {
24 |     struct shmLegacyIpc shmli;
25 |     struct shmCuIpc shmci;
26 |   };
27 |   bool legacy;
28 | };
29 | 
30 | typedef struct shmIpcDesc ncclShmIpcDesc_t;
31 | 
32 | ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
33 | ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
34 | ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/include/shmutils.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_SHMUTILS_H_
 8 | #define NCCL_SHMUTILS_H_
 9 | 
10 | #include "nccl.h"
11 | 
12 | typedef void* ncclShmHandle_t;
13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle);
15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
16 | 
17 | struct ncclShmemCollBuff {
18 |   volatile size_t *cnt[2];
19 |   volatile void *ptr[2];
20 |   int round;
21 |   size_t maxTypeSize;
22 | };
23 | 
24 | ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/include/symmetric.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_DEVICE_SYMMETRIC_H_
 2 | #define NCCL_DEVICE_SYMMETRIC_H_
 3 | 
 4 | #include "nccl.h"
 5 | #include "nccl_common.h"
 6 | #include "bitops.h"
 7 | 
 8 | constexpr int ncclSymMaxBlocks = 64;
 9 | constexpr int ncclSymMaxThreads = 512;
10 | constexpr int ncclSymLLMaxEltSize = 64;
11 | 
12 | constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) {
13 |   return (ncclSymMaxThreads*ncclSymLLMaxEltSize)/eltSize; // ncclSymMaxThreads slots at the largest element size, proportionally more for smaller ones
14 | }
15 | 
16 | constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) {
17 |   return maxval(ncclSymMaxThreads*nRanks*8, ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize) * /*LL Overhead*/2; // larger of the two footprints, doubled
18 | }
19 | 
20 | struct alignas(16) ncclSymDevBase { // Fixed header; size() below accounts for the variable-length data laid out after it.
21 |   uint32_t llEpoch[ncclSymMaxBlocks];
22 |   uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks];
23 |   uint32_t barInboxMc[ncclSymMaxBlocks];
24 |   uint32_t barInboxPerPeer[]; // flexible array; size() budgets ncclSymMaxBlocks*nRanks entries, rounded up to 16 bytes
25 | 
26 |   static constexpr size_t size(int nRanks) { // total bytes: header + padded barInboxPerPeer[] + two LL epoch buffers per block
27 |     return sizeof(ncclSymDevBase) +
28 |            alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) +
29 |            ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks);
30 |   }
31 | };
32 | 
33 | static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) {
34 |   // Layout after the header: barInboxPerPeer[] (aligned up to 16 bytes), then two epoch buffers per block.
35 |   int bytesPerEpoch = ncclSymLLEpochSize(nRanks);
36 |   char* cursor = (char*)(base + 1);
37 |   // Step over the per-peer barrier inboxes.
38 |   cursor += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16);
39 |   // Advance to this block's pair of buffers, then pick one of the two by epoch parity.
40 |   cursor += block * /*epochs=*/2 * bytesPerEpoch;
41 |   cursor += (epoch & 1)*bytesPerEpoch;
42 |   return (uint4*)cursor;
43 | }
44 | 
45 | struct ncclSymDevComm {
46 |   ncclSymDevBase* base;
47 |   ncclSymDevBase* baseMc;
48 |   uint32_t stride4G;
49 |   int nRanks, rank;
50 |   uint32_t nRanks_rcp32; // idivRcp32(nRanks)
51 | };
52 | 
53 | struct alignas(16) ncclSymDevArgs {
54 |   struct ncclSymDevComm comm;
55 |   int rootRank;
56 |   uint64_t redOpArg; // must be collectively uniform
57 |   size_t nElts;
58 |   char* input;
59 |   char* output;
60 | };
61 | 
62 | enum ncclSymKernelId {
63 |   ncclSymKernelId_AllReduce_AGxLL_R,
64 |   ncclSymKernelId_AllReduce_AGxLLMC_R,
65 |   ncclSymKernelId_AllReduce_RSxLD_AGxST,
66 |   ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC,
67 | 
68 |   ncclSymKernelId_AllGather_LL,
69 |   ncclSymKernelId_AllGather_LLMC,
70 |   ncclSymKernelId_AllGather_ST,
71 |   ncclSymKernelId_AllGather_STMC,
72 | 
73 |   ncclSymKernelId_ReduceScatter_LL,
74 |   ncclSymKernelId_ReduceScatter_LD,
75 |   ncclSymKernelId_ReduceScatter_LDMC,
76 | 
77 |   ncclSymKernelId_Count
78 | };
79 | 
80 | bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
81 | 
82 | ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps);
83 | 
84 | // Generated by src/device/symmetric/generate.py
85 | extern int const ncclSymKernelCount;
86 | extern void* const ncclSymKernelList[];
87 | void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
88 | const char* ncclSymKernelIdToString(int kernelId);
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/src/include/timer.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_TIMER_H_
 8 | #define NCCL_TIMER_H_
 9 | #if ENABLE_TIMER
10 | #include <unistd.h>
11 | #include <sys/time.h>
12 | #include <x86intrin.h>
13 | static double freq = -1;
14 | static void calibrate() { // Sets freq to TSC ticks per microsecond by timing a burst of __rdtsc() calls against gettimeofday().
15 |   struct timeval tv;
16 |   gettimeofday(&tv, NULL);
17 |   uint64_t timeCycles = __rdtsc();
18 |   double time = - tv.tv_sec*1E6 - tv.tv_usec; // negated start time in microseconds; the end time is added below
19 |   uint64_t total = 0ULL;
20 |   for (int i=0; i<10000; i++) total += __rdtsc(); // presumably just busy work to lengthen the measured interval; total is not read afterwards
21 |   gettimeofday(&tv, NULL);
22 |   timeCycles = __rdtsc() - timeCycles; // elapsed ticks
23 |   time += tv.tv_sec*1E6 + tv.tv_usec; // elapsed microseconds
24 |   freq = timeCycles/time;
25 | }
26 | static inline double gettime() {
27 |   if (freq == -1) { calibrate(); } // first call: freq still holds its -1 sentinel
28 |   return __rdtsc()/freq; // ticks divided by ticks-per-microsecond
29 | }
30 | static uint64_t counts[8];
31 | static double times[8];
32 | static double startTimes[8];
33 | #define TIME_START(index) do { \
34 |   counts[index]++; \
35 |   startTimes[index] = gettime(); \
36 | } while (0)
37 | 
38 | #define TIME_STOP(index) do { \
39 |   times[index] += gettime() - startTimes[index]; \
40 | } while (0)
41 | 
42 | #define TIME_CANCEL(index) do { \
43 |   counts[index]--; \
44 | } while (0)
45 | 
46 | #define TIME_PRINT(name) do { /* Prints total/count = mean for every index with a non-zero count, then zeroes counts[]. NOTE(review): times[] is not reset here; confirm that is intended. */ \
47 |   printf("%s stats", name); \
48 |   for (int i=0; i<8; i++) { \
49 |     if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); /* counts[] is uint64_t: cast so the argument matches %llu on every ABI */ \
50 |     counts[i] = 0; \
51 |   } \
52 |   printf("\n"); \
53 | } while (0)
54 | #else
55 | #define TIME_START(index) do {} while(0)
56 | #define TIME_STOP(index) do {} while(0)
57 | #define TIME_CANCEL(index) do {} while(0)
58 | #define TIME_PRINT(name)
59 | #endif
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/include/trees.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_TREES_H_
 8 | #define NCCL_TREES_H_
 9 | 
10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/include/tuner.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_INT_TUNER_H_
 9 | #define NCCL_INT_TUNER_H_
10 | 
11 | #include "nccl_tuner.h"
12 | #include "comm.h"
13 | 
14 | // Tuning plugin to override NCCL's default algorithm/protocol tuning.
15 | 
16 | // Attempts to load NCCL tuner from environment variable.
17 | // Returns ncclSuccess if the correct tuner symbol has been found and
18 | // successfully loaded.  Otherwise returns an error and also logs the error.
19 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm);
20 | 
21 | // Cleans up NCCL tuner plugin.
22 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/init_nvtx.cc:
--------------------------------------------------------------------------------
 1 | #include "nccl.h"
 2 | #include "nvtx.h"
 3 | 
 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
 5 |   {"Sum", ncclSum, 0},
 6 |   {"Product", ncclProd, 0},
 7 |   {"Max", ncclMax, 0},
 8 |   {"Min", ncclMin, 0},
 9 |   {"Avg", ncclAvg, 0}
10 | };
11 | 
12 | // Must be called before the first call to any reduction operation.
13 | void initNvtxRegisteredEnums() {
14 |   // Describe the reduction-op enum to NVTX: entry table, entry count, enum size and schema id (the fields flagged in fieldMask).
15 |   constexpr const nvtxPayloadEnumAttr_t eAttr {
16 |     .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES |
17 |       NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID,
18 |     .name = NULL,
19 |     .entries = NvtxEnumRedSchema,
20 |     .numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
21 |     .sizeOfEnum = sizeof(ncclRedOp_t),
22 |     .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP,
23 |     .extension = nullptr
24 |   };
25 | 
26 |   nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr); // registers the description with the nccl_domain NVTX domain
27 | }
28 | 


--------------------------------------------------------------------------------
/src/misc/argcheck.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "argcheck.h"
 8 | #include "comm.h"
 9 | 
10 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { // Checks that `pointer` is known to CUDA and, if it is device memory, that it lives on comm->cudaDev.
11 |   cudaPointerAttributes attr;
12 |   cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
13 |   if (err != cudaSuccess || attr.devicePointer == NULL) { // query failed, or CUDA reports no device pointer for this address
14 |     WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer);
15 |     return ncclInvalidArgument;
16 |   }
17 | #if CUDART_VERSION >= 10000
18 |   if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { // runtimes >= 10000 expose the memory kind as attr.type
19 | #else
20 |   if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { // older runtimes expose it as attr.memoryType
21 | #endif
22 |     WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
23 |     return ncclInvalidArgument;
24 |   }
25 |   return ncclSuccess; // memory not reported as cudaMemoryTypeDevice is accepted without a device comparison
26 | }
27 | 
28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
29 |   // Rejects a NULL pointer argument with ncclInvalidArgument.
30 |   // opname and ptrname are only used to build the warning text.
31 |   if (ptr != NULL) return ncclSuccess;
32 |   WARN("%s : %s argument is NULL", opname, ptrname);
33 |   return ncclInvalidArgument;
34 | }
35 | 
36 | ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) {
37 |   // A NULL comm is rejected first; only then are its startMagic/endMagic fields compared against NCCL_MAGIC.
38 |   NCCLCHECK(PtrCheck(comm, opname, ptrname));
39 |   const bool intact = (comm->startMagic == NCCL_MAGIC) && (comm->endMagic == NCCL_MAGIC);
40 |   if (intact) return ncclSuccess;
41 |   WARN("Error: corrupted comm object detected");
42 |   return ncclInvalidArgument;
43 | }
44 | 
45 | ncclResult_t ArgsCheck(struct ncclInfo* info) { // Validates root, datatype, reduction op and (when comm->checkPointers is set) the buffers; returns at the first failing check.
46 |   // First, the easy ones
47 |   if (info->root < 0 || info->root >= info->comm->nRanks) {
48 |     WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks-1); // highest valid root is nRanks-1
49 |     return ncclInvalidArgument;
50 |   }
51 |   if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
52 |     WARN("%s : invalid type %d", info->opName, info->datatype);
53 |     return ncclInvalidArgument;
54 |   }
55 | 
56 |   // ncclMaxRedOp < info->op will always be false due to the sizes of
57 |   // the datatypes involved, and that's by design.  We keep the check though
58 |   // just as a reminder.
59 |   // coverity[result_independent_of_operands]
60 |   if (info->op < 0 || ncclMaxRedOp < info->op) {
61 |     WARN("%s : invalid reduction operation %d", info->opName, info->op);
62 |     return ncclInvalidArgument;
63 |   }
64 |   int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); // index into comm->userRedOps; only consulted for ops >= ncclNumOps
65 |   if (ncclNumOps <= info->op &&
66 |       (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) {
67 |     WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op);
68 |     return ncclInvalidArgument;
69 |   }
70 | 
71 |   if (info->comm->checkPointers) {
72 |     if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { // send/recv: the single buffer is read from recvbuff and only checked for non-empty transfers
73 |       if (info->count >0)
74 |         NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
75 |     } else {
76 |       // Check CUDA device pointers: sendbuff is checked only on the root for broadcast, recvbuff only on the root for reduce.
77 |       if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
78 |         NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
79 |       }
80 |       if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
81 |         NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
82 |       }
83 |     }
84 |   }
85 |   return ncclSuccess;
86 | }
87 | 


--------------------------------------------------------------------------------
/src/misc/mlx5dvsymbols.cc:
--------------------------------------------------------------------------------
 1 | #include <sys/types.h>
 2 | #include <unistd.h>
 3 | 
 4 | #include "mlx5/mlx5dvsymbols.h"
 5 | 
 6 | #ifdef NCCL_BUILD_MLX5DV
 7 | /* Mlx5dv linking mode. Symbols are pointers to linked MLX5 Direct Verbs */
 8 | 
 9 | #define ASSIGN_SYM(container, symbol, name) container->name= &symbol;
10 | 
11 | ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
12 |   ASSIGN_SYM(mlx5dvSymbols, mlx5dv_is_supported, mlx5dv_internal_is_supported);
13 |   ASSIGN_SYM(mlx5dvSymbols, mlx5dv_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path);
14 |   ASSIGN_SYM(mlx5dvSymbols, mlx5dv_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr);
15 |   return ncclSuccess;
16 | }
17 | 
18 | #else
19 | /* Mlx5dv dynamic loading mode. Symbols are loaded from shared objects. */
20 | 
21 | #include <dlfcn.h>
22 | #include "core.h"
23 | 
24 | // MLX5DV Library versioning
25 | #define MLX5DV_VERSION "MLX5_1.8"
26 | 
27 | ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
28 |   static void* mlx5dvhandle = NULL;
29 |   void* tmp;
30 |   void** cast;
31 | 
32 |   mlx5dvhandle=dlopen("libmlx5.so", RTLD_NOW);
33 |   if (!mlx5dvhandle) {
34 |     mlx5dvhandle=dlopen("libmlx5.so.1", RTLD_NOW);
35 |     if (!mlx5dvhandle) {
36 |       INFO(NCCL_INIT, "Failed to open libmlx5.so[.1]");
37 |       goto teardown;
38 |     }
39 |   }
40 | 
41 | #define LOAD_SYM(handle, symbol, funcptr) do {           \
42 |     cast = (void**)&funcptr;                             \
43 |     tmp = dlvsym(handle, symbol, MLX5DV_VERSION);       \
44 |     if (tmp == NULL) {                                   \
45 |       WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), MLX5DV_VERSION);  \
46 |       goto teardown;                                     \
47 |     }                                                    \
48 |     *cast = tmp;                                         \
49 |   } while (0)
50 | 
51 | // Attempt to load a specific symbol version - fail silently
52 | #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do {  \
53 |     cast = (void**)&funcptr;                                     \
54 |     *cast = dlvsym(handle, symbol, version);                     \
55 |   } while (0)
56 | 
57 |   LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported);
58 |   // Cherry-pick the mlx5dv_get_data_direct_sysfs_path API from MLX5 1.25
59 |   LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_get_data_direct_sysfs_path", mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path, "MLX5_1.25");
60 |   // Cherry-pick the ibv_reg_dmabuf_mr API from MLX5 1.25
61 |   LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_reg_dmabuf_mr", mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr, "MLX5_1.25");
62 | 
63 |   return ncclSuccess;
64 | 
65 | teardown:
66 |   mlx5dvSymbols->mlx5dv_internal_is_supported = NULL;
67 |   mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path = NULL;
68 |   mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr = NULL;
69 | 
70 |   if (mlx5dvhandle != NULL) dlclose(mlx5dvhandle);
71 |   return ncclSystemError;
72 | }
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/src/misc/mlx5dvwrap.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "mlx5/mlx5dvwrap.h"
 8 | #include <sys/types.h>
 9 | #include <unistd.h>
10 | 
11 | #ifdef NCCL_BUILD_MLX5DV
12 | #include <infiniband/mlx5dv.h>
13 | #else
14 | #include "mlx5/mlx5dvcore.h"
15 | #endif
16 | #include "mlx5/mlx5dvsymbols.h"
17 | 
18 | static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
19 | static ncclResult_t initResult;
20 | struct ncclMlx5dvSymbols mlx5dvSymbols;
21 | 
22 | ncclResult_t wrap_mlx5dv_symbols(void) {
23 |   pthread_once(&initOnceControl,
24 |                [](){ initResult = buildMlx5dvSymbols(&mlx5dvSymbols); });
25 |   return initResult;
26 | }
27 | 
/* CHECK_NOT_NULL: helper macro to check for NULL symbol.
 * Makes the enclosing function return ncclInternalError when the given member
 * of `container` is NULL. Wrapped in do/while(0) so it behaves as a single
 * statement (safe inside unbraced if/else). */
#define CHECK_NOT_NULL(container, internal_name) \
  do { \
    if (container.internal_name == NULL) { \
      WARN("lib wrapper not initialized."); \
      return ncclInternalError; \
    } \
  } while (0)

/* MLX5DV_PTR_CHECK_ERRNO: body of a wrapper around a pointer-returning call.
 * Stores the call result in `retval`; the enclosing function returns
 * ncclSystemError (after logging strerror(errno)) when the result equals
 * `error_retval`, ncclSuccess otherwise. */
#define MLX5DV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \
  CHECK_NOT_NULL(container, internal_name); \
  retval = container.call; \
  if (retval == error_retval) { \
    WARN("Call to " name " failed with error %s", strerror(errno)); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* MLX5DV_INT_CHECK_RET_ERRNO: body of a wrapper around an int-returning call
 * whose return value is itself the error code. The enclosing function returns
 * ncclSystemError (after logging the code) when the result differs from
 * `success_retval`, ncclSuccess otherwise. */
#define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
  CHECK_NOT_NULL(container, internal_name); \
  int ret = container.call; \
  if (ret != success_retval) { \
    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    return ncclSystemError; \
  } \
  return ncclSuccess;
52 | 
53 | bool wrap_mlx5dv_is_supported(struct ibv_device *device) {
54 |   if (mlx5dvSymbols.mlx5dv_internal_is_supported == NULL) {
55 |     return 0;
56 |   }
57 |   return mlx5dvSymbols.mlx5dv_internal_is_supported(device);
58 | }
59 | 
60 | ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) {
61 |   MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path");
62 | }
63 | 
64 | /* DMA-BUF support */
65 | ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) {
66 |   MLX5DV_PTR_CHECK_ERRNO(mlx5dvSymbols, mlx5dv_internal_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access), *ret, NULL, "mlx5dv_reg_dmabuf_mr");
67 | }
68 | 
69 | struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) {
70 |   if (mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr == NULL) {
71 |     errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
72 |     return NULL;
73 |   }
74 |   return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access);
75 | }


--------------------------------------------------------------------------------
/src/misc/param.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "param.h"
 8 | #include "debug.h"
 9 | 
10 | #include <algorithm>
11 | #include <errno.h>
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <string.h>
15 | #include <sys/types.h>
16 | #include <unistd.h>
17 | #include <pthread.h>
18 | #include <pwd.h>
19 | 
// Home directory recorded in the password database entry for the calling
// process's real user id, or NULL when no such entry can be found.
const char* userHomeDir() {
  struct passwd* entry = getpwuid(getuid());
  if (entry == NULL) return NULL;
  return entry->pw_dir;
}
24 | 
// Load NAME=VALUE assignments from a configuration file into the environment.
//
// - A missing or unreadable file is silently ignored.
// - Lines starting with '#' and lines without '=' are skipped.
// - The name is everything before the first '='; the value is the rest of the
//   line (without the trailing newline) and may itself contain '='.
// - Variables already present in the environment are NOT overwritten.
//
// The line is split in place, so names and values of any length are passed to
// setenv() intact instead of being cut off at a fixed buffer size.
void setEnvFile(const char* fileName) {
  FILE* file = fopen(fileName, "r");
  if (file == NULL) return;

  char* line = NULL;   // buffer owned by getline(), released below
  size_t cap = 0;
  ssize_t len;
  while ((len = getline(&line, &cap, file)) != -1) {
    if (line[0] == '#') continue;
    if (len > 0 && line[len-1] == '\n') line[len-1] = '\0';
    char* sep = strchr(line, '=');
    if (sep == NULL) continue;
    *sep = '\0';  // terminate the name; the value starts right after
    setenv(line, sep+1, 0);
  }
  free(line);
  fclose(file);
}
51 | 
52 | static void initEnvFunc() {
53 |   char confFilePath[1024];
54 |   const char* userFile = getenv("NCCL_CONF_FILE");
55 |   if (userFile && strlen(userFile) > 0) {
56 |     snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
57 |     setEnvFile(confFilePath);
58 |   } else {
59 |     const char* userDir = userHomeDir();
60 |     if (userDir) {
61 |       snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
62 |       setEnvFile(confFilePath);
63 |     }
64 |   }
65 |   snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf");
66 |   setEnvFile(confFilePath);
67 | }
68 | 
69 | void initEnv() {
70 |   static pthread_once_t once = PTHREAD_ONCE_INIT;
71 |   pthread_once(&once, initEnvFunc);
72 | }
73 | 
74 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
75 |   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
76 |   pthread_mutex_lock(&mutex);
77 |   if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
78 |     const char* str = ncclGetEnv(env);
79 |     int64_t value = deftVal;
80 |     if (str && strlen(str) > 0) {
81 |       errno = 0;
82 |       value = strtoll(str, nullptr, 0);
83 |       if (errno) {
84 |         value = deftVal;
85 |         INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
86 |       } else {
87 |         INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value);
88 |       }
89 |     }
90 |     __atomic_store_n(cache, value, __ATOMIC_RELAXED);
91 |   }
92 |   pthread_mutex_unlock(&mutex);
93 | }
94 | 
95 | const char* ncclGetEnv(const char* name) {
96 |   initEnv();
97 |   return getenv(name);
98 | }
99 | 


--------------------------------------------------------------------------------
/src/nccl.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=${nccl:Prefix}
 2 | exec_prefix=${prefix}
 3 | libdir=${exec_prefix}/lib
 4 | includedir=${prefix}/include
 5 | 
 6 | Name: nccl
 7 | Description: Optimized primitives for collective multi-GPU communication
 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
 9 | Libs: -L${libdir} -lnccl
10 | Cflags: -I${includedir}
11 | 


--------------------------------------------------------------------------------
/src/plugin/net/net_v10.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "nccl_net.h"
 8 | #include "net_device.h"
 9 | #include "proxy.h"
10 | 
11 | static ncclNet_v10_t* ncclNet_v10;
12 | static ncclCollNet_v10_t* ncclCollNet_v10;
13 | 
14 | ncclNet_t* getNcclNet_v10(void* lib) {
15 |   ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10");
16 |   if (ncclNet_v10) {
17 |     INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name);
18 |     return ncclNet_v10;
19 |   }
20 |   INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol.");
21 |   return nullptr;
22 | }
23 | 
24 | ncclCollNet_t* getNcclCollNet_v10(void* lib) {
25 |   ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10");
26 |   if (ncclCollNet_v10) {
27 |     INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name);
28 |     return ncclCollNet_v10;
29 |   }
30 |   INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol.");
31 |   return nullptr;
32 | }
33 | 


--------------------------------------------------------------------------------
/src/plugin/profiler/profiler_v4.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "comm.h"
 8 | #include "nccl_profiler.h"
 9 | #include "checks.h"
10 | 
11 | static ncclProfiler_v4_t* ncclProfiler_v4;
12 | 
13 | ncclProfiler_t* getNcclProfiler_v4(void* lib) {
14 |   ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4");
15 |   if (ncclProfiler_v4) {
16 |     INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name);
17 |     return ncclProfiler_v4;
18 |   }
19 |   INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4");
20 |   return NULL;
21 | }
22 | 


--------------------------------------------------------------------------------
/src/plugin/tuner.cc:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
  4 |  *
  5 |  * See LICENSE.txt for license information
  6 |  ************************************************************************/
  7 | 
  8 | #include <errno.h>
  9 | #include <stdlib.h>
 10 | 
 11 | #include "checks.h"
 12 | #include "debug.h"
 13 | #include "tuner.h"
 14 | #include "plugin.h"
 15 | 
 16 | extern ncclTuner_t* getNcclTuner_v2(void* lib);
 17 | extern ncclTuner_t* getNcclTuner_v3(void* lib);
 18 | extern ncclTuner_t* getNcclTuner_v4(void* lib);
 19 | 
 20 | pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
 21 | static int tunerPluginRefCount;
 22 | static void* tunerPluginLib = nullptr;
 23 | static ncclTuner_t* tunerSymbol = nullptr;
 24 | 
 25 | enum {
 26 |   tunerPluginLoadFailed  = -1,
 27 |   tunerPluginLoadReady   =  0,
 28 |   tunerPluginLoadSuccess =  1,
 29 | };
 30 | 
 31 | #define MAX_PLUGIN_LOAD 4
 32 | 
 33 | static int status = tunerPluginLoadReady;
 34 | 
 35 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
 36 |   // Initialize to nullptr by default if plugin tuner cannot be loaded.
 37 |   comm->tuner = nullptr;
 38 |   if (tunerPluginLoadFailed == status) {
 39 |     return ncclSuccess;
 40 |   }
 41 | 
 42 |   pthread_mutex_lock(&tunerPluginLock);
 43 |   if (tunerPluginLoadFailed == status) {
 44 |     goto exit;
 45 |   }
 46 | 
 47 |   if (tunerPluginLoadSuccess == status) {
 48 |     comm->tuner = tunerSymbol;
 49 |     ++tunerPluginRefCount;
 50 |     goto exit;
 51 |   }
 52 | 
 53 |   tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN"));
 54 |   if (nullptr == tunerPluginLib) {
 55 |     tunerPluginLib = ncclGetNetPluginLib();
 56 |     if (nullptr == tunerPluginLib) {
 57 |       goto fail;
 58 |     }
 59 |   }
 60 | 
 61 |   tunerSymbol = getNcclTuner_v4(tunerPluginLib);
 62 |   if (tunerSymbol == NULL) {
 63 |     tunerSymbol = getNcclTuner_v3(tunerPluginLib);
 64 |   }
 65 |   if (tunerSymbol == NULL) {
 66 |     tunerSymbol = getNcclTuner_v2(tunerPluginLib);
 67 |   }
 68 |   if (tunerSymbol == NULL) {
 69 |     goto fail;
 70 |   }
 71 | 
 72 |   comm->tuner = tunerSymbol;
 73 |   ++tunerPluginRefCount;
 74 |   status = tunerPluginLoadSuccess;
 75 |   comm->tunerPluginLoaded = 1;
 76 | 
 77 | exit:
 78 |   pthread_mutex_unlock(&tunerPluginLock);
 79 |   return ncclSuccess;
 80 | fail:
 81 |   tunerPluginLib = nullptr;
 82 |   status = tunerPluginLoadFailed;
 83 |   goto exit;
 84 | }
 85 | 
 86 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) {
 87 |   pthread_mutex_lock(&tunerPluginLock);
 88 |   if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) {
 89 |     INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
 90 |     NCCLCHECK(ncclClosePluginLib(tunerPluginLib));
 91 |     tunerPluginLib = nullptr;
 92 |     tunerSymbol = nullptr;
 93 |     comm->tuner = nullptr;
 94 |     status = tunerPluginLoadReady;
 95 |     comm->tunerPluginLoaded = 0;
 96 |   }
 97 |   pthread_mutex_unlock(&tunerPluginLock);
 98 |   return ncclSuccess;
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/plugin/tuner/tuner_v2.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #include <dlfcn.h>
 9 | #include "debug.h"
10 | #include "checks.h"
11 | #include "nccl_tuner.h"
12 | 
13 | static ncclTuner_v2_t* ncclTuner_v2;
14 | static ncclTuner_t ncclTuner;
15 | 
16 | static int hasNvlsSupport(float** collCostTable) {
17 |   // Requirements for support of different algorithms:
18 |   //
19 |   // - NVLS intra-node: nvlsSupport
20 |   // - NVLS intra+inter-node: collNetSupport
21 |   // - NVLSTree intra-node: always disabled
22 |   // - NVLSTree inter-node: nvlsSupport
23 |   // - Collnet* inter-node: collNetSupport
24 |   //
25 |   // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1
26 |   float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
27 |   return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0;
28 | }
29 | 
30 | static int hasCollNetSupport(float** collCostTable) {
31 |   float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
32 |   return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
33 | }
34 | 
35 | static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
36 |   int algorithm = NCCL_ALGO_UNDEF;
37 |   int protocol = NCCL_PROTO_UNDEF;
38 |   int nvlsSupport = hasNvlsSupport(collCostTable);
39 |   int collNetSupport = hasCollNetSupport(collCostTable);
40 |   NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels));
41 |   // set time to 0 below to make sure this algorithm/protocol is selected later on
42 |   if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) {
43 |     float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
44 |     if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0;
45 |   }
46 |   return ncclSuccess;
47 | }
48 | 
49 | static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
50 |   NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context));
51 |   ncclTuner.getCollInfo = ncclTuner_getCollInfo;
52 |   ncclTuner.destroy = ncclTuner_v2->destroy;
53 |   return ncclSuccess;
54 | }
55 | 
56 | ncclTuner_t* getNcclTuner_v2(void* lib) {
57 |   ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2");
58 |   if (ncclTuner_v2) {
59 |     ncclTuner.name = ncclTuner_v2->name;
60 |     ncclTuner.init = ncclTuner_init;
61 |     INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name);
62 |     return &ncclTuner;
63 |   }
64 |   INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
65 |   return NULL;
66 | }
67 | 


--------------------------------------------------------------------------------
/src/plugin/tuner/tuner_v3.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #include <dlfcn.h>
 9 | #include "debug.h"
10 | #include "checks.h"
11 | #include "nccl_tuner.h"
12 | 
13 | static ncclTuner_v3_t* ncclTuner_v3;
14 | static ncclTuner_t ncclTuner;
15 | 
16 | static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) {
17 |   NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto,  nChannels));
18 |   return ncclSuccess;
19 | }
20 | 
21 | static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
22 |   NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context));
23 |   ncclTuner.getCollInfo = ncclTuner_getCollInfo;
24 |   ncclTuner.destroy = ncclTuner_v3->destroy;
25 |   return ncclSuccess;
26 | }
27 | 
28 | ncclTuner_t* getNcclTuner_v3(void* lib) {
29 |   ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3");
30 |   if (ncclTuner_v3) {
31 |     ncclTuner.name = ncclTuner_v3->name;
32 |     ncclTuner.init = ncclTuner_init;
33 |     INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name);
34 |     return &ncclTuner;
35 |   }
36 |   INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
37 |   return NULL;
38 | }
39 | 


--------------------------------------------------------------------------------
/src/plugin/tuner/tuner_v4.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #include <dlfcn.h>
 9 | #include "debug.h"
10 | #include "nccl_tuner.h"
11 | 
12 | static ncclTuner_v4_t* ncclTuner_v4;
13 | 
14 | ncclTuner_t* getNcclTuner_v4(void* lib) {
15 |   ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4");
16 |   if (ncclTuner_v4) {
17 |     INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name);
18 |     return ncclTuner_v4;
19 |   }
20 |   INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
21 |   return NULL;
22 | }
23 | 


--------------------------------------------------------------------------------
/src/register/sendrecv_reg.cc:
--------------------------------------------------------------------------------
 1 | #include "register.h"
 2 | #include "transport.h"
 3 | 
 4 | ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
 5 |   ncclResult_t ret = ncclSuccess;
 6 | 
 7 |   *regFlag = 0;
 8 |   if (comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) {
 9 |     if (comm->planner.persistent && ncclParamGraphRegister()) {
10 |       ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL);
11 |     }
12 |     if (*regFlag == 0 && ncclParamLocalRegister()) {
13 |       ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle);
14 |     }
15 |   }
16 |   return ret;
17 | }
18 | 
19 | ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
20 |   ncclResult_t ret = ncclSuccess;
21 |   uintptr_t offset = 0;
22 |   uintptr_t* peerRmtAddrs = NULL;
23 | 
24 |   *regFlag = 0;
25 |   if (comm->planner.persistent && ncclParamGraphRegister()) {
26 |     ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
27 |   }
28 |   if (*regFlag == 0 && ncclParamLocalRegister()) {
29 |     ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs);
30 |   }
31 | 
32 |   if (*regFlag)
33 |     *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset);
34 |   return ret;
35 | }
36 | 


--------------------------------------------------------------------------------
/src/transport/generic.cc:
--------------------------------------------------------------------------------
 1 | #include "comm.h"
 2 | #include "transport.h"
 3 | #include "bootstrap.h"
 4 | 
 5 | ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) {
 6 |   struct ringConnInfo {
 7 |     bool useNetPXN;
 8 |     bool useGdr;
 9 |   };
10 |   struct ringConnInfo* ringInfo = NULL;
11 |   ncclResult_t ret = ncclSuccess;
12 |   if (comm && comm->nRanks > 1) {
13 |     comm->useGdr = true;
14 |     comm->useNetPXN = false;
15 |     for (int c = 0; c < comm->nChannels; c++) {
16 |       struct ncclChannel* channel = comm->channels + c;
17 |       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
18 |     }
19 |     NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail);
20 |     if (ncclParamLocalRegister() || ncclParamGraphRegister()) {
21 |       NCCLCHECK(ncclCalloc(&ringInfo, comm->nRanks));
22 |       ringInfo[comm->rank].useGdr = comm->useGdr;
23 |       ringInfo[comm->rank].useNetPXN = comm->useNetPXN;
24 |       NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, ringInfo, sizeof(struct ringConnInfo)), ret, fail);
25 |       for (int i = 0; i < comm->nRanks; ++i) {
26 |         if (!ringInfo[i].useGdr) comm->useGdr = false;
27 |         if (ringInfo[i].useNetPXN) comm->useNetPXN = true;
28 |         if (comm->useGdr == false && comm->useNetPXN == true) break;
29 |       }
30 |     }
31 |     INFO(NCCL_INIT, "Connected all rings, use ring PXN %d GDR %d", comm->useNetPXN, comm->useGdr);
32 |   }
33 | exit:
34 |   free(ringInfo);
35 |   return ret;
36 | fail:
37 |   goto exit;
38 | }
39 | 
40 | ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm) {
41 |   ncclResult_t ret = ncclSuccess;
42 |   if (comm && comm->nRanks > 1) {
43 |     // Connect Trees
44 |     for (int c = 0; c < comm->nChannels; c++) {
45 |       struct ncclChannel* channel = comm->channels + c;
46 |       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail);
47 |       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail);
48 |     }
49 |     NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
50 |     INFO(NCCL_INIT, "Connected all trees");
51 |   }
52 | exit:
53 |   return ret;
54 | fail:
55 |   goto exit;
56 | }
57 | 
58 | ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) {
59 |   ncclResult_t ret = ncclSuccess;
60 |   if (comm && comm->nRanks > 1) {
61 |     for (int mask=1; mask<comm->nRanks; mask<<=1) {
62 |       int prevPeer = (comm->rank + mask) % comm->nRanks;
63 |       int nextPeer = (comm->rank + comm->nRanks - mask) % comm->nRanks;
64 |       for (int c = 0; c < comm->nChannels; c++) {
65 |         NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &prevPeer, 1, &nextPeer, 0), ret, fail); // ReduceScatter
66 |       }
67 |       NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
68 |       for (int c = 0; c < comm->nChannels; c++) {
69 |         NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &nextPeer, 1, &prevPeer, 0), ret, fail); // AllGather
70 |       }
71 |       NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
72 |     }
73 |     INFO(NCCL_INIT, "Connected binomial trees");
74 |   }
75 | exit:
76 |   return ret;
77 | fail:
78 |   goto exit;
79 | }
80 | 


--------------------------------------------------------------------------------
/src/transport/profiler.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | #include "transport.h"
 7 | #include "proxy.h"
 8 | #include "profiler.h"
 9 | #include "device.h"
10 | 
11 | static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
12 |   connection->proxyAppendPtr = &connection->proxyAppend;
13 |   connection->shared = 1;
14 |   return ncclSuccess;
15 | }
16 | 
17 | // The following ncclProxySubArgs are overloaded by the profiler progress function:
18 | // - base       : is set to the current value of workCounter[channelId]
19 | // - posted     : is set to sub->nsteps to indicate that the profiler has started the event
20 | // - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event
21 | static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
22 |   if (args->state == ncclProxyOpReady) {
23 |     for (int s = 0; s < args->nsubs; s++) {
24 |       struct ncclProxySubArgs* sub = args->subs + s;
25 |       sub->base = sub->workCounter;
26 |       sub->posted = sub->transmitted = 0;
27 |     }
28 |     args->state = ncclProxyOpProgress;
29 |   }
30 |   if (args->state == ncclProxyOpProgress) {
31 |     for (int s = 0; s < args->nsubs; s++) {
32 |       struct ncclProxySubArgs* sub = args->subs + s;
33 |       struct ncclDevProfiler* workStarted = (struct ncclDevProfiler *)sub->sendbuff;
34 |       struct ncclDevProfiler* workCompleted = (struct ncclDevProfiler *)sub->recvbuff;
35 |       if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) {
36 |         ncclProfilerStartKernelChEvent(args, s, workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp);
37 |         sub->posted = sub->nsteps;
38 |         continue; // allow events on every channel to start
39 |       }
40 |       if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) {
41 |         ncclProfilerStopKernelChEvent(args, s, workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp);
42 |         sub->transmitted = sub->nsteps;
43 |         args->done++;
44 |       }
45 |     }
46 |     if (args->done == args->nsubs) args->state = ncclProxyOpNone;
47 |   }
48 |   return ncclSuccess;
49 | }
50 | 
51 | struct ncclTransport profilerTransport = {
52 |   "Prof",
53 |   NULL,
54 |   { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL },
55 |   { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL }
56 | };
57 | 


--------------------------------------------------------------------------------