├── .azure-pipelines
    └── integration-test.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── LICENSE.txt
├── Makefile
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── cgmanifest.json
├── ext-net
    ├── README.md
    ├── example
    │   ├── Makefile
    │   ├── nccl
    │   │   ├── common.h
    │   │   ├── err.h
    │   │   ├── net.h
    │   │   ├── net_device.h
    │   │   ├── net_v2.h
    │   │   ├── net_v3.h
    │   │   ├── net_v4.h
    │   │   ├── net_v5.h
    │   │   ├── net_v6.h
    │   │   ├── net_v7.h
    │   │   ├── net_v8.h
    │   │   └── types.h
    │   └── plugin.c
    └── google-fastsocket
    │   └── Makefile
├── ext-profiler
    └── example
    │   ├── Makefile
    │   ├── event.c
    │   ├── event.h
    │   ├── nccl
    │       ├── common.h
    │       ├── err.h
    │       ├── profiler.h
    │       ├── profiler_v1.h
    │       └── types.h
    │   ├── plugin.c
    │   ├── print_event.c
    │   └── print_event.h
├── ext-tuner
    └── example
    │   ├── Makefile
    │   ├── nccl
    │       ├── common.h
    │       ├── err.h
    │       └── tuner.h
    │   └── plugin.c
├── makefiles
    ├── common.mk
    ├── formatting.mk
    └── version.mk
├── pkg
    ├── Makefile
    ├── debian
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── changelog.in
    │   ├── compat
    │   ├── control.in
    │   ├── copyright
    │   ├── gbp.conf
    │   ├── libnccl-dev.install.in
    │   ├── libnccl2.install.in
    │   ├── rules
    │   └── source
    │   │   └── format
    ├── redhat
    │   ├── Makefile
    │   └── nccl.spec.in
    ├── srctxz
    │   ├── Makefile
    │   └── create_srctxz.sh.in
    └── txz
    │   ├── Makefile
    │   └── create_txz.sh.in
└── src
    ├── Makefile
    ├── bootstrap.cc
    ├── channel.cc
    ├── collectives.cc
    ├── debug.cc
    ├── device
        ├── Makefile
        ├── all_gather.h
        ├── all_reduce.h
        ├── broadcast.h
        ├── common.cu
        ├── common.h
        ├── common_kernel.h
        ├── generate.py
        ├── msccl_kernel.cu
        ├── network
        │   └── unpack
        │   │   ├── unpack.h
        │   │   └── unpack_defs.h
        ├── onerank.cu
        ├── op128.h
        ├── primitives.h
        ├── prims_ll.h
        ├── prims_ll128.h
        ├── prims_simple.h
        ├── reduce.h
        ├── reduce_kernel.h
        ├── reduce_scatter.h
        └── sendrecv.h
    ├── enhcompat.cc
    ├── enqueue.cc
    ├── graph
        ├── connect.cc
        ├── paths.cc
        ├── rings.cc
        ├── rings.h
        ├── search.cc
        ├── topo.cc
        ├── topo.h
        ├── trees.cc
        ├── tuning.cc
        ├── xml.cc
        └── xml.h
    ├── group.cc
    ├── include
        ├── alloc.h
        ├── argcheck.h
        ├── bitops.h
        ├── bootstrap.h
        ├── channel.h
        ├── checks.h
        ├── coll_net.h
        ├── collectives.h
        ├── comm.h
        ├── core.h
        ├── cpuset.h
        ├── cudawrap.h
        ├── debug.h
        ├── device.h
        ├── enqueue.h
        ├── gdrwrap.h
        ├── graph.h
        ├── group.h
        ├── ibvcore.h
        ├── ibvsymbols.h
        ├── ibvwrap.h
        ├── info.h
        ├── ipcsocket.h
        ├── msccl
        │   ├── msccl_kernel.h
        │   ├── msccl_lifecycle.h
        │   ├── msccl_parser.h
        │   ├── msccl_scheduler.h
        │   ├── msccl_setup.h
        │   ├── msccl_status.h
        │   └── msccl_struct.h
        ├── nccl_common.h
        ├── nccl_net.h
        ├── nccl_profiler.h
        ├── nccl_tuner.h
        ├── net.h
        ├── net_device.h
        ├── npkit
        │   ├── npkit.h
        │   ├── npkit_event.h
        │   └── npkit_struct.h
        ├── nvmlwrap.h
        ├── nvtx.h
        ├── nvtx3
        │   ├── nvToolsExt.h
        │   ├── nvToolsExtCounters.h
        │   ├── nvToolsExtCuda.h
        │   ├── nvToolsExtCudaRt.h
        │   ├── nvToolsExtMem.h
        │   ├── nvToolsExtMemCudaRt.h
        │   ├── nvToolsExtOpenCL.h
        │   ├── nvToolsExtPayload.h
        │   ├── nvToolsExtPayloadHelper.h
        │   ├── nvToolsExtSemanticsCounters.h
        │   ├── nvToolsExtSemanticsScope.h
        │   ├── nvToolsExtSync.h
        │   ├── nvtx3.hpp
        │   └── nvtxDetail
        │   │   ├── nvtxExtHelperMacros.h
        │   │   ├── nvtxExtImpl.h
        │   │   ├── nvtxExtImplCounters_v1.h
        │   │   ├── nvtxExtImplMemCudaRt_v1.h
        │   │   ├── nvtxExtImplMem_v1.h
        │   │   ├── nvtxExtImplPayload_v1.h
        │   │   ├── nvtxExtInit.h
        │   │   ├── nvtxExtPayloadHelperInternal.h
        │   │   ├── nvtxExtPayloadTypeInfo.h
        │   │   ├── nvtxExtTypes.h
        │   │   ├── nvtxImpl.h
        │   │   ├── nvtxImplCore.h
        │   │   ├── nvtxImplCudaRt_v3.h
        │   │   ├── nvtxImplCuda_v3.h
        │   │   ├── nvtxImplOpenCL_v3.h
        │   │   ├── nvtxImplSync_v3.h
        │   │   ├── nvtxInit.h
        │   │   ├── nvtxInitDecls.h
        │   │   ├── nvtxInitDefs.h
        │   │   ├── nvtxLinkOnce.h
        │   │   └── nvtxTypes.h
        ├── p2p.h
        ├── param.h
        ├── profiler.h
        ├── proxy.h
        ├── register.h
        ├── shm.h
        ├── shmutils.h
        ├── socket.h
        ├── strongstream.h
        ├── timer.h
        ├── transport.h
        ├── trees.h
        ├── tuner.h
        └── utils.h
    ├── init.cc
    ├── init_nvtx.cc
    ├── misc
        ├── argcheck.cc
        ├── cudawrap.cc
        ├── gdrwrap.cc
        ├── ibvsymbols.cc
        ├── ibvwrap.cc
        ├── ipcsocket.cc
        ├── msccl
        │   ├── msccl_lifecycle.cc
        │   ├── msccl_parser.cc
        │   ├── msccl_setup.cc
        │   └── msccl_status.cc
        ├── npkit.cc
        ├── nvmlwrap.cc
        ├── param.cc
        ├── profiler.cc
        ├── shmutils.cc
        ├── socket.cc
        ├── strongstream.cc
        ├── tuner.cc
        └── utils.cc
    ├── nccl.h.in
    ├── nccl.pc.in
    ├── net.cc
    ├── proxy.cc
    ├── register.cc
    ├── transport.cc
    └── transport
        ├── coll_net.cc
        ├── generic.cc
        ├── net.cc
        ├── net_ib.cc
        ├── net_socket.cc
        ├── nvls.cc
        ├── p2p.cc
        └── shm.cc


/.gitignore:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
2 | /build
3 | *.gcov
4 | /coverage/
5 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 |  Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 4 | 
 5 |  Redistribution and use in source and binary forms, with or without
 6 |  modification, are permitted provided that the following conditions
 7 |  are met:
 8 |   * Redistributions of source code must retain the above copyright
 9 |     notice, this list of conditions and the following disclaimer.
10 |   * Redistributions in binary form must reproduce the above copyright
11 |     notice, this list of conditions and the following disclaimer in the
12 |     documentation and/or other materials provided with the distribution.
13 |   * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
14 |     Laboratory, the U.S. Department of Energy, nor the names of their
15 |     contributors may be used to endorse or promote products derived
16 |     from this software without specific prior written permission.
17 | 
18 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
19 |  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 |  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
22 |  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 |  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 |  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
26 |  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 |  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 |  The U.S. Department of Energy funded the development of this software
31 |  under subcontract 7078610 with Lawrence Berkeley National Laboratory.
32 | 
33 | 
34 | This code also includes files from the NVIDIA Tools Extension SDK project.
35 | 
36 | See:
37 | 
38 |    https://github.com/NVIDIA/NVTX
39 | 
40 | for more information and license details.
41 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 |  Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 4 | 
 5 |  Redistribution and use in source and binary forms, with or without
 6 |  modification, are permitted provided that the following conditions
 7 |  are met:
 8 |   * Redistributions of source code must retain the above copyright
 9 |     notice, this list of conditions and the following disclaimer.
10 |   * Redistributions in binary form must reproduce the above copyright
11 |     notice, this list of conditions and the following disclaimer in the
12 |     documentation and/or other materials provided with the distribution.
13 |   * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
14 |     Laboratory, the U.S. Department of Energy, nor the names of their
15 |     contributors may be used to endorse or promote products derived
16 |     from this software without specific prior written permission.
17 | 
18 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
19 |  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 |  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
22 |  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 |  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 |  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
26 |  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 |  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 |  The U.S. Department of Energy funded the development of this software
31 |  under subcontract 7078610 with Lawrence Berkeley National Laboratory.
32 | 
33 | 
34 | This code also includes files from the NVIDIA Tools Extension SDK project.
35 | 
36 | See:
37 | 
38 |    https://github.com/NVIDIA/NVTX
39 | 
40 | for more information and license details.
41 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | .PHONY : all clean
 7 | # NOTE(review): no 'all' rule is defined in this file, and default/install/lic are not declared phony - confirm intent.
 8 | default : src.build
 9 | install : src.install
10 | BUILDDIR ?= $(abspath ./build)
11 | ABSBUILDDIR := $(abspath $(BUILDDIR))
12 | TARGETS := src pkg
13 | clean: ${TARGETS:%=%.clean}
14 | test.build: src.build
15 | LICENSE_FILES := LICENSE.txt
16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
17 | lic: $(LICENSE_TARGETS)
18 | # Copy a top-level .txt file (here: the license) into BUILDDIR, creating the directory first.
19 | ${BUILDDIR}/%.txt: %.txt
20 | 	@printf "Copying    %-35s > %s\n" $< $@
21 | 	mkdir -p ${BUILDDIR}
22 | 	cp $< $@
23 | # src.<goal> runs <goal> in the src/ sub-make, passing the absolute build directory.
24 | src.%:
25 | 	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
26 | # pkg.<goal> runs <goal> in the pkg/ sub-make, passing the absolute build directory.
27 | pkg.%:
28 | 	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
29 | # These packaging prep goals additionally require the license file to be staged (lic) first.
30 | pkg.debian.prep: lic
31 | pkg.txz.prep: lic
32 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # Support
 2 | 
 3 | ## How to file issues and get help
 4 | 
 5 | This project uses [GitHub Issues] to track bugs and feature requests. Please search the existing
 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
 7 | feature request as a new issue.
 8 | 
 9 | For help and questions about using this project, please create a new post in [GitHub Discussions].
10 | 
11 | ## Microsoft Support Policy
12 | 
13 | Support for this project is limited to the resources listed above.
14 | 
15 | [GitHub Issues]: https://github.com/Azure/msccl-executor-nccl/issues
16 | [GitHub Discussions]: https://github.com/Azure/msccl-executor-nccl/discussions
17 | 


--------------------------------------------------------------------------------
/cgmanifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "registrations": [
 3 |       {
 4 |         "component": {
 5 |           "type": "git",
 6 |           "git": {
 7 |             "repositoryUrl": "https://github.com/NVIDIA/nccl.git",
 8 |             "commitHash": "5d3ab08"
 9 |           }
10 |         }
11 |       }
12 |     ]
13 |   }
14 |   


--------------------------------------------------------------------------------
/ext-net/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME:=../../build/
 7 | CUDA_HOME:=/usr/local/cuda
 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 9 | PLUGIN_SO:=libnccl-net.so
10 | # The default goal builds the plugin shared object named by PLUGIN_SO.
11 | default: $(PLUGIN_SO)
12 | # Compile plugin.c as position-independent code into a shared library whose soname equals its file name.
13 | $(PLUGIN_SO): plugin.c
14 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
15 | # Remove the built plugin; -f keeps this from failing when it does not exist.
16 | clean:
17 | 	rm -f $(PLUGIN_SO)
18 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | // Log severity levels, then log subsystems encoded as single-bit flags (NCCL_ALL=~0 sets every bit).
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12 | // Logger callback: level, flags, source file and line, then a format string with variadic arguments. NOTE(review): flags is presumably a ncclDebugLogSubSys mask - confirm.
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_ERR_H_
 6 | #define NCCL_ERR_H_
 7 | 
 8 | /* Error type for plugins: ncclSuccess is 0; each failure kind has its own non-zero code. */
 9 | typedef enum { ncclSuccess                 =  0,
10 |                ncclUnhandledCudaError      =  1,
11 |                ncclSystemError             =  2,
12 |                ncclInternalError           =  3,
13 |                ncclInvalidArgument         =  4,
14 |                ncclInvalidUsage            =  5,
15 |                ncclRemoteError             =  6 } ncclResult_t;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_H_
 6 | #define NCCL_NET_H_
 7 | 
 8 | #include <stdint.h>
 9 | #include <stdlib.h>
10 | 
11 | #include "common.h"
12 | #include "err.h"
13 | 
14 | #define NCCL_NET_HANDLE_MAXSIZE 128
15 | // Pointer-type flags: distinct bits, so they can be OR-ed together (e.g. NCCL_PTR_HOST|NCCL_PTR_CUDA).
16 | #define NCCL_PTR_HOST 0x1
17 | #define NCCL_PTR_CUDA 0x2
18 | #define NCCL_PTR_DMABUF 0x4
19 | 
20 | // Maximum number of requests per comm object
21 | #define NCCL_NET_MAX_REQUESTS 32
22 | // Keep newest-first order: net_v3.h aliases a type declared in net_v4.h, and net_v5.h refers to ncclNetProperties_v6_t.
23 | #include "net_v8.h"
24 | #include "net_v7.h"
25 | #include "net_v6.h"
26 | #include "net_v5.h"
27 | #include "net_v4.h"
28 | #include "net_v3.h"
29 | #include "net_v2.h"
30 | 
31 | #endif // end include guard
32 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_DEVICE_H_
 8 | #define NET_DEVICE_H_
 9 | 
10 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
11 | #define NCCL_NET_MTU_SIZE                    4096
12 | 
13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
16 | 
17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
18 | // Device-offload handle, v7 layout. The v8 and unversioned handle types below are plain aliases of this struct.
19 | typedef struct {
20 |   ncclNetDeviceType netDeviceType; // Network offload type
21 |   int netDeviceVersion;            // Version number for network offload
22 |   void* handle;                    // NOTE(review): presumably opaque plugin data of `size` bytes - confirm
23 |   size_t size;
24 |   int needsProxyProgress;
25 | } ncclNetDeviceHandle_v7_t;
26 | 
27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
28 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v2.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V2_H_
 6 | #define NCCL_NET_V2_H_
 7 | // Version 2 of the network plugin interface: a name plus a table of function pointers.
 8 | typedef struct {
 9 |   // Name of the network (mainly for logs)
10 |   const char* name;
11 |   // Initialize the network.
12 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
13 |   // Return the number of adapters.
14 |   ncclResult_t (*devices)(int* ndev);
15 |   // Return the device path in /sys. NCCL will call free on this path.
16 |   ncclResult_t (*pciPath)(int dev, char** path);
17 |   // Return whether this device supports host pointers and/or CUDA pointers
18 |   // as data from the current GPU. Supported types should be composed with
19 |   // NCCL_PTR_HOST and NCCL_PTR_CUDA.
20 |   ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
21 |   // Create a receiving object and provide a handle to connect to it. The
22 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
23 |   // between ranks to create a connection.
24 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
25 |   // Connect to a handle and return a sending comm object for that peer.
26 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
27 |   // Finalize connection establishment after remote peer has called connect.
28 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
29 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
30 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
31 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
32 |   // Asynchronous send to a peer. The memory type (NCCL_PTR_HOST or NCCL_PTR_CUDA) is passed to regMr, not here.
33 |   // May return request == NULL if the call cannot be performed (or would block)
34 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
35 |   // Asynchronous recv from a peer. The memory type (NCCL_PTR_HOST or NCCL_PTR_CUDA) is passed to regMr, not here.
36 |   // May return request == NULL if the call cannot be performed (or would block)
37 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
38 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
39 |   // visible to the GPU
40 |   ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
41 |   // Test whether a request is complete. If size is not NULL, it returns the
42 |   // number of bytes sent/received.
43 |   ncclResult_t (*test)(void* request, int* done, int* size);
44 |   // Close and free send/recv/listen comm objects
45 |   ncclResult_t (*closeSend)(void* sendComm);
46 |   ncclResult_t (*closeRecv)(void* recvComm);
47 |   ncclResult_t (*closeListen)(void* listenComm);
48 | } ncclNet_v2_t;
49 | 
50 | #endif // end include guard
51 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V3_H_
 6 | #define NCCL_NET_V3_H_
 7 | 
 8 | #define NCCL_NET_MAX_REQUESTS_V3 16
 9 | // v3 reuses the v4 properties layout, so net_v4.h must be included before this header.
10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
11 | typedef struct {
12 |   // Name of the network (mainly for logs)
13 |   const char* name;
14 |   // Initialize the network.
15 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
16 |   // Return the number of adapters.
17 |   ncclResult_t (*devices)(int* ndev);
18 |   // Get various device properties.
19 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
20 |   // Create a receiving object and provide a handle to connect to it. The
21 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
22 |   // between ranks to create a connection.
23 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
24 |   // Connect to a handle and return a sending comm object for that peer.
25 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
26 |   // Finalize connection establishment after remote peer has called connect.
27 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
28 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
29 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
30 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
31 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
32 |   // Asynchronous send to a peer.
33 |   // May return request == NULL if the call cannot be performed (or would block)
34 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
35 |   // Asynchronous recv from a peer.
36 |   // May return request == NULL if the call cannot be performed (or would block)
37 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
38 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
39 |   // visible to the GPU
40 |   ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
41 |   // Test whether a request is complete. If size is not NULL, it returns the
42 |   // number of bytes sent/received.
43 |   ncclResult_t (*test)(void* request, int* done, int* size);
44 |   // Close and free send/recv/listen comm objects
45 |   ncclResult_t (*closeSend)(void* sendComm);
46 |   ncclResult_t (*closeRecv)(void* recvComm);
47 |   ncclResult_t (*closeListen)(void* listenComm);
48 | } ncclNet_v3_t;
49 | 
50 | #endif // end include guard
51 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v4.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V4_H_
 6 | #define NCCL_NET_V4_H_
 7 | 
 8 | #define NCCL_NET_HANDLE_MAXSIZE_V4 64 // NOTE(review): presumably the v4 handle size limit (net.h defines NCCL_NET_HANDLE_MAXSIZE as 128) - confirm
 9 | // Per-device properties filled in through getProperties(); net_v3.h aliases this type as ncclNetProperties_v3_t.
10 | typedef struct {
11 |   char* name;     // Used mostly for logging.
12 |   char* pciPath;  // Path to the PCI device in /sys.
13 |   uint64_t guid;  // Unique identifier for the NIC chip. Important for
14 |                   // cards with multiple PCI functions (Physical or virtual).
15 |   int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
16 |   int speed;      // Port speed in Mbps.
17 |   int port;       // Port number.
18 |   int maxComms;   // Maximum number of comms we can create
19 | } ncclNetProperties_v4_t;
20 | 
21 | // v4 network plugin interface, kept for backwards compatibility. Compared with v3, flush is replaced by iflush, which also takes a request out-parameter.
22 | typedef struct {
23 |   // Name of the network (mainly for logs)
24 |   const char* name;
25 |   // Initialize the network.
26 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
27 |   // Return the number of adapters.
28 |   ncclResult_t (*devices)(int* ndev);
29 |   // Get various device properties.
30 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
31 |   // Create a receiving object and provide a handle to connect to it. The
32 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
33 |   // between ranks to create a connection.
34 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
35 |   // Connect to a handle and return a sending comm object for that peer.
36 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
37 |   // Finalize connection establishment after remote peer has called connect.
38 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
39 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
40 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
41 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
42 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
43 |   // Asynchronous send to a peer.
44 |   // May return request == NULL if the call cannot be performed (or would block)
45 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
46 |   // Asynchronous recv from a peer.
47 |   // May return request == NULL if the call cannot be performed (or would block)
48 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
49 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
50 |   // visible to the GPU. NOTE(review): outputs a request like isend/irecv, so presumably asynchronous - confirm
51 |   ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
52 |   // Test whether a request is complete. If size is not NULL, it returns the
53 |   // number of bytes sent/received.
54 |   ncclResult_t (*test)(void* request, int* done, int* size);
55 |   // Close and free send/recv/listen comm objects
56 |   ncclResult_t (*closeSend)(void* sendComm);
57 |   ncclResult_t (*closeRecv)(void* recvComm);
58 |   ncclResult_t (*closeListen)(void* listenComm);
59 | } ncclNet_v4_t;
60 | 
61 | #endif // end include guard
62 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v5.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V5_H_
 6 | #define NCCL_NET_V5_H_
 7 | 
 8 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
 9 | typedef struct { // v5 network plugin interface: a name plus a table of callbacks
10 |   // Name of the network (mainly for logs)
11 |   const char* name;
12 |   // Initialize the network.
13 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
14 |   // Return the number of adapters in *ndev.
15 |   ncclResult_t (*devices)(int* ndev);
16 |   // Get various device properties.
17 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
18 |   // Create a receiving object and provide a handle to connect to it. The
19 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
20 |   // between ranks to create a connection.
21 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
22 |   // Connect to a handle and return a sending comm object for that peer.
23 |   // This call must not block for the connection to be established, and instead
24 |   // should return successfully with *sendComm == NULL with the expectation that
25 |   // it will be called again until *sendComm != NULL.
26 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
27 |   // Finalize connection establishment after remote peer has called connect.
28 |   // This call must not block for the connection to be established, and instead
29 |   // should return successfully with *recvComm == NULL with the expectation that
30 |   // it will be called again until *recvComm != NULL.
31 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
32 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
33 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
34 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
35 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
36 |   // Asynchronous send to a peer.
37 |   // May return *request == NULL if the call cannot be performed (or would block)
38 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
39 |   // Asynchronous recv from a peer; takes n buffers with per-buffer sizes, tags and mhandles.
40 |   // May return *request == NULL if the call cannot be performed (or would block)
41 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
42 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
43 |   // visible to the GPU
44 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
45 |   // Test whether a request is complete. If sizes is not NULL, it returns the
46 |   // number of bytes sent/received.
47 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
48 |   // Close and free send/recv/listen comm objects
49 |   ncclResult_t (*closeSend)(void* sendComm);
50 |   ncclResult_t (*closeRecv)(void* recvComm);
51 |   ncclResult_t (*closeListen)(void* listenComm);
52 | } ncclNet_v5_t;
53 | 
54 | #endif // end include guard
55 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v6.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V6_H_
 6 | #define NCCL_NET_V6_H_
 7 | 
 8 | #define NCCL_NET_MAX_REQUESTS_V6 8
 9 | 
10 | typedef struct { // Per-device properties filled in by getProperties()
11 |   char* name;     // Used mostly for logging.
12 |   char* pciPath;  // Path to the PCI device in /sys.
13 |   uint64_t guid;  // Unique identifier for the NIC chip. Important for
14 |                   // cards with multiple PCI functions (Physical or virtual).
15 |   int ptrSupport; // Combination of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
16 |   int speed;      // Port speed in Mbps.
17 |   int port;       // Port number.
18 |   float latency;  // Network latency (unit not stated in this header)
19 |   int maxComms;   // Maximum number of comms we can create
20 |   int maxRecvs;   // Maximum number of grouped receives.
21 | }ncclNetProperties_v6_t; // also used as ncclNetProperties_v5_t (typedef in net_v5.h)
22 | 
23 | typedef struct { // v6 network plugin interface: a name plus a table of callbacks
24 |   // Name of the network (mainly for logs)
25 |   const char* name;
26 |   // Initialize the network.
27 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
28 |   // Return the number of adapters in *ndev.
29 |   ncclResult_t (*devices)(int* ndev);
30 |   // Get various device properties.
31 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
32 |   // Create a receiving object and provide a handle to connect to it. The
33 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
34 |   // between ranks to create a connection.
35 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
36 |   // Connect to a handle and return a sending comm object for that peer.
37 |   // This call must not block for the connection to be established, and instead
38 |   // should return successfully with *sendComm == NULL with the expectation that
39 |   // it will be called again until *sendComm != NULL.
40 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
41 |   // Finalize connection establishment after remote peer has called connect.
42 |   // This call must not block for the connection to be established, and instead
43 |   // should return successfully with *recvComm == NULL with the expectation that
44 |   // it will be called again until *recvComm != NULL.
45 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
46 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
47 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
48 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
49 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor */
50 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
51 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
52 |   // Asynchronous send to a peer.
53 |   // May return *request == NULL if the call cannot be performed (or would block)
54 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
55 |   // Asynchronous recv from a peer; takes n buffers with per-buffer sizes, tags and mhandles.
56 |   // May return *request == NULL if the call cannot be performed (or would block)
57 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
58 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
59 |   // visible to the GPU
60 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
61 |   // Test whether a request is complete. If sizes is not NULL, it returns the
62 |   // number of bytes sent/received.
63 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
64 |   // Close and free send/recv/listen comm objects
65 |   ncclResult_t (*closeSend)(void* sendComm);
66 |   ncclResult_t (*closeRecv)(void* recvComm);
67 |   ncclResult_t (*closeListen)(void* listenComm);
68 | } ncclNet_v6_t;
69 | 
70 | #endif // end include guard
71 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v7.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V7_H_
 6 | #define NCCL_NET_V7_H_
 7 | 
 8 | #include "net_device.h"
 9 | 
10 | typedef struct { // Per-device properties filled in by getProperties()
11 |   char* name;                      // Used mostly for logging.
12 |   char* pciPath;                   // Path to the PCI device in /sys.
13 |   uint64_t guid;                   // Unique identifier for the NIC chip. Important for
14 |                                    // cards with multiple PCI functions (Physical or virtual).
15 |   int ptrSupport;                  // Combination of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
16 |   int speed;                       // Port speed in Mbps.
17 |   int port;                        // Port number.
18 |   float latency;                   // Network latency (unit not stated in this header)
19 |   int maxComms;                    // Maximum number of comms we can create
20 |   int maxRecvs;                    // Maximum number of grouped receives.
21 |   ncclNetDeviceType netDeviceType; // Network offload type (field added relative to v6)
22 |   int netDeviceVersion;            // Version number for network offload (field added relative to v6)
23 | } ncclNetProperties_v7_t;
24 | 
25 | typedef struct { // v7 network plugin interface: a name plus a table of callbacks
26 |   // Name of the network (mainly for logs)
27 |   const char* name;
28 |   // Initialize the network.
29 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
30 |   // Return the number of adapters in *ndev.
31 |   ncclResult_t (*devices)(int* ndev);
32 |   // Get various device properties.
33 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
34 |   // Create a receiving object and provide a handle to connect to it. The
35 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
36 |   // between ranks to create a connection.
37 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
38 |   // Connect to a handle and return a sending comm object for that peer.
39 |   // This call must not block for the connection to be established, and instead
40 |   // should return successfully with *sendComm == NULL with the expectation that
41 |   // it will be called again until *sendComm != NULL.
42 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
43 |   // Finalize connection establishment after remote peer has called connect.
44 |   // This call must not block for the connection to be established, and instead
45 |   // should return successfully with *recvComm == NULL with the expectation that
46 |   // it will be called again until *recvComm != NULL.
47 |   ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
48 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
49 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
50 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
51 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor */
52 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
53 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
54 |   // Asynchronous send to a peer.
55 |   // May return *request == NULL if the call cannot be performed (or would block)
56 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
57 |   // Asynchronous recv from a peer; takes n buffers with per-buffer sizes, tags and mhandles.
58 |   // May return *request == NULL if the call cannot be performed (or would block)
59 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
60 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
61 |   // visible to the GPU
62 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
63 |   // Test whether a request is complete. If sizes is not NULL, it returns the
64 |   // number of bytes sent/received.
65 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
66 |   // Close and free send/recv/listen comm objects
67 |   ncclResult_t (*closeSend)(void* sendComm);
68 |   ncclResult_t (*closeRecv)(void* recvComm);
69 |   ncclResult_t (*closeListen)(void* listenComm);
70 |   // Copy the given mhandle to a dptr in a format usable by this plugin's device code
71 |   ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
72 | 
73 |   // Notify the plugin that a recv has been completed by the device
74 |   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
75 | } ncclNet_v7_t;
76 | 
77 | #endif // end include guard
78 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v8.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V8_H_
 6 | #define NCCL_NET_V8_H_
 7 | 
 8 | #include "net_device.h"
 9 | 
10 | typedef struct { // Per-device properties filled in by getProperties()
11 |   char* name;                      // Used mostly for logging.
12 |   char* pciPath;                   // Path to the PCI device in /sys.
13 |   uint64_t guid;                   // Unique identifier for the NIC chip. Important for
14 |                                    // cards with multiple PCI functions (Physical or virtual).
15 |   int ptrSupport;                  // Combination of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
16 |   int regIsGlobal;                 // regMr is not tied to a particular comm (field added relative to v7)
17 |   int speed;                       // Port speed in Mbps.
18 |   int port;                        // Port number.
19 |   float latency;                   // Network latency (unit not stated in this header)
20 |   int maxComms;                    // Maximum number of comms we can create
21 |   int maxRecvs;                    // Maximum number of grouped receives.
22 |   ncclNetDeviceType netDeviceType; // Network offload type
23 |   int netDeviceVersion;            // Version number for network offload
24 | } ncclNetProperties_v8_t;
25 | 
26 | typedef ncclNetProperties_v8_t ncclNetProperties_t;
27 | 
28 | typedef struct { // v8 network plugin interface: a name plus a table of callbacks
29 |   // Name of the network (mainly for logs)
30 |   const char* name;
31 |   // Initialize the network.
32 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
33 |   // Return the number of adapters in *ndev.
34 |   ncclResult_t (*devices)(int* ndev);
35 |   // Get various device properties.
36 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
37 |   // Create a receiving object and provide a handle to connect to it. The
38 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
39 |   // between ranks to create a connection.
40 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
41 |   // Connect to a handle and return a sending comm object for that peer.
42 |   // This call must not block for the connection to be established, and instead
43 |   // should return successfully with *sendComm == NULL with the expectation that
44 |   // it will be called again until *sendComm != NULL.
45 |   // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
46 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
47 |   // Finalize connection establishment after remote peer has called connect.
48 |   // This call must not block for the connection to be established, and instead
49 |   // should return successfully with *recvComm == NULL with the expectation that
50 |   // it will be called again until *recvComm != NULL.
51 |   // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
52 |   ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
53 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
54 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note that size is a size_t here (it is an int in v7).
55 |   ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
56 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor */
57 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
58 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
59 |   // Asynchronous send to a peer.
60 |   // May return *request == NULL if the call cannot be performed (or would block)
61 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
62 |   // Asynchronous recv from a peer; takes n buffers with per-buffer sizes, tags and mhandles.
63 |   // May return *request == NULL if the call cannot be performed (or would block)
64 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
65 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
66 |   // visible to the GPU
67 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
68 |   // Test whether a request is complete. If sizes is not NULL, it returns the
69 |   // number of bytes sent/received.
70 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
71 |   // Close and free send/recv/listen comm objects
72 |   ncclResult_t (*closeSend)(void* sendComm);
73 |   ncclResult_t (*closeRecv)(void* recvComm);
74 |   ncclResult_t (*closeListen)(void* listenComm);
75 | 
76 |   // Copy the given mhandle to a dptr in a format usable by this plugin's device code
77 |   ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
78 | 
79 |   // Notify the plugin that a recv has been completed by the device
80 |   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
81 | } ncclNet_v8_t;
82 | 
83 | #endif // end include guard
84 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_TYPES_H_
 6 | #define NCCL_TYPES_H_
 7 | 
 8 | /* Data types; enumerators that share a value (ncclInt8/ncclChar, ncclInt32/ncclInt, ...) are aliases */
 9 | typedef enum { ncclInt8       = 0, ncclChar       = 0,
10 |                ncclUint8      = 1,
11 |                ncclInt32      = 2, ncclInt        = 2,
12 |                ncclUint32     = 3,
13 |                ncclInt64      = 4,
14 |                ncclUint64     = 5,
15 |                ncclFloat16    = 6, ncclHalf       = 6,
16 |                ncclFloat32    = 7, ncclFloat      = 7,
17 |                ncclFloat64    = 8, ncclDouble     = 8,
18 |                ncclBfloat16   = 9,
19 | } ncclDataType_t;
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/ext-net/google-fastsocket/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME?=/usr/local/cuda
 2 | INC:=-I$(CUDA_HOME)/include
 3 | PLUGIN_SO:=libnccl-net.so
 4 | .PHONY: default install clean
 5 | default: $(PLUGIN_SO)
 6 | # The plugin sources are C++ (.cc): build and link with the C++ driver so the C++ runtime is linked in.
 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc
 8 | 	$(CXX) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
 9 | # Fetch the sources on first use; until the checkout exists the unmatched wildcard is the literal target name.
10 | nccl-fastsocket/*.cc:
11 | 	git clone https://github.com/google/nccl-fastsocket.git
12 | # BUILDDIR has no default in this file and must be supplied by the caller of `make install`.
13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO)
14 | 
15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
16 | 	@printf "Grabbing %-35s > %s\n" $< $@
17 | 	mkdir -p $(BUILDDIR)/lib
18 | 	install -m 644 $< $@
19 | 
20 | clean:
21 | 	rm -f $(PLUGIN_SO)
22 | 	rm -Rf nccl-fastsocket
23 | 


--------------------------------------------------------------------------------
/ext-profiler/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME := ../../build
 7 | # CUDA_HOME was never defined here; give it an overridable default so the include path is not just "/include".
 8 | CUDA_HOME ?= /usr/local/cuda
 9 | INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
10 | PLUGIN_SO := libnccl-profiler.so
11 | default: $(PLUGIN_SO)
12 | $(PLUGIN_SO): plugin.c event.c print_event.c
13 | 	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
14 | .PHONY: default clean
15 | clean:
16 | 	rm -f $(PLUGIN_SO)
17 | 


--------------------------------------------------------------------------------
/ext-profiler/example/event.c:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <stdio.h>
 8 | #include "event.h"
 9 | 
10 | int taskEventQueueEmpty(struct group* g) {
11 |   return !g->eventHead;  // 1 when the group has no queued event, 0 otherwise
12 | }
13 | 
14 | void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
15 |   event->next = NULL;                              // the appended event becomes the end of the list
16 |   if (g->eventHead == NULL) g->eventHead = event;  // empty queue: event is also the head
17 |   else g->eventTail->next = event;                 // otherwise link it after the current tail
18 |   g->eventTail = event;
19 | }
20 | 
21 | struct taskEventBase* taskEventQueueHead(struct group* g) {
22 |   return g->eventHead;  // peek only: nothing is modified; NULL exactly when taskEventQueueEmpty(g) is true
23 | }
24 | 
25 | struct taskEventBase* taskEventQueueDequeue(struct group* g) {
26 |   struct taskEventBase* tmp = g->eventHead;       // oldest queued event, or NULL if none
27 |   if (tmp == NULL) return NULL;                   // empty queue: nothing to remove, avoid dereferencing NULL
28 |   g->eventHead = tmp->next;                       // advance the head past the removed event
29 |   if (g->eventHead == NULL) g->eventTail = NULL;  // queue drained: clear the now-stale tail
30 |   return tmp;
31 | }


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12 | // Logging callback with a printf-style (fmt, ...) tail; flags presumably carries ncclDebugLogSubSys bits - TODO confirm.
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ERR_H_
 8 | #define NCCL_ERR_H_
 9 | 
10 | /* Error type for plugins; ncclSuccess is 0 and the remaining codes are numbered 1 through 6 */
11 | typedef enum { ncclSuccess                 =  0,
12 |                ncclUnhandledCudaError      =  1,
13 |                ncclSystemError             =  2,
14 |                ncclInternalError           =  3,
15 |                ncclInvalidArgument         =  4,
16 |                ncclInvalidUsage            =  5,
17 |                ncclRemoteError             =  6 } ncclResult_t;
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_PROFILER_H_
 8 | #define NCCL_PROFILER_H_
 9 | 
10 | #include <stdint.h>
11 | #include <stdlib.h>
12 | 
13 | #include "common.h"
14 | #include "err.h"
15 | 
16 | #include "profiler_v1.h"
17 | 
18 | #endif // end include guard
19 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_TYPES_H_
 6 | #define NCCL_TYPES_H_
 7 | 
 8 | /* Data types; enumerators that share a value (ncclInt8/ncclChar, ncclInt32/ncclInt, ...) are aliases */
 9 | typedef enum { ncclInt8       = 0, ncclChar       = 0,
10 |                ncclUint8      = 1,
11 |                ncclInt32      = 2, ncclInt        = 2,
12 |                ncclUint32     = 3,
13 |                ncclInt64      = 4,
14 |                ncclUint64     = 5,
15 |                ncclFloat16    = 6, ncclHalf       = 6,
16 |                ncclFloat32    = 7, ncclFloat      = 7,
17 |                ncclFloat64    = 8, ncclDouble     = 8,
18 |                ncclBfloat16   = 9,
19 | } ncclDataType_t;
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/ext-profiler/example/print_event.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PRINT_EVENT_H_
 8 | #define PRINT_EVENT_H_
 9 | #include <stdio.h>  // FILE, used by printEvent(); keeps this header usable without a prior include
10 | void debugEvent(void* eHandle, const char* tag);
11 | void printEvent(FILE* fh, void* handle);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/ext-tuner/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME:=../../build/
 7 | CUDA_HOME?=/usr/local/cuda
 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 9 | PLUGIN_SO:=libnccl-tuner.so
10 | # CUDA_HOME uses ?= so that a value exported in the environment is honored; := would silently override it.
11 | default: $(PLUGIN_SO)
12 | 
13 | $(PLUGIN_SO): plugin.c
14 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
15 | .PHONY: default clean
16 | clean:
17 | 	rm -f $(PLUGIN_SO)
18 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12 | // Logging callback with a printf-style (fmt, ...) tail; flags presumably carries ncclDebugLogSubSys bits - TODO confirm.
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_ERR_H_
 6 | #define NCCL_ERR_H_
 7 | 
 8 | /* Error type for plugins; returned by every ncclTuner_v3_t callback. ncclSuccess is 0. */
 9 | typedef enum { ncclSuccess                 =  0,
10 |                ncclUnhandledCudaError      =  1,
11 |                ncclSystemError             =  2,
12 |                ncclInternalError           =  3,
13 |                ncclInvalidArgument         =  4,
14 |                ncclInvalidUsage            =  5,
15 |                ncclRemoteError             =  6 } ncclResult_t;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/tuner.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_TUNER_H_
 9 | #define NCCL_TUNER_H_
10 | 
11 | #include <stdint.h>
12 | #include <stdlib.h>
13 | 
14 | #include "common.h"
15 | #include "err.h"
16 | 
17 | #define NCCL_NUM_FUNCTIONS 5 // Counts only the collectives (ncclFunc_t values 0-4); Send/Recv not included for now
18 | typedef enum {
19 |   ncclFuncBroadcast = 0,
20 |   ncclFuncReduce = 1,
21 |   ncclFuncAllGather = 2,
22 |   ncclFuncReduceScatter = 3,
23 |   ncclFuncAllReduce = 4,
24 |   ncclFuncSendRecv = 5,
25 |   ncclFuncSend = 6,
26 |   ncclFuncRecv = 7,
27 |   ncclNumFuncs = 8 // number of enumerators above, unlike NCCL_NUM_FUNCTIONS
28 | } ncclFunc_t;
29 | 
30 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
31 | #define NCCL_ALGO_UNDEF -1
32 | #define NCCL_ALGO_TREE 0
33 | #define NCCL_ALGO_RING 1
34 | #define NCCL_ALGO_COLLNET_DIRECT 2
35 | #define NCCL_ALGO_COLLNET_CHAIN 3
36 | #define NCCL_ALGO_NVLS 4
37 | #define NCCL_ALGO_NVLS_TREE 5
38 | #define NCCL_ALGO_PAT 6
39 | 
40 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
41 | #define NCCL_PROTO_UNDEF -1
42 | #define NCCL_PROTO_LL 0
43 | #define NCCL_PROTO_LL128 1
44 | #define NCCL_PROTO_SIMPLE 2
45 | 
46 | #define NCCL_ALGO_PROTO_IGNORE -1.0 // cost-table value that marks an algo/proto entry as ignored
47 | 
48 | // API to be implemented by external tuner
49 | typedef struct {
50 |   // Name of the tuner
51 |   const char* name;
52 | 
53 |   // Initializes tuner states.
54 |   // Inputs:
55 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
56 |   //   - nNodes: number of nodes in current communicator.
57 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
58 |   // Outputs:
59 |   //   - context: tuner context object
60 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
61 | 
62 |   // Gets info (updated cost table and number of channels) for a given collective.
63 |   // Inputs:
64 |   //   - context: tuner context object
65 |   //   - collType: collective type, e.g., allreduce, allgather...
66 |   //   - nBytes: collective size in bytes
67 |   //   - numPipeOps: number of operations in the group
68 |   //   - numAlgo: number of algorithms in collCostTable
69 |   //   - numProto: number of protocols in collCostTable
70 |   //
71 |   // Outputs:
72 |   //   - nChannels: number of channels (hence SMs) to be used.
73 |   //
74 |   // InOut:
75 |   //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
76 |   //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
77 |   //                    NOTE(review): declared float**, but presumably a contiguous [numAlgo][numProto] array - confirm with caller.
78 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
79 |   // default tuning for the given collective.
80 |   // Also, the plugin is allowed to not set any output; unset fields will be
81 |   // set automatically by NCCL. NOTE(review): the earlier wording referred to separate
82 |   // algorithm/protocol outputs, which this v3 signature does not have.
83 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
84 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
85 |                               int* nChannels);
86 | 
87 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
88 |   // context: tuner context object
89 |   ncclResult_t (*destroy)(void* context);
90 | } ncclTuner_v3_t;
91 | 
92 | typedef ncclTuner_v3_t ncclTuner_t;
93 | 
94 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
95 | 
96 | #endif
97 | 


--------------------------------------------------------------------------------
/ext-tuner/example/plugin.c:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "tuner.h"
 8 | 
 9 | #define __hidden __attribute__ ((visibility("hidden")))
10 | 
11 | __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } // No-op init: ignores every argument, leaves *context untouched, reports success.
12 | 
13 | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
14 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
15 |                               int* nChannels) {
16 |   // Zero the Ring/Simple cost in the NCCL-generated table (NCCL evaluates the updated table to pick algo/proto); skip it if the entry is ignored or outside numAlgo x numProto.
17 |   if (NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto && collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
18 |     collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
19 |   }
20 |   *nChannels = 1;  // This example always requests a single channel.
21 |   return ncclSuccess;
22 | }
23 | 
24 | __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } // No-op teardown: ignores context and reports success.
25 | 
26 | #define PLUGIN_NAME "Example"
27 | 
28 | const ncclTuner_v3_t ncclTunerPlugin_v3 = {  // Not marked __hidden; the variable name equals the NCCL_TUNER_PLUGIN_SYMBOL string "ncclTunerPlugin_v3".
29 |   .name = PLUGIN_NAME,
30 |   .init = pluginInit,
31 |   .getCollInfo = pluginGetCollInfo,
32 |   .destroy = pluginDestroy
33 | };
34 | 


--------------------------------------------------------------------------------
/makefiles/formatting.mk:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
 8 | # As this file defines a new target (format), it should be included at least after the definition of the
 9 | # default target.
10 | 
11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
12 | ASTYLEDIR := $(BUILDDIR)/contrib
13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
16 | ASTYLEVER := 3.1
17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
18 | 
19 | $(ASTYLEDIR) :
20 | 	@mkdir -p $(ASTYLEDIR)
21 | # Order-only prerequisite: the directory must exist, but its mtime (bumped by every extraction) must not force a re-download.
22 | $(ASTYLETAR) : | $(ASTYLEDIR)
23 | 	@wget -q -O $(ASTYLETAR) $(ASTYLEURL)
24 | 
25 | $(ASTYLEBLD) : $(ASTYLETAR)
26 | 	@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
27 | 
28 | $(ASTYLEBIN) : $(ASTYLEBLD)
29 | 	${MAKE} -C $(ASTYLEBLD)
30 | 
31 | .PHONY : format
32 | format : $(ASTYLEBIN)
33 | 	@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
34 | 


--------------------------------------------------------------------------------
/makefiles/version.mk:
--------------------------------------------------------------------------------
1 | ##### version
2 | NCCL_MAJOR   := 2
3 | NCCL_MINOR   := 23
4 | NCCL_PATCH   := 4
5 | NCCL_SUFFIX  :=
6 | PKG_REVISION := 1
7 | 


--------------------------------------------------------------------------------
/pkg/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | .PHONY : all clean default build prep
 7 | # default/build/prep/all/clean only fan out to the per-package %.prep/%.build/%.clean rules below.
 8 | default : build
 9 | build : debian.build txz.build
10 | 
11 | BUILDDIR ?= $(abspath ../build)
12 | ABSBUILDDIR := $(abspath $(BUILDDIR))
13 | TARGETS := debian txz
14 | all:   ${TARGETS:%=%.build}
15 | prep:  ${TARGETS:%=%.prep}
16 | build: ${TARGETS:%=%.build}
17 | clean: ${TARGETS:%=%.clean}
18 | 
19 | %.prep:
20 | 	${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
21 | 
22 | %.build:
23 | 	${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
24 | 
25 | %.clean:
26 | 	${MAKE} -C $* clean
27 | 


--------------------------------------------------------------------------------
/pkg/debian/.gitignore:
--------------------------------------------------------------------------------
1 | /*.debhelper.log
2 | /*.debhelper
3 | /*.substvars
4 | /tmp/
5 | /files
6 | /libnccl1/
7 | /libnccl-dev/
8 | 


--------------------------------------------------------------------------------
/pkg/debian/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | DEBPREPDIR := $(BUILDDIR)/debian
11 | PKGDIR  := $(BUILDDIR)/pkg/deb/
12 | 
13 | DEBGEN_IN  := $(wildcard *.in)
14 | DEBGEN     := $(DEBGEN_IN:.in=)
15 | DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
17 | 
18 | PKG_TIMESTAMP  := $(shell date -R)
19 | PKG_ARCH       ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
20 | PKG_MULTIARCH  ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
21 | 
22 | prep : $(DEBTARGETS)
23 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
24 | 
25 | build : prep
26 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
27 | 	@printf "Building Debian package\n"
28 | 	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
29 | 	mkdir -p $(PKGDIR)
30 | 	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
31 | 
32 | clean:
33 | 	rm -Rf $(DEBPREPDIR) $(PKGDIR)
34 | 
35 | $(DEBPREPDIR)/% : %.in
36 | 	@printf "Generating %-35s > %s\n" $< $@
37 | 	mkdir -p $(DEBPREPDIR)
38 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
39 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
40 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
41 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
42 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
43 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
44 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
45 | 	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
46 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
47 | 	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
48 | 	    $< > $@
49 | 
50 | $(DEBPREPDIR)/% : %
51 | 	@printf "Grabbing   %-35s > %s\n" $< $@
52 | 	mkdir -p $(DEBPREPDIR)
53 | 	cp -f $< $@
54 | 


--------------------------------------------------------------------------------
/pkg/debian/changelog.in:
--------------------------------------------------------------------------------
1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
2 | 
3 |   * Automatic Debian package from build
4 | 
5 |  -- cudatools <cudatools@nvidia.com>  ${pkg:Timestamp}
6 | 


--------------------------------------------------------------------------------
/pkg/debian/compat:
--------------------------------------------------------------------------------
1 | 9
2 | 


--------------------------------------------------------------------------------
/pkg/debian/control.in:
--------------------------------------------------------------------------------
 1 | Source: nccl
 2 | Section: libs
 3 | Maintainer: cudatools <cudatools@nvidia.com>
 4 | Priority: optional
 5 | Build-depends: debhelper(>=9)
 6 | Standards-Version: 3.9.5
 7 | 
 8 | Package: libnccl${nccl:Major}
 9 | Section: libs
10 | Architecture: ${pkg:Arch}
11 | Depends: ${misc:Depends}, ${shlibs:Depends}
12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime
13 |  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
14 |  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
15 |  broadcast, and reduce-scatter.
16 |  It has been optimized to achieve high bandwidth on any platform using PCIe,
17 |  NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
18 |  sockets.
19 | 
20 | Package: libnccl-dev
21 | Section: libdevel
22 | Architecture: ${pkg:Arch}
23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files
25 |  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
26 |  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
27 |  broadcast, and reduce-scatter.
28 |  It has been optimized to achieve high bandwidth on any platform using PCIe,
29 |  NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
30 |  sockets.
31 | 


--------------------------------------------------------------------------------
/pkg/debian/copyright:
--------------------------------------------------------------------------------
1 | ../../LICENSE.txt


--------------------------------------------------------------------------------
/pkg/debian/gbp.conf:
--------------------------------------------------------------------------------
 1 | [DEFAULT]
 2 | debian-branch   = master
 3 | upstream-branch = master
 4 | 
 5 | ignore-new = True
 6 | 
 7 | [git-buildpackage]
 8 | 
 9 | no-purge = True
10 | 


--------------------------------------------------------------------------------
/pkg/debian/libnccl-dev.install.in:
--------------------------------------------------------------------------------
1 | include/nccl.h /usr/include
2 | include/nccl_net.h /usr/include
3 | lib/libnccl.so /usr/lib/${pkg:MultiArch}
4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
5 | lib/pkgconfig/nccl.pc /usr/lib/${pkg:MultiArch}/pkgconfig
6 | lib/msccl-algorithms /usr/share/nccl
7 | 


--------------------------------------------------------------------------------
/pkg/debian/libnccl2.install.in:
--------------------------------------------------------------------------------
1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
3 | 


--------------------------------------------------------------------------------
/pkg/debian/rules:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | 
 3 | %:
 4 | 	dh $@ --parallel
 5 | 
 6 | override_dh_auto_install:
 7 | 	PREFIX=debian/tmp dh_auto_install
 8 | 
 9 | override_dh_auto_test:
10 | 	# Do not make test
11 | 
12 | override_dh_auto_clean:
13 | 	# Do not make clean
14 | 


--------------------------------------------------------------------------------
/pkg/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (native)
2 | 


--------------------------------------------------------------------------------
/pkg/redhat/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | RPMPREPDIR := $(BUILDDIR)/redhat
11 | PKGDIR  := $(BUILDDIR)/pkg/rpm/
12 | 
13 | RPMGEN_IN  := $(wildcard *.in)
14 | RPMGEN     := $(RPMGEN_IN:.in=)
15 | RPMFILES   := $(RPMGEN)
16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
17 | 
18 | PKG_TIMESTAMP  := $(shell date -R)
19 | ARCH           := $(shell uname -m)
20 | PKG_ARCH       ?= $(shell uname -m)
21 | PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
22 | ifeq ($(PKG_MULTIARCH),)
23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
24 | PKG_MULTIARCH  := $(ARCH)-linux-gnu
25 | endif
26 | 
27 | prep : $(RPMTARGETS)
28 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
29 | 
30 | build : prep
31 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
32 | 	$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
33 | 	@printf "Building Redhat package\n"
34 | 	mkdir -p $(PKGDIR)
35 | 	rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
36 |                  --define "_rpmdir $(PKGDIR)" \
37 |                  --define "_builddir $(PKGDIR)/build/" \
38 |                  --define "_buildrootdir $(PKGDIR)/buildroot/" \
39 |                  -bb $(BUILDDIR)/redhat/nccl.spec
40 | 
41 | clean:
42 | 	rm -Rf $(RPMPREPDIR) $(PKGDIR)
43 | 
44 | $(RPMPREPDIR)/% : %.in
45 | 	@printf "Generating %-35s > %s\n" $< $@
46 | 	mkdir -p $(RPMPREPDIR)
47 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
48 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
49 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
50 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
51 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
52 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
53 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
54 | 	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
55 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
56 | 	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
57 | 	    $< > $@
58 | 
59 | $(RPMPREPDIR)/% : %
60 | 	@printf "Grabbing   %-35s > %s\n" $< $@
61 | 	mkdir -p $(RPMPREPDIR)
62 | 	cp -f $< $@
63 | 


--------------------------------------------------------------------------------
/pkg/redhat/nccl.spec.in:
--------------------------------------------------------------------------------
 1 | Name:           libnccl
 2 | Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
 3 | Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
 4 | Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 5 | # ldconfig runs from the post/postun scriptlets, so the scriptlet dependency below is declared for those phases.
 6 | Group:          Development/Libraries
 7 | License:        BSD
 8 | URL:            http://developer.nvidia.com/nccl
 9 | Source0:        nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
10 | Requires(post,postun): /sbin/ldconfig
11 | 
12 | %description
13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective
14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce,
15 | broadcast, and reduce-scatter.
16 | It has been optimized to achieve high bandwidth on any platform using PCIe,
17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
18 | sockets.
19 | 
20 | %package devel
21 | Summary:        NVIDIA Collective Communication Library (NCCL) Development Files
22 | Group:          Development/Libraries
23 | %description devel
24 | NCCL development files
25 | 
26 | %package static
27 | Summary:        NVIDIA Collective Communication Library (NCCL) Static Library
28 | Group:          Development/Libraries
29 | %description static
30 | NCCL static library
31 | 
32 | %define debug_package %{nil}
33 | 
34 | %prep
35 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
36 | 
37 | %build
38 | 
39 | %install
40 | rm -rf $RPM_BUILD_ROOT
41 | install -m 755 -d $RPM_BUILD_ROOT
42 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
43 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
44 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
45 | 
46 | # devel
47 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
48 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
49 | install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir}
50 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
51 | 
52 | # static
53 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
54 | 
55 | %post -p /sbin/ldconfig
56 | %postun -p /sbin/ldconfig
57 | 
58 | %post devel -p /sbin/ldconfig
59 | %postun devel -p /sbin/ldconfig
60 | 
61 | %clean
62 | rm -rf $RPM_BUILD_ROOT
63 | 
64 | %files devel
65 | %doc LICENSE.txt
66 | %defattr(-,root,root,-)
67 | %{_includedir}/nccl.h
68 | %{_includedir}/nccl_net.h
69 | %{_libdir}/libnccl.so
70 | 
71 | %files static
72 | %doc LICENSE.txt
73 | %defattr(-,root,root,-)
74 | %{_libdir}/libnccl_static.a
75 | 
76 | %files
77 | %doc LICENSE.txt
78 | %defattr(-,root,root,-)
79 | %{_libdir}/libnccl.so.${nccl:Major}
80 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
81 | 
82 | %changelog
83 | 


--------------------------------------------------------------------------------
/pkg/srctxz/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | TXZPREPDIR  := $(BUILDDIR)/srctxz
11 | PKGDIR  := $(BUILDDIR)/pkg/srctxz/
12 | 
13 | TXZGEN_IN  := $(wildcard *.in)
14 | TXZGEN     := $(TXZGEN_IN:.in=)
15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
16 | 
17 | PKG_REVISION   ?= 3
18 | PKG_ARCH       := $(shell uname -m)
19 | 
20 | prep: $(TXZTARGETS)
21 | 
22 | build: prep
23 | 	$(MAKE) -C ../../src clean
24 | 	@printf "Building source tar.xz package\n"
25 | 	(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
26 | 	mkdir -p $(PKGDIR)
27 | 	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
28 | 
29 | clean:
30 | 	rm -Rf $(TXZPREPDIR) $(PKGDIR)
31 | 
32 | $(TXZPREPDIR)/% : %.in
33 | 	@printf "Generating %-35s > %s\n" $< $@
34 | 	mkdir -p $(TXZPREPDIR)
35 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
36 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
37 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
38 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
39 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
40 | 	    $< > $@
41 | 


--------------------------------------------------------------------------------
/pkg/srctxz/create_srctxz.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # See LICENSE.txt for license information
 6 | #
 7 | 
 8 | # To run from $BUILDDIR/ (the script steps up to the source tree, then to its parent, and archives the tree from there)
 9 | 
10 | cd .. || exit 1  # enter the directory that will be archived; abort rather than archive the wrong tree
11 | NCCLDIR=$(basename "$PWD")  # quoted so a checkout path containing spaces still works
12 | 
13 | echo "Checking for unclean directory ..."
14 | git clean -x -i
15 | echo "Clean done."
16 | echo "Checking for uncommited files ..."
17 | if [ "$(git status -s | wc -l)" != "0" ]; then
18 |   git status -s
19 |   echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
20 |   read
21 | fi
22 | 
23 | cd .. || exit 1  # tar runs from the parent so that archive paths start with $NCCLDIR
24 | NCCL_MAJOR=${nccl:Major}
25 | NCCL_MINOR=${nccl:Minor}
26 | NCCL_PATCH=${nccl:Patch}
27 | NCCL_SUFFIX=${nccl:Suffix}
28 | NCCL_BUILD=${pkg:Revision}
29 | 
30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
31 | 
32 | tar --exclude build \
33 |     --exclude ".git*" \
34 |     --exclude pkg/srctxz \
35 |     --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$NCCLDIR"
36 | 


--------------------------------------------------------------------------------
/pkg/txz/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | TXZPREPDIR  := $(BUILDDIR)/txz
11 | PKGDIR  := $(BUILDDIR)/pkg/txz/
12 | 
13 | TXZGEN_IN  := $(wildcard *.in)
14 | TXZGEN     := $(TXZGEN_IN:.in=)
15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
16 | 
17 | PKG_ARCH   := $(shell uname -m)
18 | 
19 | prep: $(TXZTARGETS)
20 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
21 | 
22 | build: prep
23 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
24 | 	@printf "Building tar.xz package\n"
25 | 	(cd $(BUILDDIR); bash txz/create_txz.sh)
26 | 	mkdir -p $(PKGDIR)
27 | 	mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
28 | 
29 | clean:
30 | 	rm -Rf $(TXZPREPDIR) $(PKGDIR)
31 | 
32 | $(TXZPREPDIR)/% : %.in
33 | 	@printf "Generating %-35s > %s\n" $< $@
34 | 	mkdir -p $(TXZPREPDIR)
35 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
36 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
37 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
38 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
39 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
40 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
41 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
42 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
43 | 	    $< > $@
44 | 


--------------------------------------------------------------------------------
/pkg/txz/create_txz.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # See LICENSE.txt for license information
 6 | #
 7 | 
 8 | # To run from $BUILDDIR/ (only its base name is kept; tar is then run from the parent directory)
 9 | 
10 | BUILDDIR=$(basename "$PWD")  # quoted so a build path containing spaces still works
11 | 
12 | cd .. || exit 1  # abort rather than package from the wrong directory
13 | NCCL_MAJOR=${nccl:Major}
14 | NCCL_MINOR=${nccl:Minor}
15 | NCCL_PATCH=${nccl:Patch}
16 | NCCL_SUFFIX=${nccl:Suffix}
17 | CUDA_MAJOR=${cuda:Major}
18 | CUDA_MINOR=${cuda:Minor}
19 | PKG_REVISION=${pkg:Revision}
20 | PKG_ARCH=${pkg:Arch}
21 | 
22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
23 | 
24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$BUILDDIR"/include "$BUILDDIR"/lib "$BUILDDIR"/*.txt
25 | 


--------------------------------------------------------------------------------
/src/device/Makefile:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
  3 | # Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
  4 | #
  5 | # See LICENSE.txt for license information
  6 | #
  7 | # Use bash for recipes, turn off make's built-in implicit rules (-r) and suffix rules, and keep intermediate files.
  8 | SHELL := /usr/bin/env bash
  9 | MAKEFLAGS += -r
 10 | .SUFFIXES:
 11 | .SECONDARY:
 12 | 
 13 | NCCLDIR := ../..
 14 | include $(NCCLDIR)/makefiles/common.mk
 15 | include $(NCCLDIR)/makefiles/version.mk
 16 | 
 17 | BUILDDIR ?= $(abspath ../../build)
 18 | OBJDIR := $(BUILDDIR)/obj/device
 19 | 
 20 | MANIFEST := $(OBJDIR)/manifest
 21 | DEVGLUE_OBJ  := $(OBJDIR)/device_glue.o
 22 | 
 23 | INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include
 24 | NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
 25 | CXXFLAGS  += $(INCFLAGS)
 26 | 
 27 | SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY
 28 | 
 29 | COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
 30 | COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
 31 | define COMPILE
 32 | @$(SAY) "Compiling" $2;\
 33 |  mkdir -p $(dir $1);\
 34 |  $(call COMPILE$(suffix $2),$1,$2)
 35 | endef
 36 | 
 37 | DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
 38 | DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1
 39 | define DEPENDS
 40 | @$(SAY) "Dependencies" $2;\
 41 |  mkdir -p $(dir $1);\
 42 |  mk=$$($(call DEPENDS$(suffix $2),$2));\
 43 |  [[ $$mk =~ ^[^:]*:(.*)$$ ]];\
 44 |  files=$${BASH_REMATCH[1]};\
 45 |  files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\
 46 |  files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\
 47 |  echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1
 48 | endef
 49 | 
 50 | all: $(MANIFEST)
 51 | 
 52 | ifeq (1,1)
 53 | # Case if the <gensrc> directory is generated on-demand (python3 is checked first; a failed generation removes the directory so the next make retries):
 54 | $(OBJDIR)/gensrc: generate.py
 55 | 	@which python3 >/dev/null || \
 56 | 	  (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \
 57 | 	   printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \
 58 | 	   exit 1)
 59 | 	@mkdir -p $@
 60 | 	./generate.py $@ "$(ONLY_FUNCS)" || (rm -rf $@; exit 1)
 61 | else
 62 | # Case if the <gensrc> directory is pre-generated and checked in the repo as ./gen:
 63 | $(OBJDIR)/gensrc:
 64 | 	@mkdir -p $(OBJDIR); ln -srfn ./gen $@
 65 | endif
 66 | 
 67 | # The trailing ";" is necessary to make this an "empty recipe":
 68 | # https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html
 69 | $(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ;
 70 | 
 71 | -include $(OBJDIR)/gensrc/rules.mk
 72 | # "gensrc/rules.mk" populates $(LIB_OBJS_GEN)
 73 | 
 74 | SRCS = common.cu onerank.cu msccl_kernel.cu
 75 | 
 76 | LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN)
 77 | 
 78 | $(OBJDIR)/%.o: % $(OBJDIR)/%.d
 79 | 	$(call COMPILE,$@,$<)
 80 | 
 81 | $(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d
 82 | 	$(call COMPILE,$@,$(OBJDIR)/gensrc/$*)
 83 | 
 84 | $(OBJDIR)/%.d: %
 85 | 	$(call DEPENDS,$@,$<)
 86 | 
 87 | $(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/%
 88 | 	$(call DEPENDS,$@,$<)
 89 | 
 90 | $(DEVGLUE_OBJ): $(LIB_OBJS)
 91 | 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
 92 | 
 93 | $(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ)
 94 | 	@echo $^ > $@
 95 | 
 96 | -include $(wildcard $(OBJDIR)/*.d)
 97 | -include $(wildcard $(OBJDIR)/genobj/*.d)
 98 | 
 99 | .PHONY: clean
100 | clean:
101 | 	rm -rf $(OBJDIR)
102 | 


--------------------------------------------------------------------------------
/src/device/broadcast.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "primitives.h"
10 | 
11 | namespace {  // Ring broadcast body shared by the RunWorkColl specializations in this header.
12 |   template<typename T, typename RedOp, typename Proto>
13 |   __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
14 |     ncclRing *ring = &ncclShmem.channel.ring;
15 |     const int rank = ring->userRanks[0];  // userRanks[0] is taken as this rank
16 |     const int nextRank = ring->userRanks[1];  // userRanks[1] is taken as the next rank on the ring
17 |     const int root = work->root;
18 |     size_t chunkCount;
19 |     size_t channelCount;
20 |     size_t gridOffset;
21 |     ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);  // outputs: gridOffset, channelCount, chunkCount
22 |     size_t offset;
23 |     int nelem;  // NOTE(review): assigned from a size_t min() below; assumes one chunk fits in int — confirm
24 | 
25 |     T *inputBuf = (T*)work->sendbuff;
26 |     T *outputBuf = (T*)work->recvbuff;
27 |     // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
28 |     // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
29 |     // coverity[callee_ptr_arith:FALSE]
30 |     Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
31 |       prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
32 | 
33 |     for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {  // one chunk of at most chunkCount elements per iteration
34 |       offset = gridOffset + elemOffset;
35 |       nelem = min(chunkCount, channelCount - elemOffset);  // last chunk may be shorter
36 | 
37 |       if (rank == root) {  // this rank is the broadcast root
38 |         if (inputBuf == outputBuf) {  // in-place: send only
39 |           prims.directSend(offset, offset, nelem);
40 |         } else {  // out-of-place: copy-and-send primitive (going by its name)
41 |           prims.directCopySend(offset, offset, nelem);
42 |         }
43 |       } else if (nextRank == root) {  // the next rank is the root: receive without forwarding
44 |         prims.directRecv(offset, offset, nelem);
45 |       } else {  // any other rank: receive, copy and forward (going by the primitive's name)
46 |         prims.directRecvCopyDirectSend(offset, nelem);
47 |       }
48 |     }
49 |   }
50 | }
51 | 
52 | template<typename T, typename RedOp>
53 | struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {  // Broadcast, ring algorithm, Simple protocol
54 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
55 |     using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;  // ProtoSimple parameterized from the BROADCAST_* step constants
56 |     runRing<T, RedOp, Proto>(tid, nthreads, work);
57 |   }
58 | };
59 | 
60 | template<typename T, typename RedOp>
61 | struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {  // Broadcast, ring algorithm, LL protocol
62 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
63 |     runRing<T, RedOp, ProtoLL>(tid, nthreads, work);  // same ring body, instantiated with ProtoLL
64 |   }
65 | };
66 | 
67 | template<typename T, typename RedOp>
68 | struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {  // Broadcast, ring algorithm, LL128 protocol
69 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
70 |     runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);  // same ring body, instantiated with ProtoLL128
71 |   }
72 | };
73 | 


--------------------------------------------------------------------------------
/src/device/common.cu:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "common.h"
10 | 
11 | __shared__ ncclShmemData ncclShmem;
12 | #if __CUDA_ARCH__ < 700
13 |   __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
14 | #endif
15 | 
16 | struct RunWorkNop {  // Work runner whose run() does nothing; passed to ncclKernelMain by the generic kernel below.
17 |   __device__ void run() {}
18 | };
19 | 
20 | __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {  // Kernel entry point; forwards its arguments to ncclKernelMain.
21 |   ncclKernelMain<-1, RunWorkNop>(&args4K.args);  // instantiated with -1 and the no-op runner
22 | }
23 | 
24 | __device__ void ncclDevFunc_Nop() {}  // Device function with an empty body.
25 | 


--------------------------------------------------------------------------------
/src/device/network/unpack/unpack_defs.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, Google LLC.  All rights reserved.
 3 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | #ifndef NET_DEVICE_UNPACK_DEFS_H
 8 | #define NET_DEVICE_UNPACK_DEFS_H
 9 | 
10 | #include <stdint.h>
11 | 
12 | #include "device.h"
13 | 
14 | #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16
15 | 
16 | union alignas(16) loadMeta {  // 16-byte record, viewable as two raw 64-bit words or as the named fields below.
17 |   uint64_t r64[2];
18 |   struct {
19 |     uint32_t src_off;
20 |     uint32_t len;
21 |     uint64_t dst_off;
22 |   };
23 | };
24 | static_assert(sizeof(union loadMeta) == 16, "loadMeta must be exactly 16 bytes");
25 | 
26 | /****** global memory ******/
27 | 
28 | #define NET_UNPACK_MAX_QUEUE_DEPTH 16  // MAX_REQUESTS
29 | #define NET_UNPACK_MAX_SLICE_SIZE 4194304  // 4MB per Irecv call
30 | #define SLICE_PAGE_SIZE 4096
31 | #define NET_UNPACK_MAX_SLICE_PAGES \
32 |   (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2)  // * 2 for slack, wasteful..
33 | 
34 | struct netUnpackMeta {
35 |   loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES];
36 |   uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH];
37 | };
38 | 
39 | struct unpackNetDeviceHandle {
40 |   struct netUnpackMeta *meta;  // mapped
41 |   void* bounce_buf;
42 |   uint64_t head;
43 | };
44 | 
45 | /****** shared memory ******/
46 | 
47 | #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h
48 | #define NET_UNPACK_MAX_NPEERS 2  // The most you should have is 2 network peers per-group (indexed by index)
49 | #define WARP_SHM_PAGE_CNT 4
50 | #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta))
51 | struct unpackShmem {
52 |   void* bounce_buf;
53 | };
54 | // Per-group unpack state; head and g_meta hold one entry per network peer (NET_UNPACK_MAX_NPEERS).
55 | struct unpackGroupShmem {
56 |   int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv (NOTE(review): the field is named as a mask and the arrays below have NET_UNPACK_MAX_NPEERS entries -- this remark may be stale, confirm)
57 |   uint64_t head[NET_UNPACK_MAX_NPEERS];
58 |   struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
59 | };
60 | 
61 | #endif // NET_DEVICE_UNPACK_DEFS_H_
62 | 


--------------------------------------------------------------------------------
/src/device/onerank.cu:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #include "alloc.h"
 9 | #include "collectives.h"
10 | #include "common_kernel.h"
11 | #include "common.h"
12 | #include <cuda_runtime.h>
13 | // Element-wise kernel launched by ncclLaunchOneRank: runs reduceCopy with RedOp (pre-op enabled) from src to dst.
14 | namespace {
15 |   template<typename RedOp>
16 |   __global__ __launch_bounds__(512, 1)
17 |   void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) {
18 |     using T = typename RedOp::EltType;
19 |     int tid = threadIdx.x;
20 |     int tn = blockDim.x;
21 |     int bid = blockIdx.x;
22 |     int bn = gridDim.x;
23 | 
24 |     // Each block gets an equal segment of whole 16-byte packs. The share is ceil(nElts/bn) rounded up to a pack; flooring nElts/bn first would leave up to bn-1 tail elements unprocessed.
25 |     constexpr int EltPerPack = 16/sizeof(T);
26 |     intptr_t i0 = (bid+0)*alignUp((nElts+bn-1)/bn, EltPerPack);
27 |     intptr_t i1 = (bid+1)*alignUp((nElts+bn-1)/bn, EltPerPack);
28 |     i0 = min(i0, nElts);  // clamp: trailing blocks may end up with a short or empty segment
29 |     i1 = min(i1, nElts);
30 |     src = (T*)src + i0;
31 |     dst = (T*)dst + i0;
32 | 
33 |     if (redOpArgIsPtr) {  // redOpArg is an address: load the scalar with the widest access the address alignment permits
34 |       if (redOpArg%2 != 0) {
35 |         redOpArg = *reinterpret_cast<uint8_t*>(redOpArg);
36 |       } else if (redOpArg%4 != 0) {
37 |         redOpArg = *reinterpret_cast<uint16_t*>(redOpArg);
38 |       } else if (redOpArg%8 != 0) {
39 |         redOpArg = *reinterpret_cast<uint32_t*>(redOpArg);
40 |       } else {
41 |         redOpArg = *reinterpret_cast<uint64_t*>(redOpArg);
42 |       }
43 |     }
44 |     reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
45 |       (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0);
46 |   }
47 | }
48 | // Host entry point: copies src to dst, or for ncclDevPreMulSum launches the oneRankReduce instantiation matching eltType on stream.
49 | ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) {
50 |   size_t eltSize = ncclTypeSize(eltType);
51 |   if (redOp.op != ncclDevPreMulSum) {  // for any other op the result is just a copy of src
52 |     if (dst != src) {  // in place: nothing to do
53 |       NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream));
54 |     }
55 |     return ncclSuccess;
56 |   }
57 | 
58 |   void const* kernel;  // set by the switch below; unsupported types return ncclInvalidArgument
59 |   switch (eltType) {
60 |   case ncclInt8:     kernel = (void const*)&oneRankReduce<FuncPreMulSum<int8_t>>; break;
61 |   case ncclUint8:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint8_t>>; break;
62 |   case ncclInt32:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int32_t>>; break;
63 |   case ncclUint32:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
64 |   case ncclInt64:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
65 |   case ncclUint64:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
66 |   case ncclFloat16:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
67 | #if defined(__CUDA_BF16_TYPES_EXIST__)
68 |   case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
69 | #endif
70 | #if defined(__CUDA_FP8_TYPES_EXIST__)
71 |   case ncclFp8E4M3: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e4m3>>; break;
72 |   case ncclFp8E5M2: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e5m2>>; break;
73 | #endif
74 |   case ncclFloat32:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<float>>; break;
75 |   case ncclFloat64:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<double>>; break;
76 |   default: return ncclInvalidArgument;
77 |   }
78 |   dim3 grid = {0, 1, 1};
79 |   grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10));  // one block per 16 KiB of data, at most 32 blocks; NOTE(review): nElts == 0 gives grid.x == 0 -- confirm callers never pass an empty buffer
80 |   dim3 block = {512, 1, 1};  // same thread count as the kernel's __launch_bounds__(512, 1)
81 |   void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr};  // same order as oneRankReduce's parameters
82 |   CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream));
83 |   return ncclSuccess;
84 | }
85 | 


--------------------------------------------------------------------------------
/src/device/reduce.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "primitives.h"
10 | // Ring implementation of Reduce shared by all protocols: each rank picks one of three roles (send only, final reduce, reduce-and-forward) and processes its channel's data chunk by chunk.
11 | namespace {
12 |   template<typename T, typename RedOp, typename Proto>
13 |   __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
14 |     ncclRing *ring = &ncclShmem.channel.ring;
15 |     const int nranks = ncclShmem.comm.nRanks;
16 |     const int rank = ncclShmem.comm.rank;
17 |     const int prevRank = ring->userRanks[nranks-1];  // last entry of the ring order; presumably our predecessor -- TODO confirm that userRanks[0] is this rank
18 |     const int root = work->root;
19 |     size_t chunkCount;
20 |     size_t channelCount;
21 |     size_t gridOffset;
22 |     ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);  // fills gridOffset, channelCount and chunkCount for this channel
23 |     size_t offset;
24 |     int nelem;
25 | 
26 |     // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
27 |     // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
28 |     // coverity[callee_ptr_arith:FALSE]
29 |     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
30 |       prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
31 | 
32 |     if (prevRank == root) {  // prevRank is the root: this rank only sends its own data
33 |       for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
34 |         offset = gridOffset + elemOffset;
35 |         nelem = min(chunkCount, channelCount - elemOffset);  // last chunk may be shorter
36 |         prims.send(offset, nelem);
37 |       }
38 |     }
39 |     else if (rank == root) {  // this rank is the root: recvReduceCopy with postOp enabled, nothing is sent on
40 |       for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
41 |         offset = gridOffset + elemOffset;
42 |         nelem = min(chunkCount, channelCount - elemOffset);
43 |         prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
44 |       }
45 |     }
46 |     else {  // any other rank: recvReduceSend, i.e. the partial result is passed on
47 |       for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
48 |         offset = gridOffset + elemOffset;
49 |         nelem = min(chunkCount, channelCount - elemOffset);
50 |         prims.recvReduceSend(offset, nelem);
51 |       }
52 |     }
53 |   }
54 | }
55 | // Reduce over the ring algorithm: one specialization per protocol, each forwarding to runRing with the matching Proto type.
56 | template<typename T, typename RedOp>
57 | struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
58 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
59 |     using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;  // Simple protocol parameterized by the Reduce chunk/slice step constants
60 |     runRing<T, RedOp, Proto>(tid, nthreads, work);
61 |   }
62 | };
63 | 
64 | template<typename T, typename RedOp>
65 | struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
66 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
67 |     runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
68 |   }
69 | };
70 | 
71 | template<typename T, typename RedOp>
72 | struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
73 |   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
74 |     runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
75 |   }
76 | };
77 | 


--------------------------------------------------------------------------------
/src/enhcompat.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
 8 | 
 9 | enum cudaError_t { cudaErrorStubLibrary = 34 };  // local stand-in for the error type: this file includes no CUDA header
10 | 
11 | extern "C" {
12 | // Each function below is declared weak with hidden visibility, then defined as a variadic stub that returns cudaErrorStubLibrary.
13 | cudaError_t cudaStreamGetCaptureInfo_v2(...)         __attribute__((visibility("hidden"))) __attribute((weak));
14 | cudaError_t cudaStreamGetCaptureInfo_v2(...)         { return cudaErrorStubLibrary; }
15 | 
16 | cudaError_t cudaUserObjectCreate(...)                __attribute__((visibility("hidden"))) __attribute((weak));
17 | cudaError_t cudaUserObjectCreate(...)                { return cudaErrorStubLibrary; }
18 | 
19 | cudaError_t cudaGraphRetainUserObject(...)           __attribute__((visibility("hidden"))) __attribute((weak));
20 | cudaError_t cudaGraphRetainUserObject(...)           { return cudaErrorStubLibrary; }
21 | 
22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
24 | 
25 | cudaError_t cudaGetDriverEntryPoint(...)             __attribute__((visibility("hidden"))) __attribute((weak));
26 | cudaError_t cudaGetDriverEntryPoint(...)             { return cudaErrorStubLibrary; }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/src/graph/rings.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "core.h"
 8 | // Logs "prefix v0 v1 ..." as one INFO line of at most 127 characters; longer output is cut and ends in "...".
 9 | void dumpLine(int* values, int nranks, const char* prefix) {
10 |   constexpr int kLineLen = 128;
11 |   char text[kLineLen];
12 |   // Column width = number of characters of the largest rank, from a zero-length dry-run format.
13 |   const int fieldWidth = snprintf(nullptr, 0, "%d", nranks-1);
14 |   int wanted = snprintf(text, kLineLen, "%s", prefix);
15 |   // "wanted" accumulates what snprintf would have written, so it can grow
16 |   // past the buffer; the loop condition keeps it from being used as an offset then.
17 |   for (int idx = 0; idx < nranks && wanted < kLineLen-1; ++idx) {
18 |     wanted += snprintf(&text[wanted], kLineLen - wanted, " %*d", fieldWidth, values[idx]);
19 |   }
20 |   if (wanted >= kLineLen) {
21 |     // The text did not fit. kLineLen is assumed to be at least 4, so the
22 |     // last three characters (plus terminator) are replaced by an ellipsis.
23 |     snprintf(&text[kLineLen-4], 4, "...");
24 |   }
25 |   INFO(NCCL_INIT, "%s", text);
26 | }
27 | // For every ring, writes the rank order seen from "rank" (following next[]) into rings[] and validates it; ncclInternalError on a broken ring.
28 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
29 |   for (int ringId = 0; ringId < nrings; ++ringId) {
30 |     int* order = rings + ringId*nranks;           // output slice for this ring
31 |     const int* successor = next + ringId*nranks;  // successor[x]: rank that follows x on this ring
32 |     // prev is not used: its only reader was a disabled debug dump of the
33 |     // raw prev/next tables (two dumpLine calls per channel) at this spot.
34 |     // Follow the successor links nranks times, starting from our own rank.
35 |     int cursor = rank;
36 |     for (int pos = 0; pos < nranks; ++pos) {
37 |       order[pos] = cursor;
38 |       cursor = successor[cursor];
39 |     }
40 |     char label[40];
41 |     snprintf(label, sizeof(label), "Channel %02d/%02d :", ringId, nrings);
42 |     if (rank == 0) dumpLine(order, nranks, label);
43 |     if (cursor != rank) {
44 |       WARN("Error : ring %d does not loop back to start (%d != %d)", ringId, cursor, rank);
45 |       return ncclInternalError;
46 |     }
47 |     // Every rank has to show up somewhere in the ring.
48 |     for (int wantedRank = 0; wantedRank < nranks; ++wantedRank) {
49 |       bool present = false;
50 |       for (int pos = 0; pos < nranks; ++pos) {
51 |         if (order[pos] != wantedRank) continue;
52 |         present = true;
53 |         break;
54 |       }
55 |       // A missing rank means the next[] table does not describe a full ring.
56 |       if (!present) {
57 |         WARN("Error : ring %d does not contain rank %d", ringId, wantedRank);
58 |         return ncclInternalError;
59 |       }
60 |     }
61 |   }
62 |   return ncclSuccess;
63 | }
64 | 


--------------------------------------------------------------------------------
/src/graph/rings.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * See LICENSE.txt for license information
5 |  ************************************************************************/
6 | 
7 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
8 | 


--------------------------------------------------------------------------------
/src/graph/trees.cc:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "nccl.h"
  8 | 
  9 | #define RANK_TO_INDEX(r) ((r) > root ? (r)-1 : (r))  // index of rank r once root is skipped; uses its argument (it used to read `rank` instead) and expects `root` in scope
 10 | 
 11 | /* Btree which alternates leaves and nodes.
 12 |  * Assumes root is 0, which conveniently builds a tree on powers of two,
 13 |  * (because we have pow2-1 ranks) which lets us manipulate bits.
 14 |  * Find first non-zero bit, then :
 15 |  * Find the parent :
 16 |  *   xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
 17 |  *   xx11[0] -> xx10[0] (3,7,11 below)
 18 |  * Find the children :
 19 |  *   xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
 20 |  *   xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
 21 |  *
 22 |  * Illustration :
 23 |  * 0---------------8
 24 |  *          ______/ \______
 25 |  *         4               12
 26 |  *       /   \            /  \
 27 |  *     2       6       10     \
 28 |  *    / \     / \     /  \     \
 29 |  *   1   3   5   7   9   11    13
 30 |  */
 31 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
 32 |   // Lowest set bit of rank. For rank 0 no bit is set, so this ends up as the
 33 |   // first power of two that is >= nranks.
 34 |   int bit = 1;
 35 |   while (bit < nranks && (bit & rank) == 0) bit <<= 1;
 36 | 
 37 |   // Rank 0 is the root: no parent, no child 0, and *parentChildType is left untouched.
 38 |   if (rank == 0) {
 39 |     *u = -1;
 40 |     *d0 = -1;
 41 |     // Its child has a rank > 0, so it is reported as child 1, not child 0.
 42 |     *d1 = (nranks > 1) ? (bit >> 1) : -1;
 43 |     return ncclSuccess;
 44 |   }
 45 | 
 46 |   // Parent: clear our lowest bit and set the next higher one; if that is out of range, only clear our bit.
 47 |   int parent = (rank ^ bit) | (bit << 1);
 48 |   if (parent >= nranks) parent = rank ^ bit;
 49 |   *parentChildType = (rank < parent) ? 0 : 1;  // 0: we are the parent's first child, 1: its second
 50 |   *u = parent;
 51 | 
 52 |   // half == 0 (odd rank) means no children. The first child rank-half is always within bounds.
 53 |   const int half = bit >> 1;
 54 |   const int left = (half == 0) ? -1 : rank - half;
 55 | 
 56 |   // Second child: the first of rank+half, rank+half/2, ... that is within bounds, or -1 if none is.
 57 |   int right = -1;
 58 |   for (int step = half; step > 0 && right == -1; step >>= 1) {
 59 |     if (rank + step < nranks) right = rank + step;
 60 |   }
 61 |   *d0 = left;
 62 |   *d1 = right;
 63 | 
 64 |   return ncclSuccess;
 65 | }
 66 | 
 67 | /* Build a double binary tree. Take the previous tree for the first tree.
 68 |  * For the second tree, we use a mirror tree (if nranks is even)
 69 |  *
 70 |  * 0---------------8                   3----------------11
 71 |  *          ______/ \                 / \______
 72 |  *         4         \               /         7
 73 |  *       /   \        \             /        /   \
 74 |  *     2       6       10         1        5      9
 75 |  *    / \     / \     /  \       / \      / \    / \
 76 |  *   1   3   5   7   9   11     0   2    4   6  8   10
 77 |  *
 78 |  * or shift it by one rank (if nranks is odd).
 79 |  *
 80 |  * 0---------------8            1---------------9
 81 |  *          ______/ \______              ______/ \______
 82 |  *         4               12           5                0
 83 |  *       /   \            /           /   \            /
 84 |  *     2       6       10           3       7       11
 85 |  *    / \     / \     /  \         / \     / \     /  \
 86 |  *   1   3   5   7   9   11       2   4   6   8  10   12
 87 |  */
 88 | ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
 89 |   // Tree 0: the plain binary tree.
 90 |   ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
 91 |   // Tree 1: the same binary tree evaluated for a remapped rank; the neighbours
 92 |   // it returns are mapped back afterwards, and -1 ("none") is passed through.
 93 |   int up, down0, down1;
 94 |   if (nranks % 2 == 1) {
 95 |     // Odd rank count: shift ranks down by one (wrapping around), then shift the results back up.
 96 |     ncclGetBtree(nranks, (rank-1+nranks) % nranks, &up, &down0, &down1, parentChildType1);
 97 |     *s1   = (up    == -1) ? -1 : (up+1)    % nranks;
 98 |     *d1_0 = (down0 == -1) ? -1 : (down0+1) % nranks;
 99 |     *d1_1 = (down1 == -1) ? -1 : (down1+1) % nranks;
100 |   } else {
101 |     // Even rank count: mirror ranks (r -> nranks-1-r) on the way in and on the way out.
102 |     const int last = nranks-1;
103 |     ncclGetBtree(nranks, last-rank, &up, &down0, &down1, parentChildType1);
104 |     *s1   = (up    == -1) ? -1 : last-up;
105 |     *d1_0 = (down0 == -1) ? -1 : last-down0;
106 |     *d1_1 = (down1 == -1) ? -1 : last-down1;
107 |   }
108 |   return ncclSuccess;
109 | }
110 | 


--------------------------------------------------------------------------------
/src/include/argcheck.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ARGCHECK_H_
 8 | #define NCCL_ARGCHECK_H_
 9 | 
10 | #include "core.h"
11 | #include "info.h"
12 | 
13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
14 | ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname);
15 | ncclResult_t ArgsCheck(struct ncclInfo* info);
16 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/include/bootstrap.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_BOOTSTRAP_H_
 8 | #define NCCL_BOOTSTRAP_H_
 9 | 
10 | #include "nccl.h"
11 | #include "comm.h"
12 | // Bootstrap handle: a 64-bit magic value plus a socket address. It must fit inside an ncclUniqueId (checked below).
13 | struct ncclBootstrapHandle {
14 |   uint64_t magic;
15 |   union ncclSocketAddress addr;
16 | };
17 | static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
18 | 
19 | ncclResult_t bootstrapNetInit();
20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
22 | ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
23 | ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
25 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
27 | ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag);
28 | ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size);
29 | ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
30 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
31 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
32 | ncclResult_t bootstrapClose(void* commState);
33 | ncclResult_t bootstrapAbort(void* commState);
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/include/channel.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CHANNEL_H_
 8 | #define NCCL_CHANNEL_H_
 9 | #include "comm.h"
10 | #include "utils.h"
11 | 
12 | #include <algorithm>
13 | 
14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid);
15 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
16 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
17 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
18 | // Maps a p2p round number to a channel base, truncated to 8 bits.
19 | inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) {
20 |   // Single node: the round number itself.
21 |   if (comm->nNodes <= 1) return p2pRound & 0xff;
22 | 
23 |   // Several nodes: split the round into a node part and a local part. Each node
24 |   // part spans divUp(maxLocalRanks, batch) bases; the local part adds its batch index.
25 |   const int perNode = comm->maxLocalRanks;
26 |   int base = (p2pRound/perNode) * divUp(perNode, NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
27 |   base += (p2pRound%perNode) / NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
28 |   return base & 0xff;
29 | }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/include/coll_net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COLL_NET_H_
 8 | #define COLL_NET_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_net.h"
12 | 
13 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
14 | 
15 | // Translation to external API: each wrapper forwards to the matching entry of comm->ncclCollNet and propagates failures through NCCLCHECK. None of them checks comm->ncclCollNet for NULL; collNetSupport() below does.
16 | static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
17 | static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
18 | static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
19 | static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
20 | static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
21 | static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
22 | static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
23 | /* DMA-BUF support */
24 | static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
25 | static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
26 | static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
27 |   NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
28 | static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
29 | static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
30 | static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
31 | static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
32 | // Returns 1 when comm->ncclCollNet is set, 0 otherwise.
33 | static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/src/include/core.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_CORE_H_
 9 | #define NCCL_CORE_H_
10 | 
11 | #include <pthread.h>
12 | #include <unistd.h>
13 | #include <stdlib.h>
14 | #include <stdint.h>
15 | #include <algorithm> // For std::min/std::max
16 | #include "nccl.h"
17 | // NCCL_API(ret, func, args...) opens the definition of a public entry point: extern "C" with default visibility. With PROFAPI defined it also declares p<func> as an alias of func and marks func weak.
18 | #ifdef PROFAPI
19 | #define NCCL_API(ret, func, args...)        \
20 |     __attribute__ ((visibility("default"))) \
21 |     __attribute__ ((alias(#func)))          \
22 |     ret p##func (args);                     \
23 |     extern "C"                              \
24 |     __attribute__ ((visibility("default"))) \
25 |     __attribute__ ((weak))                  \
26 |     ret func(args)
27 | #else
28 | #define NCCL_API(ret, func, args...)        \
29 |     extern "C"                              \
30 |     __attribute__ ((visibility("default"))) \
31 |     ret func(args)
32 | #endif // end PROFAPI
33 | 
34 | #include "debug.h"
35 | #include "checks.h"
36 | #include "cudawrap.h"
37 | #include "alloc.h"
38 | #include "utils.h"
39 | #include "param.h"
40 | #include "nvtx.h"
41 | 
42 | #endif // end include guard
43 | 


--------------------------------------------------------------------------------
/src/include/cpuset.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CPUSET_H_
 8 | #define NCCL_CPUSET_H_
 9 | 
10 | // Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
11 | // Returns the value (0-15) of one hexadecimal digit, lower or upper case, or -1 if c is not a hex digit.
12 | static int hexToInt(char c) {
13 |   if (c >= '0' && c <= '9') return c - '0';
14 |   if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
15 |   // Uppercase digits used to be rejected, which made callers stop parsing early.
16 |   if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
17 |   return -1;
18 | }
19 | 
20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
21 | // Parses comma-separated 32-bit hex groups (most significant first) into mask, stopping at the first other character. Only the words covered by parsed groups are written, so zero *mask first. Returns ncclInvalidArgument if str has more groups than cpu_set_t holds.
22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
23 |   uint32_t cpumasks[CPU_SET_N_U32];
24 |   int m = CPU_SET_N_U32-1;  // groups are stored from the top of cpumasks downwards
25 |   cpumasks[m] = 0;
26 |   for (int o=0; str[o] != '\0'; o++) {
27 |     char c = str[o];
28 |     if (c == ',') {
29 |       if (m == 0) return ncclInvalidArgument;  // one more group would write below cpumasks[0]
30 |       cpumasks[--m] = 0;
31 |     } else {
32 |       int v = hexToInt(c);
33 |       if (v == -1) break;
34 |       cpumasks[m] <<= 4;
35 |       cpumasks[m] += v;
36 |     }
37 |   }
38 |   // Copy cpumasks to mask: the last parsed group (lowest bits) becomes word 0
39 |   for (int a=0; m<(int)CPU_SET_N_U32; a++,m++) {
40 |     memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
41 |   }
42 |   return ncclSuccess;
43 | }
44 | // Formats mask as hex, most significant byte first, without leading zero bytes and with ',' between 32-bit groups. No bound is checked on str.
45 | static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
46 |   const uint8_t* bytes = (const uint8_t*)mask;
47 |   int len = 0;  // characters emitted so far
48 |   for (int idx = (int)sizeof(cpu_set_t)-1; idx >= 0; idx--) {
49 |     // Nothing is printed until the first non-zero byte has been seen.
50 |     const bool leadingZero = (len == 0 && bytes[idx] == 0);
51 |     if (leadingZero) continue;
52 |     len += sprintf(str+len, "%02x", bytes[idx]);
53 |     // Byte 0 of a 32-bit word closes a group; every group but the last is followed by ','.
54 |     const bool endOfGroup = (idx != 0 && idx%4 == 0);
55 |     if (endOfGroup) str[len++] = ',';
56 |   }
57 |   str[len] = '\0';
58 |   return ncclSuccess;
59 | }
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/src/include/cudawrap.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #ifndef NCCL_CUDAWRAP_H_
  8 | #define NCCL_CUDAWRAP_H_
  9 | 
 10 | #include <cuda.h>
 11 | #include <cuda_runtime.h>
 12 | #include "checks.h"
 13 | 
 14 | // Is cuMem API usage enabled
 15 | extern int ncclCuMemEnable();
 16 | extern int ncclCuMemHostEnable();
 17 | 
 18 | #if CUDART_VERSION >= 11030
 19 | #include <cudaTypedefs.h>
 20 | 
 21 | // Handle type used for cuMemCreate()
 22 | extern CUmemAllocationHandleType ncclCuMemHandleType;
 23 | 
 24 | #endif
 25 | 
 26 | #define CUPFN(symbol) pfn_##symbol
 27 | 
 28 | // Check CUDA PFN driver calls: on failure, log a warning and return ncclUnhandledCudaError from the enclosing function.
 29 | #define CUCHECK(cmd) do {                                     \
 30 |     CUresult err = pfn_##cmd;                                 \
 31 |     if( err != CUDA_SUCCESS ) {                               \
 32 |       const char *errStr = nullptr; /* the lookup may fail or yield NULL */ \
 33 |       (void) pfn_cuGetErrorString(err, &errStr);              \
 34 |       WARN("Cuda failure %d '%s'", err, errStr ? errStr : "unknown error"); \
 35 |       return ncclUnhandledCudaError;                          \
 36 |     }                                                         \
 37 | } while(false)
 38 | // Same check as CUCHECK, but on failure stores ncclUnhandledCudaError in res and jumps to label instead of returning.
 39 | #define CUCHECKGOTO(cmd, res, label) do {                     \
 40 |     CUresult err = pfn_##cmd;                                 \
 41 |     if( err != CUDA_SUCCESS ) {                               \
 42 |       const char *errStr = nullptr; /* the lookup may fail or yield NULL */ \
 43 |       (void) pfn_cuGetErrorString(err, &errStr);              \
 44 |       WARN("Cuda failure %d '%s'", err, errStr ? errStr : "unknown error"); \
 45 |       res = ncclUnhandledCudaError;                           \
 46 |       goto label;                                             \
 47 |     }                                                         \
 48 | } while(false)
 49 | 
 50 | // Report failure but clear error and continue
 51 | #define CUCHECKIGNORE(cmd) do {						\
 52 |     CUresult err = pfn_##cmd;						\
 53 |     if( err != CUDA_SUCCESS ) {						\
 54 |       const char *errStr;						\
 55 |       (void) pfn_cuGetErrorString(err, &errStr);			\
 56 |       INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \
 57 |     }									\
 58 | } while(false)
 59 | // Thread-entry variant: on failure logs at INFO, stores ncclUnhandledCudaError in (args)->ret and returns args
 60 | #define CUCHECKTHREAD(cmd, args) do {                                   \
 61 |     CUresult err = pfn_##cmd;                                           \
 62 |     if (err != CUDA_SUCCESS) {                                          \
 63 |       INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
 64 |       (args)->ret = ncclUnhandledCudaError; /* parenthesised so any pointer expression is safe */ \
 65 |       return (args);                                                    \
 66 |     }                                                                   \
 67 | } while(false)
 68 | // Declares `extern PFN_<symbol> pfn_<symbol>`: the function-pointer variable that the CU* check macros call through.
 69 | #define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
 70 | 
 71 | #if CUDART_VERSION >= 11030
 72 | /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
 73 | DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
 74 | DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
 75 | DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
 76 | DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
 77 | DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
 78 | DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
 79 | DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
 80 | DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
 81 | DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
 82 | DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
 83 | DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
 84 | DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
 85 | #if CUDART_VERSION >= 11080
 86 | DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
 87 | #endif
 88 | // cuMem API support
 89 | DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
 90 | DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
 91 | DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
 92 | DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
 93 | DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
 94 | DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
 95 | DECLARE_CUDA_PFN_EXTERN(cuMemMap);
 96 | DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
 97 | DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
 98 | DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
 99 | DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
100 | DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
101 | #if CUDA_VERSION >= 11070  // NOTE(review): this guard and the next test CUDA_VERSION, the ones above test CUDART_VERSION
102 | DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
103 | #endif
104 | #if CUDA_VERSION >= 12010
105 | /* NVSwitch Multicast support */
106 | DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
107 | DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
108 | DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
109 | DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
110 | DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
111 | DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
112 | #endif
113 | #endif
114 | // Library initialisation entry point and the globals it owns; all are defined outside this header.
115 | ncclResult_t ncclCudaLibraryInit(void);
116 | 
117 | extern int ncclCudaDriverVersionCache;  // -1 means "not queried yet"; filled lazily by ncclCudaDriverVersion()
118 | extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
119 | // Writes the CUDA driver version to *driver, asking the runtime only while the cached value is still -1.
120 | inline ncclResult_t ncclCudaDriverVersion(int* driver) {
121 |   int cached = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
122 |   if (cached != -1) { *driver = cached; return ncclSuccess; }
123 |   // Cache miss: query once, then publish the answer for later callers (relaxed ordering on both sides).
124 |   CUDACHECK(cudaDriverGetVersion(&cached));
125 |   __atomic_store_n(&ncclCudaDriverVersionCache, cached, __ATOMIC_RELAXED);
126 |   *driver = cached;
127 |   return ncclSuccess;
128 | }
129 | #endif
130 | 


--------------------------------------------------------------------------------
/src/include/debug.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INT_DEBUG_H_
 8 | #define NCCL_INT_DEBUG_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_common.h"
12 | #include <stdio.h>
13 | #include <pthread.h>
14 | 
15 | // Conform to pthread and NVTX standard
16 | #define NCCL_THREAD_NAMELEN 16
17 | // Global debug state; defined outside this header.
18 | extern int ncclDebugLevel;
19 | extern FILE *ncclDebugFile;
20 | // Logging entry point behind the macros below; fmt (argument 5) and its varargs are compiler-checked as a printf format.
21 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
22 | 
23 | // Let code temporarily downgrade WARN into INFO
24 | extern thread_local int ncclDebugNoWarn;
25 | extern char ncclLastError[];
26 | // Logging shorthands over ncclDebugLog. VERSION and WARN pass __FILE__ as `filefunc`; INFO, TRACE_CALL and TRACE pass __func__.
27 | #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
28 | #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
29 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
30 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
31 | // TRACE expands to nothing (its arguments are not evaluated) unless ENABLE_TRACE is defined at build time.
32 | #ifdef ENABLE_TRACE
33 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
34 | #else
35 | #define TRACE(...)
36 | #endif
37 | // Takes a printf-style name for `thread`; the format attribute makes the compiler check fmt (argument 2) against its varargs.
38 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...) __attribute__ ((format (printf, 2, 3)));
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/src/include/enqueue.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ENQUEUE_H_
 8 | #define NCCL_ENQUEUE_H_
 9 | 
10 | #include "comm.h"
11 | #include "group.h"
12 | #include "collectives.h"
13 | #include "utils.h"
14 | // Alignment constants (units are presumably bytes - confirm at the use sites).
15 | #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t)
16 | #define NCCL_LL128_ALIGNMENT_PER_WARP 480
17 | #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
18 | #define NCCL_BYTES_ALIGNMENT 16
19 | // Enqueue/launch entry points. Each reports status through ncclResult_t; the implementations are not in this header.
20 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
21 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
22 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
23 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
24 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
25 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
26 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
27 | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
28 | 
29 | #endif // End include guard
30 | 


--------------------------------------------------------------------------------
/src/include/ibvsymbols.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_IBV_SYMBOLS_H_
 2 | #define NCCL_IBV_SYMBOLS_H_
 3 | 
 4 | #ifdef NCCL_BUILD_RDMA_CORE
 5 | #include <infiniband/verbs.h>
 6 | #else
 7 | #include "ibvcore.h"
 8 | #endif
 9 | 
10 | #include "nccl.h"
11 | 
12 | /* IB Verbs function pointers: one ibv_internal_<name> slot per verbs entry point used by the wrappers */
13 | struct ncclIbvSymbols {
14 |   int (*ibv_internal_fork_init)(void);
15 |   struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
16 |   void (*ibv_internal_free_device_list)(struct ibv_device **list);
17 |   const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
18 |   struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
19 |   int (*ibv_internal_close_device)(struct ibv_context *context);
20 |   int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
21 |   void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
22 |   int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
23 |   int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
24 |   int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
25 |   int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
26 |   struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
27 |   int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
28 |   struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
29 |   struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); /* note: access is unsigned here, int in reg_mr */
30 |   /* DMA-BUF support */
31 |   struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
32 |   int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
33 |   struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
34 |   int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
35 |   struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
36 |   int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
37 |   int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
38 |   const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
39 |   int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); /* ECE query/set pair */
40 |   int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
41 | };
42 | 
43 | /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode; fills *ibvSymbols and reports status as ncclResult_t */
44 | ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
45 | 
46 | #endif  // NCCL_IBV_SYMBOLS_H_
47 | 


--------------------------------------------------------------------------------
/src/include/ibvwrap.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 3 |  * Copyright (c) 2004, 2011-2012 Intel Corporation.  All rights reserved.
 4 |  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
 5 |  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
 6 |  *
 7 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 8 |  *
 9 |  * See LICENSE.txt for license information
10 |  ************************************************************************/
11 | 
12 | #ifndef NCCL_IBVWRAP_H_
13 | #define NCCL_IBVWRAP_H_
14 | 
15 | #ifdef NCCL_BUILD_RDMA_CORE
16 | #include <infiniband/verbs.h>
17 | #else
18 | #include "ibvcore.h"
19 | #endif
20 | 
21 | #include "core.h"
22 | #include <sys/types.h>
23 | #include <unistd.h>
24 | // Success code that the inline post_send/post_recv wrappers in this header compare provider return values against.
25 | typedef enum ibv_return_enum
26 | {
27 |     IBV_SUCCESS = 0,                   //!< The operation was successful
28 | } ibv_return_t;
29 | 
30 | ncclResult_t wrap_ibv_symbols(void); // presumably resolves the verbs symbol table before the wrappers are used - confirm in the implementation
31 | /* NCCL wrappers of IB verbs functions: most return ncclResult_t and hand results back through a leading `ret` out-parameter */
32 | ncclResult_t wrap_ibv_fork_init(void);
33 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
34 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
35 | const char *wrap_ibv_get_device_name(struct ibv_device *device);
36 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
37 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
38 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
39 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
40 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
41 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
42 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
43 | ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
44 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
45 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
46 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
47 | struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); // "direct" variants return the raw ibv_mr pointer instead of ncclResult_t
48 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); // NOTE(review): the ncclIbvSymbols slot takes `unsigned int access`
49 | /* DMA-BUF support */
50 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
51 | struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
52 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
53 | ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
54 | ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
55 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
56 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
57 | static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) {
58 |   const int polled = cq->context->ops.poll_cq(cq, num_entries, wc); /* number of completions (may be 0) on success, negative on failure */
59 |   if (polled >= 0) {
60 |     *num_done = polled;
61 |     return ncclSuccess;
62 |   }
63 |   WARN("Call to ibv_poll_cq() returned %d", polled);
64 |   return ncclSystemError;
65 | }
66 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); // queue-pair lifecycle wrappers
67 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
68 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
69 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); // `supported` is an extra out-parameter; presumably flags whether ECE is available - confirm
70 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
71 | // Posts the send WR list `wr` via the provider hook; a non-zero result is logged and mapped to ncclSystemError.
72 | static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
73 |   int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
74 |   if (ret != IBV_SUCCESS) { // on failure *bad_wr is the WR that was rejected, while wr is the head of the posted list
75 |     WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), (void*)*bad_wr, (void*)wr);
76 |     return ncclSystemError;
77 |   }
78 |   return ncclSuccess;
79 | }
80 | // Posts the receive WR list `wr` via the provider hook; a non-zero result is logged and mapped to ncclSystemError.
81 | static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
82 |   const int rc = qp->context->ops.post_recv(qp, wr, bad_wr); /* 0 on success, otherwise an errno value naming the failure */
83 |   if (rc == IBV_SUCCESS) {
84 |     return ncclSuccess;
85 |   }
86 |   WARN("ibv_post_recv() failed with error %s", strerror(rc));
87 |   return ncclSystemError;
88 | }
89 | // Hands back the textual name of `event` through *ret (note: non-const char**, unlike the const char* in ncclIbvSymbols).
90 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
91 | 
92 | #endif //End include guard
93 | 


--------------------------------------------------------------------------------
/src/include/info.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INFO_H_
 8 | #define NCCL_INFO_H_
 9 | 
10 | #include "nccl.h"
11 | #include "collectives.h"
12 | #include "core.h"
13 | #include "utils.h"
14 | 
15 | // Used to pass NCCL call information between functions
16 | struct ncclInfo {
17 |   ncclFunc_t coll;  // which operation this call is
18 |   const char* opName;  // presumably a printable operation name for logs - confirm at the call sites
19 |   // NCCL Coll Args
20 |   const void* sendbuff;
21 |   void* recvbuff;
22 |   size_t count;
23 |   ncclDataType_t datatype;
24 |   ncclRedOp_t op;
25 |   int root; // peer for p2p operations
26 |   ncclComm_t comm;
27 |   cudaStream_t stream;
28 |   // Algorithm details
29 |   int chunkSteps;
30 |   int sliceSteps;
31 | };
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/src/include/ipcsocket.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See COPYRIGHT for license information
 5 |  */
 6 | 
 7 | #ifndef NCCL_IPCSOCKET_H
 8 | #define NCCL_IPCSOCKET_H
 9 | 
10 | #include "nccl.h"
11 | #include <stdio.h>
12 | #include <fcntl.h>
13 | #include <sys/mman.h>
14 | #include <unistd.h>
15 | #include <errno.h>
16 | #include <sys/wait.h>
17 | #include <sys/types.h>
18 | #include <sys/socket.h>
19 | #include <memory.h>
20 | #include <sys/un.h>
21 | #include <inttypes.h>
22 | 
23 | #define NCCL_IPC_SOCKNAME_LEN 64  // size in bytes of ncclIpcSocket::socketName
24 | 
25 | struct ncclIpcSocket {  // state for one IPC socket endpoint
26 |   int fd;
27 |   char socketName[NCCL_IPC_SOCKNAME_LEN];
28 |   volatile uint32_t* abortFlag;  // caller-owned flag supplied to ncclIpcSocketInit; presumably polled to abort blocking calls - confirm
29 | };
30 | 
31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); // set up *handle; rank/hash presumably identify the endpoint - confirm
32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
33 | ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
34 | // File-descriptor passing: the receiver writes the descriptor to *fd, the sender addresses its peer by rank and hash.
35 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
36 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
37 | // Message variants: take a caller-supplied header buffer (hdr, hdrLen) in addition to the file descriptor.
38 | ncclResult_t ncclIpcSocketSendMsg(struct ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash);
39 | ncclResult_t ncclIpcSocketRecvMsg(struct ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd);
40 | 
41 | #endif /* NCCL_IPCSOCKET_H */
42 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_kernel.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_KERNEL_H_
 7 | #define MSCCL_KERNEL_H_
 8 | 
 9 | #define MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto) mscclKernel_##devredop##_##type##_##proto  // e.g. (Sum, float, LL) -> mscclKernel_Sum_float_LL
10 | // Declares one __global__ kernel entry for a (reduction op, element type, protocol) triple; note the expansion ends in ';'.
11 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, proto) \
12 | __global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork work);
13 | // Fans one (reduction op, element type) pair out over the three protocols LL, LL128 and Simple.
14 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type) \
15 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL) \
16 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL128) \
17 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, Simple)
18 | 
19 | #if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__)  // integer + half/float/double + bfloat16 + both fp8 types
20 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop) \
21 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \
22 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \
23 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \
24 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \
25 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \
26 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) \
27 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half) \
28 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float) \
29 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double) \
30 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_bfloat16) \
31 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_fp8_e4m3) \
32 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_fp8_e5m2)
33 | #elif defined(__CUDA_BF16_TYPES_EXIST__)  // same list without the two fp8 types
34 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop) \
35 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \
36 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \
37 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \
38 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \
39 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \
40 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) \
41 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half) \
42 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float) \
43 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double) \
44 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_bfloat16)
45 | #else  // neither bfloat16 nor fp8: integer types plus half/float/double only
46 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop) \
47 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \
48 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \
49 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \
50 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \
51 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \
52 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) \
53 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half) \
54 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float) \
55 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double)
56 | #endif
57 | // Integer-only type list, used below for SumPostDiv.
58 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(devredop) \
59 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \
60 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \
61 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \
62 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \
63 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \
64 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t)
65 | // Top level: every reduction op crossed with its type list (and, through the _TYPE macro, with every protocol).
66 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC() \
67 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum) \
68 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod) \
69 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(MinMax) \
70 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum) \
71 |   MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv)
72 | // Emit all kernel prototypes; no trailing ';' is needed because each expanded declaration already ends in one.
73 | MSCCL_DECL_KERNEL_ENTRY_FUNC()
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_lifecycle.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_LIFECYCLE_H_
 7 | #define MSCCL_LIFECYCLE_H_
 8 | 
 9 | #include "enqueue.h"
10 | 
11 | #include "msccl/msccl_struct.h"
12 | 
13 | bool mscclEnabled();
14 | // Caller-flag helpers: set, clear and query (semantics live in the implementation, not in this header).
15 | void mscclSetIsCallerFlag();
16 | void mscclClearIsCallerFlag();
17 | bool mscclIsCaller();
18 | 
19 | bool mscclAvailable();
20 | // NOTE(review): the default is passed as int64_t but the result is int, so a wide default would be narrowed.
21 | int getEnvInt(const char* env, int64_t deftVal);
22 | 
23 | ncclResult_t mscclSchedulerInit(ncclComm_t comm, int* numChannelsRequired);
24 | 
25 | ncclResult_t mscclInit(ncclComm_t comm);
26 | 
27 | ncclResult_t mscclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
28 | 
29 | ncclResult_t mscclGroupStart();
30 | // Single entry point covering every mscclFunc_t; presumably only the arguments relevant to mscclFunc are read - confirm.
31 | ncclResult_t mscclEnqueueCheck(
32 |     const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
33 |     void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
34 |     size_t count, ncclDataType_t datatype, int root, int peer, ncclRedOp_t op,
35 |     mscclFunc_t mscclFunc, ncclComm_t comm, cudaStream_t stream);
36 | 
37 | ncclResult_t mscclGroupEnd();
38 | 
39 | ncclResult_t mscclTeardown();
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_parser.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  4 |  * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #ifndef MSCCL_PARSER_H_
 10 | #define MSCCL_PARSER_H_
 11 | 
 12 | #include "nccl.h"
 13 | #include "debug.h"
 14 | #include "checks.h"
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | #include "msccl/msccl_struct.h"
 18 | 
 19 | // A few constraints to make the implementation easy (they size the fixed arrays in mscclXmlNode and mscclXml)
 20 | #define MAX_STR_LEN 255
 21 | #define MAX_ATTR_COUNT 16
 22 | #define MAX_SUBS 1024
 23 | #define MAX_NODES 4096
 24 | // Node kinds; presumably the values stored in mscclXmlNode::type - confirm in the parser implementation.
 25 | #define NODE_TYPE_NONE 0
 26 | #define NODE_TYPE_OPEN 1
 27 | #define NODE_TYPE_CLOSE 2
 28 | #define NODE_TYPE_SINGLE 3
 29 | 
 30 | struct mscclXmlNode {  // one XML element: tag name, key/value attributes and tree links, all in fixed-size storage
 31 |   char name[MAX_STR_LEN+1];
 32 |   struct {
 33 |     char key[MAX_STR_LEN+1];
 34 |     char value[MAX_STR_LEN+1];
 35 |   } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
 36 |   int nAttrs;  // number of attrs[] entries in use; the bound mscclXmlGetAttrIndex scans up to
 37 |   int type;  // presumably one of the NODE_TYPE_* values - confirm in the parser
 38 |   struct mscclXmlNode* parent;
 39 |   struct mscclXmlNode* subs[MAX_SUBS];
 40 |   int nSubs;  // presumably the number of subs[] entries in use - confirm in the parser
 41 | };
 42 | 
 43 | struct mscclXml {  // flat pool holding every node of one document
 44 |   struct mscclXmlNode nodes[MAX_NODES];
 45 |   int maxIndex;  // exclusive upper bound used when scanning nodes[] (see mscclXmlFindTag)
 46 | };
 47 | // Stores in *index the position of the first attribute whose key equals attrName, or -1 if none; always returns ncclSuccess.
 48 | static ncclResult_t mscclXmlGetAttrIndex(struct mscclXmlNode* node, const char* attrName, int* index) {
 49 |   *index = -1;
 50 |   const int count = node->nAttrs;
 51 |   int pos = 0;
 52 |   // Walk forward until a key matches (compared over at most MAX_STR_LEN characters) or the attributes run out.
 53 |   while (pos < count && strncmp(node->attrs[pos].key, attrName, MAX_STR_LEN) != 0) {
 54 |     pos++;
 55 |   }
 56 |   if (pos < count) *index = pos;
 57 |   return ncclSuccess;
 58 | }
 59 | // Points *value at the stored value of attribute attrName, or sets it to NULL when the node has no such attribute.
 60 | static ncclResult_t mscclXmlGetAttr(struct mscclXmlNode* node, const char* attrName, const char** value) {
 61 |   int attrIdx = -1;
 62 |   NCCLCHECK(mscclXmlGetAttrIndex(node, attrName, &attrIdx));
 63 |   if (attrIdx != -1) *value = node->attrs[attrIdx].value; else *value = NULL;
 64 |   return ncclSuccess;
 65 | }
 66 | // Strict lookup: like mscclXmlGetAttr, but a missing attribute is logged and reported as ncclInternalError.
 67 | static ncclResult_t mscclXmlGetAttrStr(struct mscclXmlNode* node, const char* attrName, const char** value) {
 68 |   NCCLCHECK(mscclXmlGetAttr(node, attrName, value));
 69 |   if (*value != NULL) {
 70 |     return ncclSuccess;
 71 |   }
 72 |   WARN("Attribute %s of node %s not found", attrName, node->name);
 73 |   return ncclInternalError;
 74 | }
 75 | static ncclResult_t mscclXmlGetAttrInt(struct mscclXmlNode* node, const char* attrName, int* value) { // required attribute parsed as int
 76 |   const char* text = NULL;
 77 |   NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &text));
 78 |   *value = (int)strtol(text, /*endptr=*/NULL, /*base=*/0); // base 0 lets strtol pick the radix from the prefix; no error checking
 79 |   return ncclSuccess;
 80 | }
 81 | 
 82 | static ncclResult_t mscclXmlGetAttrInt64(struct mscclXmlNode* node, const char* attrName, int64_t* value) {
 83 |   const char* str;
 84 |   NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &str));
 85 |   *value = strtoll(str, NULL, 0);
 86 |   return ncclSuccess;
 87 | }
 88 | // Sets *node to the first entry in xml->nodes[0..maxIndex) named tagName, or NULL if none; always returns ncclSuccess.
 89 | static ncclResult_t mscclXmlFindTag(struct mscclXml* xml, const char* tagName, struct mscclXmlNode** node) {
 90 |   *node = NULL;
 91 |   int idx = 0;
 92 |   while (idx < xml->maxIndex) {
 93 |     struct mscclXmlNode* candidate = &xml->nodes[idx++];
 94 |     if (strcmp(candidate->name, tagName) != 0) continue;
 95 |     *node = candidate;  // first match wins
 96 |     break;
 97 |   }
 98 |   return ncclSuccess;
 99 | }
100 | // File-level loaders (implemented elsewhere): both take the path of an XML graph file and fill the given output struct.
101 | ncclResult_t mscclGetAlgoFromXmlFile(const char* xmlGraphFile, struct mscclAlgo* algo, int rank);
102 | 
103 | ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* xmlGraphFile, struct mscclAlgoMeta* algoMeta);
104 | 
105 | #endif
106 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_scheduler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_SCHEDULER_H_
 7 | #define MSCCL_SCHEDULER_H_
 8 | 
 9 | typedef enum { mscclFuncReduce             =  0,  // operation identifiers, numbered densely from 0
10 |                mscclFuncBroadcast          =  1,
11 |                mscclFuncAllReduce          =  2,
12 |                mscclFuncReduceScatter      =  3,
13 |                mscclFuncAllGather          =  4,
14 |                mscclFuncSend               =  5,
15 |                mscclFuncRecv               =  6,
16 |                mscclFuncGather             =  7,
17 |                mscclFuncScatter            =  8,
18 |                mscclFuncAllToAll           =  9,
19 |                mscclFuncAllToAllv          =  10,
20 |                mscclNumFuncs               =  11 } mscclFunc_t;  // count of the identifiers above, not an operation itself
21 | 
22 | struct mscclSchedulerParam {  // argument bundle handed to mscclSchedulerInterface::selectAlgo
23 |   const void* sendBuff;
24 |   const size_t* sendCounts;
25 |   const size_t* sDisPls;
26 |   void* recvBuff;
27 |   const size_t* recvCounts;
28 |   const size_t* rDisPls;
29 |   size_t count;
30 |   ncclDataType_t dataType;
31 |   int root;
32 |   int peer;
33 |   ncclRedOp_t op;
34 |   mscclFunc_t func;
35 |   int rank;
36 |   int nRanks;
37 |   bool scheduled;  // presumably an output: whether an algorithm was selected - confirm against the scheduler
38 |   mscclAlgoHandle_t handle;  // presumably an output: the selected algorithm - confirm against the scheduler
39 | };
40 | 
41 | typedef struct {  // table of callbacks that a scheduler implementation provides
42 |   // Name of the scheduler (mainly for logs)
43 |   const char* name;
44 |   // Load all algorithms
45 |   ncclResult_t (*init)();
46 |   // Select an algorithm
47 |   ncclResult_t (*selectAlgo)(struct mscclSchedulerParam* param);
48 |   // Unload all algorithms
49 |   ncclResult_t (*teardown)();
50 | } mscclSchedulerInterface;
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_setup.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_SETUP_H_
 7 | #define MSCCL_SETUP_H_
 8 | 
 9 | #include "comm.h"
10 | #include "msccl/msccl_struct.h"
11 | 
12 | ncclResult_t mscclGetCaptureStatus(cudaStream_t stream);
13 | // Setup steps (implemented elsewhere). Each reports status through ncclResult_t; call order is not defined by this header.
14 | ncclResult_t mscclSetupScratch(struct mscclAlgo* hostAlgo, cudaStream_t stream);
15 | 
16 | ncclResult_t mscclSetupSyncFlags(cudaStream_t stream);
17 | 
18 | ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm);
19 | 
20 | ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType);
21 | 
22 | ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, cudaStream_t stream);
23 | // Takes both the host-side and the device-side algorithm descriptors along with the collective's arguments.
24 | ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count,
25 |     ncclDataType_t dataType, ncclRedOp_t op, struct mscclAlgo* hostAlgo, struct mscclAlgo* devAlgo,
26 |     ncclComm_t comm, cudaStream_t stream);
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_status.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_STATUS_H_
 7 | #define MSCCL_STATUS_H_
 8 | 
 9 | #include "msccl/msccl_struct.h"
10 | 
11 | mscclStatus& mscclGetStatus();
12 | 
13 | mscclSavedProxyArgs& mscclGetSavedProxyArgs();
14 | 
15 | mscclThreadLocalStatus& mscclGetThreadLocalStatus();
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/src/include/nccl_common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_DEBUG_H_
 8 | #define NCCL_DEBUG_H_
 9 | 
10 | typedef enum {
11 |   NCCL_LOG_NONE = 0,
12 |   NCCL_LOG_VERSION = 1,
13 |   NCCL_LOG_WARN = 2,
14 |   NCCL_LOG_INFO = 3,
15 |   NCCL_LOG_ABORT = 4,
16 |   NCCL_LOG_TRACE = 5
17 | } ncclDebugLogLevel;
18 | 
19 | typedef enum {
20 |   NCCL_INIT = 0x1,
21 |   NCCL_COLL = 0x2,
22 |   NCCL_P2P = 0x4,
23 |   NCCL_SHM = 0x8,
24 |   NCCL_NET = 0x10,
25 |   NCCL_GRAPH = 0x20,
26 |   NCCL_TUNING = 0x40,
27 |   NCCL_ENV = 0x80,
28 |   NCCL_ALLOC = 0x100,
29 |   NCCL_CALL = 0x200,
30 |   NCCL_PROXY = 0x400,
31 |   NCCL_NVLS = 0x800,
32 |   NCCL_BOOTSTRAP = 0x1000,
33 |   NCCL_REG = 0x2000,
34 |   NCCL_PROFILE = 0x4000,
35 |   NCCL_ALL = ~0
36 | } ncclDebugLogSubSys;
37 | 
38 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
39 | 
40 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
41 | typedef enum {
42 |   ncclFuncBroadcast = 0,
43 |   ncclFuncReduce = 1,
44 |   ncclFuncAllGather = 2,
45 |   ncclFuncReduceScatter = 3,
46 |   ncclFuncAllReduce = 4,
47 |   ncclFuncSendRecv = 5,
48 |   ncclFuncSend = 6,
49 |   ncclFuncRecv = 7,
50 |   ncclNumFuncs = 8
51 | } ncclFunc_t;
52 | 
53 | #define NCCL_NUM_ALGORITHMS 7 // Tree, Ring, CollNet (Direct/Chain), NVLS, NVLS Tree, PAT
54 | #define NCCL_ALGO_UNDEF -1
55 | #define NCCL_ALGO_TREE 0
56 | #define NCCL_ALGO_RING 1
57 | #define NCCL_ALGO_COLLNET_DIRECT 2
58 | #define NCCL_ALGO_COLLNET_CHAIN 3
59 | #define NCCL_ALGO_NVLS 4
60 | #define NCCL_ALGO_NVLS_TREE 5
61 | #define NCCL_ALGO_PAT 6
62 | 
63 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
64 | #define NCCL_PROTO_UNDEF -1
65 | #define NCCL_PROTO_LL 0
66 | #define NCCL_PROTO_LL128 1
67 | #define NCCL_PROTO_SIMPLE 2
68 | 
69 | #define NCCL_ALGO_PROTO_IGNORE -1.0
70 | #endif
71 | 


--------------------------------------------------------------------------------
/src/include/nccl_tuner.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
  4 |  *
  5 |  * See LICENSE.txt for license information
  6 |  ************************************************************************/
  7 | 
  8 | #ifndef NCCL_TUNER_H_
  9 | #define NCCL_TUNER_H_
 10 | 
 11 | #include "nccl.h"
 12 | #include "nccl_common.h"
 13 | 
 14 | // API to be implemented by external tuner
 15 | typedef struct {
 16 |   // Name of the tuner
 17 |   const char* name;
 18 | 
 19 |   // Initializes tuner states.
 20 |   // Inputs:
 21 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
 22 |   //   - nNodes: number of nodes in current communicator.
 23 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
 24 |   // Outputs:
 25 |   //   - context: tuner context object
 26 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
 27 | 
 28 |   // Gets info (algo, protocol, number of ctas and threads) for a given collective.
 29 |   // Inputs:
 30 |   //   - context: tuner context object
 31 |   //   - collType: collective type, e.g., allreduce, allgather…
 32 |   //   - nBytes: collective size in bytes
 33 |   //   - numPipeOps: number of operations in the group
 34 |   //   - numAlgo: number of algorithms in collCostTable
 35 |   //   - numProto: number of protocols in collCostTable
 36 |   //
 37 |   // Outputs:
 38 |   //   - nChannels: number of channels (hence SMs) to be used.
 39 |   //
 40 |   // InOut:
 41 |   //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
 42 |   //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
 43 |   //
 44 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
 45 |   // default tuning for the given collective.
 46 |   // Also, the plugin is allowed to not set any output, or set only the
 47 |   // algorithm and protocol, but not only the algorithm or only the protocol.
 48 |   // Unset fields will be set automatically by NCCL.
 49 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
 50 |                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
 51 |                               int* nChannels);
 52 | 
 53 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
 54 |   // context: tuner context object
 55 |   ncclResult_t (*destroy)(void* context);
 56 | } ncclTuner_v3_t;
 57 | 
 58 | typedef ncclTuner_v3_t ncclTuner_t;
 59 | 
 60 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
 61 | 
 62 | // Previous (v2) API to be implemented by external tuner; ncclTuner_t is aliased to the v3 interface
 63 | typedef struct {
 64 |   // Name of the tuner
 65 |   const char* name;
 66 | 
 67 |   // Initializes tuner states.
 68 |   // Inputs:
 69 |   //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
 70 |   //   - nNodes: number of nodes in current communicator.
 71 |   //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
 72 |   // Outputs:
 73 |   //   - context: tuner context object
 74 |   ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
 75 | 
 76 |   // Gets info (algo, protocol, number of ctas and threads) for a given collective.
 77 |   // Inputs:
 78 |   //   - context: tuner context object
 79 |   //   - collType: collective type, e.g., allreduce, allgather…
 80 |   //   - nBytes: collective size in bytes
 81 |   //   - collNetSupport: whether collnet supports this type
 82 |   //   - nvlsSupport: whether nvlink sharp supports this type
 83 |   //   - numPipeOps: number of operations in the group
 84 |   //
 85 |   // Outputs:
 86 |   //   - algorithm: selected algorithm to be used for the given collective
 87 |   //   - protocol: selected protocol to be used for the given collective
 88 |   //   - nChannels: number of channels (hence SMs) to be used.
 89 |   //
 90 |   // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
 91 |   // default tuning for the given collective.
 92 |   // Also, the plugin is allowed to not set any output, or set only the
 93 |   // algorithm and protocol, but not only the algorithm or only the protocol.
 94 |   // Unset fields will be set automatically by NCCL.
 95 |   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
 96 |                               int collNetSupport, int nvlsSupport, int numPipeOps,
 97 |                               int* algorithm, int* protocol, int* nChannels);
 98 | 
 99 |   // Terminates the plugin and cleans up any resources that the plugin allocated.
100 |   // context: tuner context object
101 |   ncclResult_t (*destroy)(void* context);
102 | } ncclTuner_v2_t;
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/src/include/net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INT_NET_H_
 8 | #define NCCL_INT_NET_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_net.h"
12 | #include "comm.h"
13 | #include "checks.h"
14 | 
15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
16 | 
17 | ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
18 | ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
19 | ncclResult_t ncclNetInit(struct ncclComm* comm);
20 | ncclResult_t ncclNetFinalize(struct ncclComm* comm);
21 | int ncclNetVersion(struct ncclComm* comm);
22 | 
23 | // Test whether the current GPU support GPU Direct RDMA.
24 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
25 | 
26 | extern ncclNet_t ncclNetIb;
27 | extern ncclNet_t ncclNetSocket;
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/src/include/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_NET_DEVICE_H_
 8 | #define NCCL_NET_DEVICE_H_
 9 | 
10 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
11 | #define NCCL_NET_MTU_SIZE                    4096
12 | 
13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
16 | 
17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
18 | 
19 | typedef struct {
20 |   ncclNetDeviceType netDeviceType; // Network offload type
21 |   int netDeviceVersion;            // Version number for network offload
22 |   void* handle;
23 |   size_t size;
24 |   int needsProxyProgress;
25 | } ncclNetDeviceHandle_v7_t;
26 | 
27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/include/npkit/npkit.h:
--------------------------------------------------------------------------------
 1 | #ifndef NPKIT_H_
 2 | #define NPKIT_H_
 3 | 
 4 | #include <string>
 5 | #include <thread>
 6 | 
 7 | #include <cuda_runtime.h>
 8 | 
 9 | #include "npkit/npkit_event.h"
10 | #include "npkit/npkit_struct.h"
11 | 
12 | class NpKit {  // All members are static; the class is used as a namespace-like holder
13 |  public:
14 |   static const uint64_t kNumGpuEventBuffers = 512;
15 | 
16 |   static const uint64_t kNumCpuEventBuffers = 32;
17 | 
18 |   static ncclResult_t Init(int rank);
19 | 
20 |   static ncclResult_t Dump(const std::string& dump_dir);
21 | 
22 |   static ncclResult_t Shutdown();
23 | 
24 |   static NpKitEventCollectContext* GetGpuEventCollectContexts();
25 | 
26 |   static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp,
27 |                                                 NpKitEventCollectContext* ctx) {
28 |     uint64_t event_buffer_head = ctx->event_buffer_head;  // index of the next free slot in ctx->event_buffer
29 |     if (event_buffer_head < kMaxNumGpuEventsPerBuffer) {  // once the buffer is full, further events are silently dropped
30 |       NpKitEvent& event = ctx->event_buffer[event_buffer_head];
31 |       event.fields.type = type;
32 |       event.fields.size = size;
33 |       event.fields.rsvd = rsvd;
34 |       event.fields.timestamp = timestamp;
35 |       ctx->event_buffer_head++;  // advance past the slot just filled (plain, non-atomic increment)
36 |     }
37 |   }
38 | 
39 |   static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);
40 | 
41 |   static uint64_t* GetCpuTimestamp();
42 | 
43 |  private:
44 |   static void CpuTimestampUpdateThread();
45 | 
46 |   // 64K * 512 * 16B = 512MB per GPU
47 |   static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16;
48 | 
49 |   // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
50 |   static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
51 | 
52 |   static NpKitEvent** gpu_event_buffers_;
53 |   static NpKitEvent** cpu_event_buffers_;
54 | 
55 |   static NpKitEventCollectContext* gpu_collect_contexts_;
56 |   static NpKitEventCollectContext* cpu_collect_contexts_;
57 |   static uint64_t* cpu_timestamp_;
58 | 
59 |   static uint64_t rank_;
60 | 
61 |   static std::thread* cpu_timestamp_update_thread_;
62 |   static volatile bool cpu_timestamp_update_thread_should_stop_;
63 | };
64 | 
65 | #endif
66 | 


--------------------------------------------------------------------------------
/src/include/npkit/npkit_struct.h:
--------------------------------------------------------------------------------
 1 | #ifndef NPKIT_STRUCT_H_
 2 | #define NPKIT_STRUCT_H_
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | #pragma pack(push, 1)
 7 | 
 8 | union NpKitEvent {
 9 |   uint64_t bits[2];
10 |   struct {
11 |     uint64_t type : 8;
12 |     uint64_t size : 32;
13 |     uint64_t rsvd : 24;
14 |     uint64_t timestamp;
15 |   } fields;
16 | };
17 | 
18 | struct NpKitEventCollectContext {
19 |   NpKitEvent* event_buffer;
20 |   uint64_t event_buffer_head;
21 | };
22 | 
23 | #pragma pack(pop)
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/src/include/nvtx.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_NVTX_H_
 8 | #define NCCL_NVTX_H_
 9 | 
10 | #include "nvtx3/nvtx3.hpp"
11 | 
12 | #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14)
13 | #define NVTX3_CONSTEXPR_IF_CPP14 constexpr
14 | #else
15 | #define NVTX3_CONSTEXPR_IF_CPP14
16 | #endif
17 | 
18 | // Define all NCCL-provided static schema IDs here (avoid duplicates).
19 | #define NVTX_SID_CommInitRank         0
20 | #define NVTX_SID_CommInitAll          1
21 | #define NVTX_SID_CommDestroy          2 // same schema as NVTX_SID_CommInitRank
22 | #define NVTX_SID_CommAbort            3 // same schema as NVTX_SID_CommInitRank
23 | #define NVTX_SID_AllGather            4
24 | #define NVTX_SID_AllReduce            5
25 | #define NVTX_SID_Broadcast            6
26 | #define NVTX_SID_ReduceScatter        7
27 | #define NVTX_SID_Reduce               8
28 | #define NVTX_SID_Send                 9
29 | #define NVTX_SID_Recv                 10
30 | #define NVTX_SID_CommInitRankConfig   11 // same schema as NVTX_SID_CommInitRank
31 | #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank
32 | #define NVTX_SID_CommSplit            13
33 | 
34 | // Define static schema ID for the reduction operation (parenthesized so it expands safely inside larger expressions).
35 | #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP (14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START)
36 | 
37 | extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
38 | 
39 | struct nccl_domain{static constexpr char const* name{"NCCL"};};
40 | 
41 | class payload_schema {
42 |  public:
43 |   explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
44 |   {
45 |     schema_attr.name = schemaName;
46 |     schema_attr.entries = entries;
47 |     schema_attr.numEntries = numEntries;
48 |     schema_attr.schemaId = schemaId;
49 |     nvtxPayloadSchemaRegister(nvtx3::domain::get<nccl_domain>(), &schema_attr);
50 |   }
51 | 
52 |   payload_schema() = delete;
53 |   ~payload_schema() = default;
54 |   payload_schema(payload_schema const&) = default;
55 |   payload_schema& operator=(payload_schema const&) = default;
56 |   payload_schema(payload_schema&&) = default;
57 |   payload_schema& operator=(payload_schema&&) = default;
58 | 
59 |  private:
60 |   nvtxPayloadSchemaAttr_t schema_attr{
61 |     NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
62 |     NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES |
63 |     NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
64 |     NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE |
65 |     NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
66 |     nullptr,
67 |     NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
68 |     NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
69 |     nullptr, 0, 0, 0, 0, nullptr};
70 | };
71 | 
72 | // Create NVTX push/pop range with parameters
73 | // @param ID name of the operation (see `NVTX_SID_*`); its stringified form
74 | //           (#ID) is also passed as the schema name
75 | // @param S  schema (entries)
76 | // @param P  payload (struct)
77 | #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
78 |   static const payload_schema schema{S, std::extent<decltype(S)>::value, \
79 |     NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
80 |   static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
81 |   nvtxPayloadData_t nvtx3_bpl__[] = { \
82 |     {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
83 |   ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
84 |   ::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
85 | 
86 | extern void initNvtxRegisteredEnums();
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtCuda.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
  5 | * See https://llvm.org/LICENSE.txt for license information.
  6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7 | */
  8 | 
  9 | #include "nvToolsExt.h"
 10 | 
 11 | #include "cuda.h"
 12 | 
 13 | #ifndef NVTOOLSEXT_CUDA_V3
 14 | #define NVTOOLSEXT_CUDA_V3
 15 | 
 16 | #ifdef __cplusplus
 17 | extern "C" {
 18 | #endif /* __cplusplus */
 19 | 
 20 | /* ========================================================================= */
 21 | /** \name Functions for CUDA Resource Naming
 22 | */
 23 | /** \addtogroup RESOURCE_NAMING
 24 |  * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
 25 |  *
 26 |  * This section covers the API functions that allow to annotate CUDA resources
 27 |  * with user-provided names.
 28 |  *
 29 |  * @{
 30 |  */
 31 | 
 32 | /*  ------------------------------------------------------------------------- */
 33 | /** \cond SHOW_HIDDEN
 34 | * \brief Used to build a non-colliding value for resource types separated class
 35 | * \version \NVTX_VERSION_2
 36 | */
 37 | #define NVTX_RESOURCE_CLASS_CUDA  4
 38 | /** \endcond */
 39 | 
 40 | /*  ------------------------------------------------------------------------- */
 41 | /** \brief Resource types for CUDA
 42 | */
 43 | typedef enum nvtxResourceCUDAType_t
 44 | {
 45 |     NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
 46 |     NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
 47 |     NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
 48 |     NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
 49 | } nvtxResourceCUDAType_t;
 50 | 
 51 | 
 52 | /* ------------------------------------------------------------------------- */
 53 | /** \brief Annotates a CUDA device.
 54 |  *
 55 |  * Allows the user to associate a CUDA device with a user-provided name.
 56 |  *
 57 |  * \param device - The handle of the CUDA device to name.
 58 |  * \param name   - The name of the CUDA device.
 59 |  *
 60 |  * \version \NVTX_VERSION_1
 61 |  * @{ */
 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
 64 | /** @} */
 65 | 
 66 | /* ------------------------------------------------------------------------- */
 67 | /** \brief Annotates a CUDA context.
 68 |  *
 69 |  * Allows the user to associate a CUDA context with a user-provided name.
 70 |  *
 71 |  * \param context - The handle of the CUDA context to name.
 72 |  * \param name    - The name of the CUDA context.
 73 |  *
 74 |  * \par Example:
 75 |  * \code
 76 |  * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
 77 |  * if ( CUDA_SUCCESS != status )
 78 |  *     goto Error;
 79 |  * nvtxNameCuContext(cuContext, "CTX_NAME");
 80 |  * \endcode
 81 |  *
 82 |  * \version \NVTX_VERSION_1
 83 |  * @{ */
 84 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
 85 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
 86 | /** @} */
 87 | 
 88 | /* ------------------------------------------------------------------------- */
 89 | /** \brief Annotates a CUDA stream.
 90 |  *
 91 |  * Allows the user to associate a CUDA stream with a user-provided name.
 92 |  *
 93 |  * \param stream - The handle of the CUDA stream to name.
 94 |  * \param name   - The name of the CUDA stream.
 95 |  *
 96 |  * \version \NVTX_VERSION_1
 97 |  * @{ */
 98 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
 99 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
100 | /** @} */
101 | 
102 | /* ------------------------------------------------------------------------- */
103 | /** \brief Annotates a CUDA event.
104 |  *
105 |  * Allows the user to associate a CUDA event with a user-provided name.
106 |  *
107 |  * \param event - The handle of the CUDA event to name.
108 |  * \param name  - The name of the CUDA event.
109 |  *
110 |  * \version \NVTX_VERSION_1
111 |  * @{ */
112 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
113 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
114 | /** @} */
115 | 
116 | /** @} */ /* END RESOURCE_NAMING */
117 | 
118 | /* ========================================================================= */
119 | #ifdef UNICODE
120 |   #define nvtxNameCuDevice   nvtxNameCuDeviceW
121 |   #define nvtxNameCuContext  nvtxNameCuContextW
122 |   #define nvtxNameCuStream   nvtxNameCuStreamW
123 |   #define nvtxNameCuEvent    nvtxNameCuEventW
124 | #else
125 |   #define nvtxNameCuDevice   nvtxNameCuDeviceA
126 |   #define nvtxNameCuContext  nvtxNameCuContextA
127 |   #define nvtxNameCuStream   nvtxNameCuStreamA
128 |   #define nvtxNameCuEvent    nvtxNameCuEventA
129 | #endif
130 | 
131 | #ifdef __cplusplus
132 | }
133 | #endif /* __cplusplus */
134 | 
135 | #ifndef NVTX_NO_IMPL
136 | #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
137 | #include "nvtxDetail/nvtxImplCuda_v3.h"
138 | #undef NVTX_IMPL_GUARD_CUDA
139 | #endif /*NVTX_NO_IMPL*/
140 | 
141 | #endif /* NVTOOLSEXT_CUDA_V3 */
142 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtCudaRt.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
  5 | * See https://llvm.org/LICENSE.txt for license information.
  6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7 | */
  8 | 
  9 | #include "nvToolsExt.h"
 10 | 
 11 | #include "cuda.h"
 12 | #include "driver_types.h"
 13 | 
 14 | #ifndef NVTOOLSEXT_CUDART_V3
 15 | #define NVTOOLSEXT_CUDART_V3
 16 | 
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif /* __cplusplus */
 20 | 
 21 | /* ========================================================================= */
 22 | /** \name Functions for CUDA Resource Naming
 23 | */
 24 | /** \addtogroup RESOURCE_NAMING
 25 |  * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
 26 |  *
 27 |  * This section covers the API functions that allow to annotate CUDA resources
 28 |  * with user-provided names.
 29 |  *
 30 |  * @{
 31 |  */
 32 | 
 33 | /*  ------------------------------------------------------------------------- */
 34 | /** \cond SHOW_HIDDEN
 35 | * \brief Used to build a non-colliding value for resource types separated class
 36 | * \version \NVTX_VERSION_2
 37 | */
 38 | #define NVTX_RESOURCE_CLASS_CUDART 5
 39 | /** \endcond */
 40 | 
 41 | /*  ------------------------------------------------------------------------- */
 42 | /** \brief Resource types for CUDART
 43 | */
 44 | typedef enum nvtxResourceCUDARTType_t
 45 | {
 46 |     NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
 47 |     NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
 48 |     NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
 49 | } nvtxResourceCUDARTType_t;
 50 | 
 51 | 
 52 | /* ------------------------------------------------------------------------- */
 53 | /** \brief Annotates a CUDA device.
 54 |  *
 55 |  * Allows the user to associate a CUDA device with a user-provided name.
 56 |  *
 57 |  * \param device - The id of the CUDA device to name.
 58 |  * \param name   - The name of the CUDA device.
 59 |  *
 60 |  * \version \NVTX_VERSION_1
 61 |  * @{ */
 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
 64 | /** @} */
 65 | 
 66 | /* ------------------------------------------------------------------------- */
 67 | /** \brief Annotates a CUDA stream.
 68 |  *
 69 |  * Allows the user to associate a CUDA stream with a user-provided name.
 70 |  *
 71 |  * \param stream - The handle of the CUDA stream to name.
 72 |  * \param name   - The name of the CUDA stream.
 73 |  *
 74 |  * \version \NVTX_VERSION_1
 75 |  * @{ */
 76 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
 77 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
 78 | /** @} */
 79 | 
 80 | /* ------------------------------------------------------------------------- */
 81 | /** \brief Annotates a CUDA event.
 82 |  *
 83 |  * Allows the user to associate a CUDA event with a user-provided name.
 84 |  *
 85 |  * \param event - The handle of the CUDA event to name.
 86 |  * \param name  - The name of the CUDA event.
 87 |  *
 88 |  * \version \NVTX_VERSION_1
 89 |  * @{ */
 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
 91 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
 92 | /** @} */
 93 | 
 94 | /** @} */ /* END RESOURCE_NAMING */
 95 | 
 96 | /* ========================================================================= */
 97 | #ifdef UNICODE
 98 |   #define nvtxNameCudaDevice nvtxNameCudaDeviceW
 99 |   #define nvtxNameCudaStream nvtxNameCudaStreamW
100 |   #define nvtxNameCudaEvent  nvtxNameCudaEventW
101 | #else
102 |   #define nvtxNameCudaDevice nvtxNameCudaDeviceA
103 |   #define nvtxNameCudaStream nvtxNameCudaStreamA
104 |   #define nvtxNameCudaEvent  nvtxNameCudaEventA
105 | #endif
106 | 
107 | #ifdef __cplusplus
108 | }
109 | #endif /* __cplusplus */
110 | 
111 | #ifndef NVTX_NO_IMPL
112 | #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
113 | #include "nvtxDetail/nvtxImplCudaRt_v3.h"
114 | #undef NVTX_IMPL_GUARD_CUDART
115 | #endif /*NVTX_NO_IMPL*/
116 | 
117 | #endif /* NVTOOLSEXT_CUDART_V3 */
118 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtSemanticsCounters.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2024  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /**
10 |  * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
11 |  */
12 | 
13 | #ifndef NVTX_SEMANTIC_ID_COUNTERS_V1
14 | #define NVTX_SEMANTIC_ID_COUNTERS_V1 2
15 | 
16 | /**
17 |  * Flags to extend the semantics of counters.
18 |  */
19 | #define NVTX_COUNTERS_FLAGS_NONE  0
20 | 
21 | /**
22 |  * Convert the fixed point value to a normalized floating point value.
23 |  * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type
24 |  * this flag is applied to.
25 |  */
26 | #define NVTX_COUNTERS_FLAG_NORMALIZE    (1 << 1)
27 | 
28 | /**
29 |  *  Visual tools should apply scale and limits when graphing.
30 |  */
31 | #define NVTX_COUNTERS_FLAG_LIMIT_MIN    (1 << 2)
32 | #define NVTX_COUNTERS_FLAG_LIMIT_MAX    (1 << 3)
33 | #define NVTX_COUNTERS_FLAG_LIMITS \
34 |     (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
35 | 
36 | /**
37 |  * Counter time scopes.
38 |  */
39 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT        (1 << 5)
40 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST   (2 << 5)
41 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT   (3 << 5)
42 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START  (4 << 5)
43 | 
44 | /**
45 |  * Counter value types.
46 |  */
47 | #define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10)
48 | /** Delta to previous value of same counter type. */
49 | #define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA    (2 << 10)
50 | 
51 | /**
52 |  * Datatypes for the `limits` union.
53 |  */
54 | #define NVTX_COUNTERS_LIMIT_I64 0
55 | #define NVTX_COUNTERS_LIMIT_U64 1
56 | #define NVTX_COUNTERS_LIMIT_F64 2
57 | 
58 | /**
59 |  * \brief Specify counter semantics.
60 |  */
61 | typedef struct nvtxSemanticsCounter_v1 {
62 |     /** Header of the semantic extensions (with identifier, version, etc.). */
63 |     struct nvtxSemanticsHeader_v1 header;
64 | 
65 |     /** Flags to provide more context about the counter value. */
66 |     uint64_t flags;
67 | 
68 |     /** Unit of the counter value (case-insensitive). */
69 |     const char*  unit;
70 | 
71 |     /** Should be 1 if not used. */
72 |     uint64_t unitScaleNumerator;
73 | 
74 |     /** Should be 1 if not used. */
75 |     uint64_t unitScaleDenominator;
76 | 
77 |     /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
78 |     int64_t limitType;
79 | 
80 |     /** Graph limits {minimum, maximum}. */
81 |     union limits_t {
82 |         int64_t  i64[2];
83 |         uint64_t u64[2];
84 |         double   d[2];
85 |     } limits;
86 | } nvtxSemanticsCounter_t;
87 | 
88 | #endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtSemanticsScope.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2024  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /**
10 |  * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
11 |  */
12 | 
13 | #ifndef NVTX_SEMANTIC_ID_SCOPE_V1
14 | #define NVTX_SEMANTIC_ID_SCOPE_V1 1
15 | 
16 | /**
17 |  * \brief Specify the NVTX scope for a payload entry.
18 |  *
19 |  * This allows the scope to be set for a specific value or counter in a payload.
20 |  * The scope must be known at schema registration time.
21 |  */
22 | typedef struct nvtxSemanticsScope_v1
23 | {
24 |     struct nvtxSemanticsHeader_v1 header; /* Semantic extension header; first member of the struct. */
25 | 
26 |     /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */
27 |     uint64_t scopeId;
28 | } nvtxSemanticsScope_t;
29 | 
30 | #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2023  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_HELPER_MACROS_H
10 | #define NVTX_EXT_HELPER_MACROS_H
11 | 
12 | /* Combine tokens. Two levels are used so that macro arguments are expanded before being pasted. */
13 | #define _NVTX_EXT_CONCAT(a, b) a##b
14 | #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b)
15 | 
16 | /* Resolves to the number of arguments passed. Valid for 1 to 15 arguments; an empty argument list also yields 1. */
17 | #define NVTX_EXT_NUM_ARGS(...) \
18 |     NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway)
19 | #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16
20 | 
21 | /* Cast argument(s) to void to prevent unused variable warnings. Variants exist only for 1 to 4 arguments. */
22 | #define _NVTX_EXT_VOIDIFY1(a1) (void)a1;
23 | #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2;
24 | #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3;
25 | #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4;
26 | 
27 | /* Mark function arguments as unused. Dispatches to _NVTX_EXT_VOIDIFY<N> by argument count, so at most 4 arguments are supported. */
28 | #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \
29 |     NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
30 | 
31 | #endif /* NVTX_EXT_HELPER_MACROS_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_IMPL_GUARD
10 | #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | #ifndef NVTX_EXT_IMPL_H
14 | #define NVTX_EXT_IMPL_H
15 | /* ---- Include required platform headers ---- */
16 | 
17 | #include <stdlib.h>
18 | #include <stdio.h>
19 | #include <string.h>
20 | #include <wchar.h>
21 | 
22 | #if defined(_WIN32)
23 | 
24 | #include <Windows.h>
25 | 
26 | #else
27 | #include <unistd.h>
28 | 
29 | #if defined(__ANDROID__)
30 | #include <android/api-level.h>
31 | #endif
32 | 
33 | #if defined(__linux__) || defined(__CYGWIN__)
34 | #include <sched.h>
35 | #endif
36 | 
37 | #include <sys/types.h>
38 | #include <limits.h>
39 | #include <dlfcn.h>
40 | #include <fcntl.h>
41 | #include <errno.h>
42 | #include <pthread.h>
43 | 
44 | #endif
45 | 
46 | /* ---- Define macros used in this file ---- */
47 | 
48 | #ifdef NVTX_DEBUG_PRINT
49 | #ifdef __ANDROID__
50 | #include <android/log.h>
51 | #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); /* NOTE(review): expansion ends in ';', unlike the stderr variant below */
52 | #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); /* NOTE(review): expansion ends in ';', unlike the stderr variant below */
53 | #else
54 | #include <stdio.h>
55 | #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) /* first argument must be a string literal: it is concatenated with the prefix */
56 | #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) /* first argument must be a string literal: it is concatenated with the prefix */
57 | #endif
58 | #else /* !defined(NVTX_DEBUG_PRINT) */
59 | #define NVTX_ERR(...) /* expands to nothing when debug printing is off */
60 | #define NVTX_INFO(...) /* expands to nothing when debug printing is off */
61 | #endif
62 | 
63 | #ifdef __cplusplus
64 | extern "C" {
65 | #endif /* __cplusplus */
66 | /*
67 | #ifdef __GNUC__
68 | #pragma GCC visibility push(hidden)
69 | #endif
70 | */
71 | #define NVTX_EXTENSION_FRESH 0 /* NOTE(review): presumably "not yet initialized" -- confirm against nvtxExtInit.h */
72 | #define NVTX_EXTENSION_DISABLED 1 /* NOTE(review): presumably "initialization done, no tool attached" -- confirm */
73 | #define NVTX_EXTENSION_STARTING 2 /* NOTE(review): presumably "initialization in progress" -- confirm */
74 | #define NVTX_EXTENSION_LOADED 3 /* NOTE(review): presumably "initialization complete" -- confirm */
75 | 
76 | /* Function slots are local to each extension; only the injection entry point is kept in this shared struct. */
77 | typedef struct nvtxExtGlobals1_t
78 | {
79 |     NvtxExtInitializeInjectionFunc_t injectionFnPtr; /* starts out null, see the initializer below */
80 | } nvtxExtGlobals1_t;
81 | 
82 | NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
83 | {
84 |     (NvtxExtInitializeInjectionFunc_t)0 /* injectionFnPtr */
85 | };
86 | 
87 | #define NVTX_EXT_INIT_GUARD
88 | #include "nvtxExtInit.h"
89 | #undef NVTX_EXT_INIT_GUARD
90 | /*
91 | #ifdef __GNUC__
92 | #pragma GCC visibility pop
93 | #endif
94 | */
95 | #ifdef __cplusplus
96 | } /* extern "C" */
97 | #endif /* __cplusplus */
98 | 
99 | #endif /* NVTX_EXT_IMPL_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD
10 | #error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif /* __cplusplus */
16 | 
17 | #ifdef NVTX_DISABLE
18 | 
19 | #include "nvtxExtHelperMacros.h"
20 | 
21 | #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) /* NVTX_DISABLE variant: stub that never dispatches */ \
22 | ret_val fn_name signature { \
23 |     NVTX_EXT_HELPER_UNUSED_ARGS arg_names /* cast every parameter to void */ \
24 |     return ((ret_val)(intptr_t)-1); /* for void functions `return` is #defined to nothing where the macro is expanded */ \
25 | }
26 | 
27 | #else  /* NVTX_DISABLE */
28 | 
29 | #define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) /* enabled variant: dispatch through the extension's slot table */ \
30 | typedef ret_type ( * fn_name##_impl_fntype )signature; \
31 |     NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
32 |     intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; /* either a state constant or a function address */ \
33 |     if (slot != NVTX_EXTENSION_DISABLED) { \
34 |         if (slot != NVTX_EXTENSION_FRESH) { \
35 |             return (*(fn_name##_impl_fntype)slot) arg_names; \
36 |         } else { \
37 |             NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
38 |             /* Re-read function slot after extension initialization; call it only if it now holds a function address. */ \
39 |             slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
40 |             if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
41 |                 return (*(fn_name##_impl_fntype)slot) arg_names; \
42 |             } \
43 |         } \
44 |     } \
45 |     NVTX_EXT_FN_RETURN_INVALID(ret_type) /* must be defined where the macro is expanded */ \
46 | }
47 | 
48 | #endif /*NVTX_DISABLE*/
49 | 
50 | /* Non-void functions: when no implementation can be called, (rtype)-1 is returned. */
51 | #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
52 | 
53 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain))
54 | 
55 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device))
56 | 
57 | #undef NVTX_EXT_FN_RETURN_INVALID
58 | /* END: Non-void functions. */
59 | 
60 | /* void functions: `return` is temporarily defined to nothing so that the shared NVTX_EXT_FN_IMPL body expands to plain call statements. */
61 | #define NVTX_EXT_FN_RETURN_INVALID(rtype)
62 | #define return
63 | 
64 | NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags))
65 | 
66 | #undef return
67 | #undef NVTX_EXT_FN_RETURN_INVALID
68 | /* END: void functions. */
69 | 
70 | #undef NVTX_EXT_FN_IMPL
71 | 
72 | #ifdef __cplusplus
73 | } /* extern "C" */
74 | #endif /* __cplusplus */
75 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2021  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /* This header defines types which are used by the internal implementation
10 | *  of NVTX and callback subscribers.  API clients do not use these types,
11 | *  so they are defined here instead of in nvToolsExt.h to clarify they are
12 | *  not part of the NVTX client API. */
13 | 
14 | #ifndef NVTXEXTTYPES_H
15 | #define NVTXEXTTYPES_H
16 | 
17 | #ifndef NVTX_EXT_TYPES_GUARD
18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
19 | #endif
20 | 
21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
22 | 
23 | typedef struct nvtxExtModuleSegment_t
24 | {
25 |     size_t segmentId;
26 |     size_t slotCount; /* NOTE(review): presumably the number of entries in functionSlots -- confirm */
27 |     intptr_t* functionSlots;
28 | } nvtxExtModuleSegment_t;
29 | 
30 | typedef struct nvtxExtModuleInfo_t
31 | {
32 |     uint16_t nvtxVer;
33 |     uint16_t structSize; /* NOTE(review): presumably sizeof(nvtxExtModuleInfo_t) as seen by the client -- confirm */
34 |     uint16_t moduleId;
35 |     uint16_t compatId;
36 |     size_t segmentsCount; /* NOTE(review): presumably the number of entries in segments -- confirm */
37 |     nvtxExtModuleSegment_t* segments;
38 |     NvtxExtGetExportFunction_t getExportFunction;
39 |     const void* extInfo;
40 | } nvtxExtModuleInfo_t;
41 | 
42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
43 | 
44 | #endif /* NVTXEXTTYPES_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_IMPL_GUARD_CUDART
10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif /* __cplusplus */
16 | 
17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
23 | 
24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
25 | {
26 | #ifndef NVTX_DISABLE
27 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
28 |     nvtxNameCudaDeviceA_impl_fntype impl = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
29 |     if (impl) { impl(device, name); }
30 | #endif /* NVTX_DISABLE */
31 | }
32 | 
33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
34 | {
35 | #ifndef NVTX_DISABLE
36 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
37 |     nvtxNameCudaDeviceW_impl_fntype impl = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
38 |     if (impl) { impl(device, name); }
39 | #endif /* NVTX_DISABLE */
40 | }
41 | 
42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
43 | {
44 | #ifndef NVTX_DISABLE
45 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
46 |     nvtxNameCudaStreamA_impl_fntype impl = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
47 |     if (impl) { impl(stream, name); }
48 | #endif /* NVTX_DISABLE */
49 | }
50 | 
51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
52 | {
53 | #ifndef NVTX_DISABLE
54 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
55 |     nvtxNameCudaStreamW_impl_fntype impl = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
56 |     if (impl) { impl(stream, name); }
57 | #endif /* NVTX_DISABLE */
58 | }
59 | 
60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
61 | {
62 | #ifndef NVTX_DISABLE
63 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
64 |     nvtxNameCudaEventA_impl_fntype impl = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
65 |     if (impl) { impl(event, name); }
66 | #endif /* NVTX_DISABLE */
67 | }
68 | 
69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
70 | {
71 | #ifndef NVTX_DISABLE
72 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
73 |     nvtxNameCudaEventW_impl_fntype impl = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
74 |     if (impl) { impl(event, name); }
75 | #endif /* NVTX_DISABLE */
76 | }
77 | 
78 | #ifdef __cplusplus
79 | } /* extern "C" */
80 | #endif /* __cplusplus */
81 | 
82 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
  5 | * See https://llvm.org/LICENSE.txt for license information.
  6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7 | */
  8 | 
  9 | #ifndef NVTX_IMPL_GUARD_CUDA
 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
 11 | #endif
 12 | 
 13 | 
 14 | #ifdef __cplusplus
 15 | extern "C" {
 16 | #endif /* __cplusplus */
 17 | 
 18 | typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
 19 | typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
 20 | typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
 21 | typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
 22 | typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
 23 | typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
 24 | typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
 25 | typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
 26 | 
 27 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
 28 | {
 29 | #ifndef NVTX_DISABLE
 30 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 31 |     nvtxNameCuDeviceA_impl_fntype impl = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
 32 |     if (impl) { impl(device, name); }
 33 | #endif /* NVTX_DISABLE */
 34 | }
 35 | 
 36 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
 37 | {
 38 | #ifndef NVTX_DISABLE
 39 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 40 |     nvtxNameCuDeviceW_impl_fntype impl = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
 41 |     if (impl) { impl(device, name); }
 42 | #endif /* NVTX_DISABLE */
 43 | }
 44 | 
 45 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
 46 | {
 47 | #ifndef NVTX_DISABLE
 48 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 49 |     nvtxNameCuContextA_impl_fntype impl = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
 50 |     if (impl) { impl(context, name); }
 51 | #endif /* NVTX_DISABLE */
 52 | }
 53 | 
 54 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
 55 | {
 56 | #ifndef NVTX_DISABLE
 57 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 58 |     nvtxNameCuContextW_impl_fntype impl = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
 59 |     if (impl) { impl(context, name); }
 60 | #endif /* NVTX_DISABLE */
 61 | }
 62 | 
 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
 64 | {
 65 | #ifndef NVTX_DISABLE
 66 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 67 |     nvtxNameCuStreamA_impl_fntype impl = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
 68 |     if (impl) { impl(stream, name); }
 69 | #endif /* NVTX_DISABLE */
 70 | }
 71 | 
 72 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
 73 | {
 74 | #ifndef NVTX_DISABLE
 75 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 76 |     nvtxNameCuStreamW_impl_fntype impl = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
 77 |     if (impl) { impl(stream, name); }
 78 | #endif /* NVTX_DISABLE */
 79 | }
 80 | 
 81 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
 82 | {
 83 | #ifndef NVTX_DISABLE
 84 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 85 |     nvtxNameCuEventA_impl_fntype impl = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
 86 |     if (impl) { impl(event, name); }
 87 | #endif /* NVTX_DISABLE */
 88 | }
 89 | 
 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
 91 | {
 92 | #ifndef NVTX_DISABLE
 93 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
 94 |     nvtxNameCuEventW_impl_fntype impl = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
 95 |     if (impl) { impl(event, name); }
 96 | #endif /* NVTX_DISABLE */
 97 | }
 98 | 
 99 | #ifdef __cplusplus
100 | } /* extern "C" */
101 | #endif /* __cplusplus */
102 | 
103 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_IMPL_GUARD_SYNC
10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
11 | #endif
12 | 
13 | 
14 | #ifdef __cplusplus
15 | extern "C" {
16 | #endif /* __cplusplus */
17 | 
18 | typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
19 | typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
20 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
21 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
22 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
23 | typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
24 | 
25 | NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
26 | {
27 | #ifndef NVTX_DISABLE
28 |     nvtxDomainSyncUserCreate_impl_fntype impl = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
29 |     if (impl) {
30 |         return impl(domain, attribs);
31 |     }
32 | #endif /* NVTX_DISABLE */
33 |     return (nvtxSyncUser_t)0; /* null implementation pointer, or NVTX disabled: hand back a null handle */
34 | }
35 | 
36 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
37 | {
38 | #ifndef NVTX_DISABLE
39 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
40 |     nvtxDomainSyncUserDestroy_impl_fntype impl = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
41 |     if (impl) { impl(handle); }
42 | #endif /* NVTX_DISABLE */
43 | }
44 | 
45 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
46 | {
47 | #ifndef NVTX_DISABLE
48 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
49 |     nvtxDomainSyncUserAcquireStart_impl_fntype impl = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
50 |     if (impl) { impl(handle); }
51 | #endif /* NVTX_DISABLE */
52 | }
53 | 
54 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
55 | {
56 | #ifndef NVTX_DISABLE
57 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
58 |     nvtxDomainSyncUserAcquireFailed_impl_fntype impl = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
59 |     if (impl) { impl(handle); }
60 | #endif /* NVTX_DISABLE */
61 | }
62 | 
63 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
64 | {
65 | #ifndef NVTX_DISABLE
66 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
67 |     nvtxDomainSyncUserAcquireSuccess_impl_fntype impl = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
68 |     if (impl) { impl(handle); }
69 | #endif /* NVTX_DISABLE */
70 | }
71 | 
72 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
73 | {
74 | #ifndef NVTX_DISABLE
75 |     /* Forward to the implementation pointer held in the NVTX globals; a null pointer makes this a no-op. */
76 |     nvtxDomainSyncUserReleasing_impl_fntype impl = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
77 |     if (impl) { impl(handle); }
78 | #endif /* NVTX_DISABLE */
79 | }
80 | 
81 | #ifdef __cplusplus
82 | } /* extern "C" */
83 | #endif /* __cplusplus */
84 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2009-2022  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef __NVTX_LINKONCE_H__
10 | #define __NVTX_LINKONCE_H__
11 | 
12 | /* This header defines macros to permit making definitions of global variables
13 |  * and functions in C/C++ header files which may be included multiple times in
14 |  * a translation unit or linkage unit.  It allows authoring header-only libraries
15 |  * which can be used by multiple other header-only libraries (either as the same
16 |  * copy or multiple copies), and does not require any build changes, such as
17 |  * adding another .c file, linking a static library, or deploying a dynamic
18 |  * library.  Globals defined with these macros have the property that they have
19 |  * the same address, pointing to a single instance, for the entire linkage unit.
20 |  * It is expected but not guaranteed that each linkage unit will have a separate
21 |  * instance.
22 |  *
23 |  * In some situations it is desirable to declare a variable without initializing
24 |  * it, refer to it in code or other variables' initializers, and then initialize
25 |  * it later.  Similarly, functions can be prototyped, have their address taken,
26 |  * and then have their body defined later.  In such cases, use the FWDDECL macros
27 |  * when forward-declaring LINKONCE global variables without initializers and
28 |  * function prototypes, and then use the DEFINE macros when later defining them.
29 |  * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
30 |  * following this pattern makes code maximally portable.
31 |  */
32 | 
33 | #if defined(__MINGW32__) /* MinGW */
34 |     #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
35 |     #if defined(__cplusplus)
36 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
37 |         #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK
38 |     #else
39 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
40 |         #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
41 |     #endif
42 | #elif defined(_MSC_VER) /* MSVC */
43 |     #if defined(__cplusplus)
44 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   extern "C" __declspec(selectany)
45 |         #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
46 |     #else
47 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
48 |         #define NVTX_LINKONCE_DEFINE_FUNCTION __inline
49 |     #endif
50 | #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */
51 |     #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
52 |     #if defined(__cplusplus)
53 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
54 |         #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK
55 |     #else
56 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
57 |         #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
58 |     #endif
59 | #elif defined(__CYGWIN__) /* Cygwin without Clang: assume GCC or compatible */
60 |     #define NVTX_LINKONCE_WEAK __attribute__((weak))
61 |     #if defined(__cplusplus)
62 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
63 |         #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
64 |     #else
65 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
66 |         #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
67 |     #endif
68 | #else /* All others: Assume GCC, clang, or compatible. NVTX_LINKONCE_HIDDEN is defined in this branch only. */
69 |     #define NVTX_LINKONCE_WEAK   __attribute__((weak))
70 |     #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden")))
71 |     #if defined(__cplusplus)
72 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
73 |         #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline
74 |     #else
75 |         #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
76 |         #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
77 |     #endif
78 | #endif
79 | 
80 | #define NVTX_LINKONCE_FWDDECL_GLOBAL   NVTX_LINKONCE_DEFINE_GLOBAL   extern /* the DEFINE_GLOBAL attributes followed by `extern` */
81 | #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION /* identical to DEFINE_FUNCTION on every platform handled above */
82 | 
83 | #endif /* __NVTX_LINKONCE_H__ */
84 | 


--------------------------------------------------------------------------------
/src/include/p2p.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <stdlib.h>
 8 | 
 9 | #ifndef NCCL_P2P_H_
10 | #define NCCL_P2P_H_
11 | 
12 | #include <cuda.h>
13 | #include <cuda_runtime.h>
14 | 
15 | #include "core.h"
16 | 
17 | #if CUDART_VERSION < 12030
18 | // MNNVL: FABRIC handle support lifted from CUDA 12.3; these fallback definitions are compiled only with older toolkits.
19 | #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128)
20 | #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
21 | #define CU_IPC_HANDLE_SIZE 64
22 | typedef struct CUmemFabricHandle_st {
23 |     unsigned char data[CU_IPC_HANDLE_SIZE];
24 | } CUmemFabricHandle_v1;
25 | typedef CUmemFabricHandle_v1 CUmemFabricHandle;
26 | #endif
27 | 
28 | typedef union {
29 |   uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
30 |   CUmemFabricHandle handle; // Fabric handle; shares storage with `data`
31 | } ncclCuDesc;
32 | 
33 | typedef union {
34 |   // Legacy CUDA IPC
35 |   cudaIpcMemHandle_t devIpc;
36 |   // cuMem API support (anonymous struct sharing storage with devIpc)
37 |   struct {
38 |     ncclCuDesc cuDesc;
39 |     CUmemGenericAllocationHandle memHandle;
40 |   };
41 | } ncclIpcDesc;
42 | 
43 | enum ncclIpcRegType { // passed as `type` to ncclIpcLocalRegisterBuffer / ncclIpcGraphRegisterBuffer
44 |   NCCL_IPC_SENDRECV = 0, // NOTE(review): presumably registration for send/recv buffers -- confirm
45 |   NCCL_IPC_COLLECTIVE = 1 // NOTE(review): presumably registration for collective buffers -- confirm
46 | };
47 | 
48 | struct ncclIpcImpInfo {
49 |   void* rmtRegAddr; // NOTE(review): presumably the address of the registered buffer as mapped on the remote side -- confirm
50 |   bool legacyIpcCap; // NOTE(review): presumably true when the legacy CUDA IPC path is used -- confirm
51 |   uintptr_t offset;
52 | };
53 | 
54 | struct ncclIpcRegInfo { // consumed by ncclIpcDeregBuffer
55 |   int peerRank;
56 |   void* baseAddr;
57 |   struct ncclProxyConnector* ipcProxyconn;
58 |   struct ncclIpcImpInfo impInfo; // embedded by value, not a pointer
59 | };
60 | 
61 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
62 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
63 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
64 | ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
65 | ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
66 | 
67 | ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/src/include/param.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_PARAM_H_
 8 | #define NCCL_PARAM_H_
 9 | 
10 | #include <stdint.h>
11 | 
12 | const char* userHomeDir();
13 | void setEnvFile(const char* fileName);
14 | void initEnv();
15 | const char *ncclGetEnv(const char *name);
16 | 
17 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
18 | /* Defines int64_t ncclParam<name>(): while the function-local cache still holds the INT64_MIN sentinel it calls ncclLoadParam("NCCL_" env, deftVal, ...) to fill it, then returns the cached value. */
19 | #define NCCL_PARAM(name, env, deftVal) \
20 |   int64_t ncclParam##name() { \
21 |     constexpr int64_t uninitialized = INT64_MIN; \
22 |     /* deftVal is parenthesized so an expression default such as `6 & 2` is compared as a whole. */ \
23 |     static_assert((deftVal) != uninitialized, "default value cannot be the uninitialized value."); \
24 |     static int64_t cache = uninitialized; \
25 |     if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
26 |       ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \
27 |     } \
28 |     return __atomic_load_n(&cache, __ATOMIC_RELAXED); /* atomic read, consistent with the check above */ \
29 |   }
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/include/profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PROFILER_H_
 8 | #define PROFILER_H_
 9 | 
10 | #include <cuda_runtime.h>
11 | #include "nccl_profiler.h"
12 | 
13 | struct ncclProxyArgs;
14 | struct ncclKernelPlan;
15 | struct ncclTaskColl;
16 | struct ncclTaskP2p;
17 | struct ncclInfo;
18 | struct ncclComm;
19 | struct ncclProxyOp;
20 | 
21 | // Plugin Init/Finalize Wrappers
22 | ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
23 | ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
24 | 
25 | // Profiler Start/Stop Group Wrappers
26 | ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
27 | ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
28 | 
29 | // Profiler Start/Stop Task Events Wrappers
30 | ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
31 | ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
32 | 
33 | // Proxy Op Start/Stop Event Wrappers
34 | ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
35 | ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
36 | ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
37 | 
38 | // Proxy Step Start/Stop Event Wrappers
39 | ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
40 | ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
41 | ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
42 | 
43 | // Proxy Control Start/Stop Events Wrappers
44 | ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
45 | ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
46 | 
47 | // Record Event Wrappers
48 | ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
49 | ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
50 | ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
51 | 
52 | // Profiler utility functions
53 | ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/src/include/register.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_REGISTER_H_
 2 | #define NCCL_REGISTER_H_
 3 | 
 4 | #include "device.h"
 5 | 
 6 | #include <cuda.h>
 7 | #include <stdint.h>
 8 | 
 9 | enum { // Registration status flags; every value is a distinct single bit.
10 |   NET_REG_COMPLETE = 0x01,
11 |   NVLS_REG_COMPLETE = 0x02,
12 |   NVLS_REG_POSSIBLE = 0x04,
13 |   NVLS_REG_NO_SUPPORT = 0x08,
14 |   COLLNET_REG_COMPLETE = 0x10,
15 |   IPC_REG_COMPLETE = 0x20
16 | }; // presumably OR-ed together into ncclReg::state -- confirm against the users of these flags
17 | 
18 | struct ncclPeerRegIpcAddr { // Pair of peer remote-address tables; embedded in ncclReg as regIpcAddrs.
19 |   uintptr_t* devPeerRmtAddrs; // presumably the device-side table -- confirm
20 |   uintptr_t* hostPeerRmtAddrs; // presumably the host-side table -- confirm
21 | };
22 | 
23 | struct ncclReg { // One registration record; ncclRegCache::slots holds pointers to these.
24 |   // common attributes
25 |   size_t pages;
26 |   int refs;
27 |   uintptr_t addr;
28 |   uint32_t state; // presumably a mask of the *_REG_* flags declared earlier in this header -- confirm
29 |   // net reg
30 |   int nDevs;
31 |   int devs[MAXCHANNELS];
32 |   void** handles;
33 |   // nvls reg
34 |   uintptr_t baseAddr;
35 |   size_t baseSize;
36 |   CUdeviceptr regAddr;
37 |   size_t regSize;
38 |   int dev;
39 |   CUmemGenericAllocationHandle mcHandle;
40 |   uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* used to check whether NVLS buffers match among intra-node ranks */
41 |   // collnet reg
42 |   void* collnetHandle;
43 |   struct ncclProxyConnector* collnetProxyconn;
44 |   // general ipc reg
45 |   struct ncclPeerRegIpcAddr regIpcAddrs;
46 |   struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; // NOTE(review): looks like one entry per local rank -- confirm indexing
47 | };
48 | 
49 | struct ncclRegCache { // Container of ncclReg records plus two MAXCHANNELS-sized handle arrays.
50 |   struct ncclReg **slots;
51 |   int capacity, population; // presumably allocated vs. used entries of slots -- confirm
52 |   uintptr_t pageSize;
53 |   void* sComms[MAXCHANNELS]; // NOTE(review): looks like send-side comm handles -- confirm
54 |   void* rComms[MAXCHANNELS]; // NOTE(review): looks like recv-side comm handles -- confirm
55 | };
56 | 
57 | ncclResult_t ncclRegCleanup(struct ncclComm* comm);
58 | ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/include/shm.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_SHM_H_
 2 | #define NCCL_SHM_H_
 3 | 
 4 | #include "comm.h"
 5 | 
 6 | struct shmLegacyIpc { // Descriptor variant built on ncclShmHandle_t; the `shmli` member of shmIpcDesc.
 7 |   char shmSuffix[7];
 8 |   ncclShmHandle_t handle;
 9 |   size_t shmSize;
10 | };
11 | 
12 | struct shmCuIpc { // Descriptor variant built on CUDA memory handles; the `shmci` member of shmIpcDesc.
13 |   union { // the two handle representations share storage
14 |     CUmemFabricHandle handle;
15 |     CUmemGenericAllocationHandle data;
16 |   };
17 |   int tpProxyRank;
18 |   void *ptr;
19 |   size_t size;
20 | };
21 | 
22 | struct shmIpcDesc { // Shareable-buffer descriptor holding either the legacy or the CUDA-handle variant.
23 |   union
24 |   {
25 |     struct shmLegacyIpc shmli;
26 |     struct shmCuIpc shmci;
27 |   };
28 |   bool legacy; // presumably true when shmli is the active union member -- confirm
29 | };
30 | 
31 | typedef struct shmIpcDesc ncclShmIpcDesc_t;
32 | 
33 | ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
34 | ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
35 | ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/src/include/shmutils.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_SHMUTILS_H_
 8 | #define NCCL_SHMUTILS_H_
 9 | 
10 | #include "nccl.h"
11 | 
12 | typedef void* ncclShmHandle_t;
13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle);
15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
16 | 
17 | struct ncclShmemCollBuff { // State object passed to ncclShmemAllgather().
18 |   volatile size_t *cnt[2]; // NOTE(review): two cnt/ptr pairs, looks double-buffered and selected by round -- confirm
19 |   volatile void *ptr[2];
20 |   int round;
21 |   size_t maxTypeSize;
22 | };
23 | 
24 | ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/include/socket.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_SOCKET_H_
 8 | #define NCCL_SOCKET_H_
 9 | 
10 | #include "nccl.h"
11 | #include <sys/socket.h>
12 | #include <arpa/inet.h>
13 | #include <netinet/tcp.h>
14 | #include <netdb.h>
15 | #include <fcntl.h>
16 | #include <poll.h>
17 | 
18 | #define MAX_IFS 16
19 | #define MAX_IF_NAME_SIZE 16
20 | #define SLEEP_INT            1000 // connection retry sleep interval in usec
21 | #define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
22 | #define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
23 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
24 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
25 | 
26 | /* Common socket address storage structure for IPv4/IPv6 */
27 | union ncclSocketAddress { // all three views alias the same storage
28 |   struct sockaddr sa; // family-agnostic view
29 |   struct sockaddr_in sin; // IPv4 view
30 |   struct sockaddr_in6 sin6; // IPv6 view
31 | };
32 | 
33 | enum ncclSocketState { // State of an ncclSocket (type of ncclSocket::state); values run consecutively from 0.
34 |   ncclSocketStateNone = 0,
35 |   ncclSocketStateInitialized = 1,
36 |   ncclSocketStateAccepting = 2,
37 |   ncclSocketStateAccepted = 3,
38 |   ncclSocketStateConnecting = 4,
39 |   ncclSocketStateConnectPolling = 5,
40 |   ncclSocketStateConnected = 6,
41 |   ncclSocketStateReady = 7,
42 |   ncclSocketStateClosed = 8,
43 |   ncclSocketStateError = 9,
44 |   ncclSocketStateNum = 10 // number of states listed above (0..9), not a state itself
45 | };
46 | 
47 | enum ncclSocketType { // Type of ncclSocket::type and of the `type` argument of ncclSocketInit().
48 |   ncclSocketTypeUnknown = 0, // default used by ncclSocketInit() when no type is given
49 |   ncclSocketTypeBootstrap = 1,
50 |   ncclSocketTypeProxy = 2,
51 |   ncclSocketTypeNetSocket = 3,
52 |   ncclSocketTypeNetIb = 4
53 | };
54 | 
55 | struct ncclSocket { // Per-socket state operated on by the ncclSocket* functions declared in this header.
56 |   int fd;
57 |   int acceptFd;
58 |   int timedOutRetries; // presumably counted against RETRY_TIMEDOUT_TIMES -- confirm
59 |   int refusedRetries; // presumably counted against RETRY_REFUSED_TIMES -- confirm
60 |   union ncclSocketAddress addr;
61 |   volatile uint32_t* abortFlag; // ncclSocketInit() defaults this argument to NULL
62 |   int asyncFlag;
63 |   enum ncclSocketState state;
64 |   int salen;
65 |   uint64_t magic; // ncclSocketInit() defaults this argument to NCCL_SOCKET_MAGIC
66 |   enum ncclSocketType type;
67 | };
68 | 
69 | const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
70 | ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
71 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
72 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
73 | 
74 | // Initialize a socket
75 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
76 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
77 | ncclResult_t ncclSocketListen(struct ncclSocket* sock);
78 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
79 | // Connect to sock->addr. sock->fd is set after a successful call.
80 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
81 | // Return socket connection state.
82 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
83 | // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
84 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
85 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
86 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
87 | 
88 | #define NCCL_SOCKET_SEND 0
89 | #define NCCL_SOCKET_RECV 1
90 | 
91 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
92 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
93 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
94 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
95 | ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
96 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
97 | ncclResult_t ncclSocketClose(struct ncclSocket* sock);
98 | #endif
99 | 


--------------------------------------------------------------------------------
/src/include/timer.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_TIMER_H_
 8 | #define NCCL_TIMER_H_
 9 | #if ENABLE_TIMER
10 | #include <unistd.h>
11 | #include <sys/time.h>
12 | #include <x86intrin.h>
13 | static double freq = -1;
14 | static void calibrate() { // Measures the TSC rate in cycles per microsecond and stores it in `freq`.
15 |   struct timeval tv;
16 |   gettimeofday(&tv, NULL);
17 |   uint64_t timeCycles = __rdtsc();
18 |   double time = - tv.tv_sec*1E6 - tv.tv_usec; // negated start time in usec; the end time is added below
19 |   uint64_t total = 0ULL;
20 |   for (int i=0; i<10000; i++) total += __rdtsc(); // filler work so that some time elapses; `total` is never read afterwards
21 |   gettimeofday(&tv, NULL);
22 |   timeCycles = __rdtsc() - timeCycles; // cycles elapsed over the interval
23 |   time += tv.tv_sec*1E6 + tv.tv_usec; // now the elapsed wall-clock time in usec
24 |   freq = timeCycles/time;
25 | }
26 | static inline double gettime() { // Returns the current TSC reading divided by freq (cycles per usec), i.e. a time in usec.
27 |   if (freq == -1) calibrate(); // -1 is the initial value of freq: calibrate on first use
28 |   return __rdtsc()/freq;
29 | }
30 | static uint64_t counts[8];
31 | static double times[8];
32 | static double startTimes[8];
33 | #define TIME_START(index) do { /* counts one more sample for slot `index` and records its start time */ \
34 |   counts[index]++; \
35 |   startTimes[index] = gettime(); \
36 | } while (0)
37 | 
38 | #define TIME_STOP(index) do { /* adds the time elapsed since startTimes[index] was recorded to times[index] */ \
39 |   times[index] += gettime() - startTimes[index]; \
40 | } while (0)
41 | 
42 | #define TIME_CANCEL(index) do { /* takes back the count added by TIME_START; times[] is left untouched */ \
43 |   counts[index]--; \
44 | } while (0)
45 | 
46 | #define TIME_PRINT(name) do { /* prints "<name> stats" and, for each used slot, total/count = average; then clears the counts */ \
47 |   printf("%s stats", name); \
48 |   for (int i=0; i<8; i++) { /* 8 == number of timer slots in counts[]/times[] */ \
49 |     if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); /* counts[] is uint64_t: print it unsigned with an explicit cast */ \
50 |     counts[i] = 0; /* NOTE(review): times[i] is not reset here, so later averages mix old totals with new counts -- confirm intent */ \
51 |   } \
52 |   printf("\n"); \
53 | } while (0)
54 | #else
55 | #define TIME_START(index) do {} while(0)
56 | #define TIME_STOP(index) do {} while(0)
57 | #define TIME_CANCEL(index) do {} while(0)
58 | #define TIME_PRINT(name)
59 | #endif
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/include/trees.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_TREES_H_
 8 | #define NCCL_TREES_H_
 9 | 
10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/include/tuner.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_INT_TUNER_H_
 9 | #define NCCL_INT_TUNER_H_
10 | 
11 | #include "nccl_tuner.h"
12 | #include "comm.h"
13 | 
14 | // Tuning plugin to override NCCL's default algorithm/protocol tuning.
15 | 
16 | // Attempts to load NCCL tuner from environmental variable.
17 | // Returns ncclSuccess if the correct tuner symbol has been found and
18 | // successfully loaded.  Otherwise returns an error and also logs the error.
19 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm);
20 | 
21 | // Cleans up NCCL tuner plugin.
22 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/init_nvtx.cc:
--------------------------------------------------------------------------------
 1 | #include "nccl.h"
 2 | #include "nvtx.h"
 3 | 
 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { // display name / enum value entries for the built-in reduction ops
 5 |   {"Sum", ncclSum, 0},
 6 |   {"Product", ncclProd, 0},
 7 |   {"Max", ncclMax, 0},
 8 |   {"Min", ncclMin, 0},
 9 |   {"Avg", ncclAvg, 0}
10 | };
11 | 
12 | // Must be called before the first call to any reduction operation.
13 | void initNvtxRegisteredEnums() { // Registers the NvtxEnumRedSchema entries with NVTX under schema id NVTX_PAYLOAD_ENTRY_NCCL_REDOP.
14 |   // Register schemas and strings
15 |   constexpr const nvtxPayloadEnumAttr_t eAttr {
16 |     .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | // presumably marks which of the fields below are filled in -- confirm against NVTX docs
17 |       NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID,
18 |     .name = NULL,
19 |     .entries = NvtxEnumRedSchema,
20 |     .numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value, // array length, computed at compile time
21 |     .sizeOfEnum = sizeof(ncclRedOp_t),
22 |     .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP,
23 |     .extension = nullptr
24 |   };
25 | 
26 |   nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr); // registration happens in the nccl_domain NVTX domain
27 | }
28 | 


--------------------------------------------------------------------------------
/src/misc/argcheck.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "argcheck.h"
 8 | #include "comm.h"
 9 | 
10 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { // Rejects pointers CUDA cannot describe, and device memory that belongs to a GPU other than comm->cudaDev.
11 |   cudaPointerAttributes attr;
12 |   cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
13 |   if (err != cudaSuccess || attr.devicePointer == NULL) { // query failed, or the pointer has no device address
14 |     WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer);
15 |     return ncclInvalidArgument;
16 |   }
17 | #if CUDART_VERSION >= 10000
18 |   if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { // runtimes >= 10000 read the memory kind from `type`
19 | #else
20 |   if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { // older runtimes read it from `memoryType`
21 | #endif
22 |     WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
23 |     return ncclInvalidArgument;
24 |   }
25 |   return ncclSuccess; // memory that is not cudaMemoryTypeDevice skips the device-match test
26 | }
27 | 
28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
29 |   // Only NULL is rejected; no other validation is performed on the pointer.
30 |   if (ptr != NULL) return ncclSuccess;
31 |   // The warning names both the operation and the offending argument.
32 |   WARN("%s : %s argument is NULL", opname, ptrname);
33 |   return ncclInvalidArgument;
34 | }
35 | 
36 | ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) {
37 |   // A NULL communicator is reported by PtrCheck and returned to the caller right away.
38 |   NCCLCHECK(PtrCheck(comm, opname, ptrname));
39 |   // Both guard words must still hold NCCL_MAGIC; anything else is treated as a corrupted comm.
40 |   if (comm->startMagic == NCCL_MAGIC && comm->endMagic == NCCL_MAGIC) return ncclSuccess;
41 |   WARN("Error: corrupted comm object detected");
42 |   return ncclInvalidArgument;
43 | }
44 | 
45 | ncclResult_t ArgsCheck(struct ncclInfo* info) { // Validates root, datatype, reduction op and (if comm->checkPointers) the buffers of a call; returns ncclInvalidArgument on the first failure.
46 |   // First, the easy ones
47 |   if (info->root < 0 || info->root >= info->comm->nRanks) {
48 |     WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks-1); // the last valid root is nRanks-1
49 |     return ncclInvalidArgument;
50 |   }
51 |   if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
52 |     WARN("%s : invalid type %d", info->opName, info->datatype);
53 |     return ncclInvalidArgument;
54 |   }
55 | 
56 |   // ncclMaxRedOp < info->op will always be false due to the sizes of
57 |   // the datatypes involved, and that's by design.  We keep the check though
58 |   // just as a reminder.
59 |   // coverity[result_independent_of_operands]
60 |   if (info->op < 0 || ncclMaxRedOp < info->op) {
61 |     WARN("%s : invalid reduction operation %d", info->opName, info->op);
62 |     return ncclInvalidArgument;
63 |   }
64 |   int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); // index into comm->userRedOps; only used below when op >= ncclNumOps
65 |   if (ncclNumOps <= info->op &&
66 |       (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { // presumably freeNext == -1 marks a slot that is in use -- confirm
67 |     WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op);
68 |     return ncclInvalidArgument;
69 |   }
70 | 
71 |   if (info->comm->checkPointers) {
72 |     if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { // send/recv: the single buffer is read from recvbuff
73 |       if (info->count >0) // a zero-count transfer skips the pointer check
74 |         NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
75 |     } else {
76 |       // Check CUDA device pointers
77 |       if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { // broadcast: sendbuff is checked on the root only
78 |         NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
79 |       }
80 |       if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { // reduce: recvbuff is checked on the root only
81 |         NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
82 |       }
83 |     }
84 |   }
85 |   return ncclSuccess;
86 | }
87 | 


--------------------------------------------------------------------------------
/src/misc/msccl/msccl_status.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #include "msccl/msccl_status.h"
 7 | #include "msccl/msccl_struct.h"
 8 | 
 9 | mscclStatus& mscclGetStatus() { // Accessor for the single mscclStatus object of the process.
10 |   static mscclStatus status; // function-local static: constructed on the first call
11 |   return status;
12 | }
13 | 
14 | mscclThreadLocalStatus& mscclGetThreadLocalStatus() { // Accessor for the calling thread's own mscclThreadLocalStatus object.
15 |   static thread_local mscclThreadLocalStatus threadLocalStatus; // thread_local: each thread gets a separate instance
16 |   return threadLocalStatus;
17 | }
18 | 
19 | mscclSavedProxyArgs& mscclGetSavedProxyArgs() { // Accessor for the single mscclSavedProxyArgs object of the process.
20 |   static mscclSavedProxyArgs savedProxyArgs; // function-local static: constructed on the first call
21 |   return savedProxyArgs;
22 | }


--------------------------------------------------------------------------------
/src/misc/param.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "param.h"
 8 | #include "debug.h"
 9 | 
10 | #include <algorithm>
11 | #include <errno.h>
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <string.h>
15 | #include <sys/types.h>
16 | #include <unistd.h>
17 | #include <pthread.h>
18 | #include <pwd.h>
19 | 
20 | const char* userHomeDir() {  // Home directory from the passwd entry of the real user id, or NULL when no entry is found.
21 |   if (struct passwd* entry = getpwuid(getuid())) return entry->pw_dir;
22 |   return NULL;
23 | }
24 | 
25 | void setEnvFile(const char* fileName) {
26 |   FILE* conf = fopen(fileName, "r");
27 |   if (conf == NULL) return;  // a file that cannot be opened is skipped silently
28 | 
29 |   char envVar[1024], envValue[1024];
30 |   char* text = NULL;  // line buffer managed by getline(), released after the loop
31 |   size_t cap = 0;
32 |   ssize_t len;
33 |   while ((len = getline(&text, &cap, conf)) != -1) {
34 |     if (text[len-1] == '\n') text[len-1] = '\0';
35 |     // Lines without '=' carry no assignment and are ignored.
36 |     const char* eq = strchr(text, '=');
37 |     if (eq == NULL) continue;
38 |     // Name: everything before the first '=', clipped to fit envVar.
39 |     const int nameLen = std::min(1023, (int)(eq - text));
40 |     strncpy(envVar, text, nameLen);
41 |     envVar[nameLen] = '\0';
42 |     // Value: the remainder of the line, clipped to fit envValue.
43 |     strncpy(envValue, eq+1, 1023);
44 |     envValue[1023] = '\0';
45 |     setenv(envVar, envValue, 0);  // overwrite=0: variables that are already set keep their value
46 |   }
47 |   if (text) free(text);
48 |   fclose(conf);
49 | }
50 | 
51 | static void initEnvFunc() { // Loads the NCCL conf files into the process environment; run once through initEnv().
52 |   char confFilePath[1024];
53 |   const char* userFile = getenv("NCCL_CONF_FILE");
54 |   if (userFile && strlen(userFile) > 0) { // an explicit, non-empty NCCL_CONF_FILE is used instead of ~/.nccl.conf
55 |     snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
56 |     setEnvFile(confFilePath);
57 |   } else {
58 |     const char* userDir = userHomeDir();
59 |     if (userDir) { // without a home directory there is no per-user file to read
60 |       snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
61 |       setEnvFile(confFilePath);
62 |     }
63 |   }
64 |   snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf"); // read last: setEnvFile never overwrites, so this file only fills in variables not set earlier
65 |   setEnvFile(confFilePath);
66 | }
67 | 
68 | void initEnv() { // Runs initEnvFunc at most once per process, however many threads call this.
69 |   static pthread_once_t once = PTHREAD_ONCE_INIT;
70 |   pthread_once(&once, initEnvFunc);
71 | }
72 | 
73 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { // Resolves `env` into *cache: the integer parsed from the environment, else deftVal.
74 |   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; // a single lock shared by all parameters
75 |   pthread_mutex_lock(&mutex);
76 |   if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { // re-check under the lock: another thread may already have loaded it
77 |     const char* str = ncclGetEnv(env);
78 |     int64_t value = deftVal;
79 |     if (str && strlen(str) > 0) { // an unset or empty variable keeps the default
80 |       errno = 0;
81 |       value = strtoll(str, nullptr, 0); // base 0: decimal, 0x-prefixed hex or 0-prefixed octal
82 |       if (errno) { // NOTE(review): no end pointer is passed, so only failures strtoll reports through errno land here -- confirm that other non-numeric text is meant to be accepted
83 |         value = deftVal;
84 |         INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
85 |       } else {
86 |         INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value);
87 |       }
88 |     }
89 |     __atomic_store_n(cache, value, __ATOMIC_RELAXED); // after this store callers no longer see `uninitialized`
90 |   }
91 |   pthread_mutex_unlock(&mutex);
92 | }
93 | 
94 | const char* ncclGetEnv(const char* name) { // getenv() wrapper that first makes sure the conf files were loaded.
95 |   initEnv(); // one-time load of the conf files into the environment
96 |   return getenv(name); // NULL when the variable is not set
97 | }
98 | 


--------------------------------------------------------------------------------
/src/nccl.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=${nccl:Prefix}
 2 | exec_prefix=${prefix}
 3 | libdir=${exec_prefix}/lib
 4 | includedir=${prefix}/include
 5 | 
 6 | Name: nccl
 7 | Description: Optimized primitives for collective multi-GPU communication
 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
 9 | Libs: -L${libdir} -lnccl
10 | Cflags: -I${includedir}
11 | 


--------------------------------------------------------------------------------
/src/transport/generic.cc:
--------------------------------------------------------------------------------
 1 | #include "comm.h"
 2 | #include "transport.h"
 3 | 
 4 | ncclResult_t ncclTransportRingConnect(struct ncclComm* comm, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) { // Connects each channel's ring neighbours, then runs P2P setup with the RING graph.
 5 |   ncclResult_t ret = ncclSuccess;
 6 |   if (comm && comm->nRanks > 1) { // a NULL comm or a single rank has nothing to connect
 7 |     for (int c = 0; c < comm->nChannels; c++) {
 8 |       struct ncclChannel* channel = comm->channels + c;
 9 |       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); // one peer per side; presumably recv from prev and send to next -- confirm against ncclTransportP2pConnect
10 |     }
11 |     NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0, highestTransportType, needsProxy), ret, fail); // the two out-pointers are forwarded unchanged
12 |     INFO(NCCL_INIT, "Connected all rings");
13 |   }
14 | exit:
15 |   return ret;
16 | fail: // no cleanup is done here; presumably NCCLCHECKGOTO has already stored the error in ret -- confirm
17 |   goto exit;
18 | }
19 | 
20 | ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) { // Connects each channel's tree links in both directions, then runs P2P setup with the TREE graph.
21 |   ncclResult_t ret = ncclSuccess;
22 |   if (comm && comm->nRanks > 1) { // a NULL comm or a single rank has nothing to connect
23 |     // Connect Trees
24 |     for (int c = 0; c < comm->nChannels; c++) {
25 |       struct ncclChannel* channel = comm->channels + c;
26 |       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); // first peer list = children (tree.down), second = parent (tree.up)
27 |       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); // same peers with the two lists swapped
28 |     }
29 |     NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0, highestTransportType, needsProxy), ret, fail); // the two out-pointers are forwarded unchanged
30 |     INFO(NCCL_INIT, "Connected all trees");
31 |   }
32 | exit:
33 |   return ret;
34 | fail: // no cleanup is done here; presumably NCCLCHECKGOTO has already stored the error in ret -- confirm
35 |   goto exit;
36 | }
37 | 
38 | ncclResult_t ncclTransportPatConnect(struct ncclComm* comm, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) { // Connects, on every channel, the peers at power-of-two rank distances in both directions.
39 |   ncclResult_t ret = ncclSuccess;
40 |   if (comm && comm->nRanks > 1) { // a NULL comm or a single rank has nothing to connect
41 |     for (int mask=1; mask<comm->nRanks; mask<<=1) { // mask = 1, 2, 4, ... while below nRanks
42 |       int prevPeer = (comm->rank + mask) % comm->nRanks; // rank + mask, wrapped around
43 |       int nextPeer = (comm->rank + comm->nRanks - mask) % comm->nRanks; // rank - mask, wrapped; adding nRanks keeps the dividend non-negative
44 |       for (int c = 0; c < comm->nChannels; c++) {
45 |         NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &prevPeer, 1, &nextPeer, 0), ret, fail); // ReduceScatter
46 |       }
47 |       NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); // trailing arguments omitted on this call; the TREE graph entry is used
48 |       for (int c = 0; c < comm->nChannels; c++) {
49 |         NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &nextPeer, 1, &prevPeer, 0), ret, fail); // AllGather
50 |       }
51 |       NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0, highestTransportType, needsProxy), ret, fail); // the two out-pointers are forwarded on this call
52 |     }
53 |     INFO(NCCL_INIT, "Connected binomial trees");
54 |   }
55 | exit:
56 |   return ret;
57 | fail: // no cleanup is done here; presumably NCCLCHECKGOTO has already stored the error in ret -- confirm
58 |   goto exit;
59 | }
60 | 


--------------------------------------------------------------------------------