├── .azure-pipelines └── integration-test.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── LICENSE.txt ├── Makefile ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── cgmanifest.json ├── ext-net ├── README.md ├── example │ ├── Makefile │ ├── nccl │ │ ├── common.h │ │ ├── err.h │ │ ├── net.h │ │ ├── net_device.h │ │ ├── net_v2.h │ │ ├── net_v3.h │ │ ├── net_v4.h │ │ ├── net_v5.h │ │ ├── net_v6.h │ │ ├── net_v7.h │ │ ├── net_v8.h │ │ └── types.h │ └── plugin.c └── google-fastsocket │ └── Makefile ├── ext-profiler └── example │ ├── Makefile │ ├── event.c │ ├── event.h │ ├── nccl │ ├── common.h │ ├── err.h │ ├── profiler.h │ ├── profiler_v1.h │ └── types.h │ ├── plugin.c │ ├── print_event.c │ └── print_event.h ├── ext-tuner └── example │ ├── Makefile │ ├── nccl │ ├── common.h │ ├── err.h │ └── tuner.h │ └── plugin.c ├── makefiles ├── common.mk ├── formatting.mk └── version.mk ├── pkg ├── Makefile ├── debian │ ├── .gitignore │ ├── Makefile │ ├── changelog.in │ ├── compat │ ├── control.in │ ├── copyright │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── libnccl2.install.in │ ├── rules │ └── source │ │ └── format ├── redhat │ ├── Makefile │ └── nccl.spec.in ├── srctxz │ ├── Makefile │ └── create_srctxz.sh.in └── txz │ ├── Makefile │ └── create_txz.sh.in └── src ├── Makefile ├── bootstrap.cc ├── channel.cc ├── collectives.cc ├── debug.cc ├── device ├── Makefile ├── all_gather.h ├── all_reduce.h ├── broadcast.h ├── common.cu ├── common.h ├── common_kernel.h ├── generate.py ├── msccl_kernel.cu ├── network │ └── unpack │ │ ├── unpack.h │ │ └── unpack_defs.h ├── onerank.cu ├── op128.h ├── primitives.h ├── prims_ll.h ├── prims_ll128.h ├── prims_simple.h ├── reduce.h ├── reduce_kernel.h ├── reduce_scatter.h └── sendrecv.h ├── enhcompat.cc ├── enqueue.cc ├── graph ├── connect.cc ├── paths.cc ├── rings.cc ├── rings.h ├── search.cc ├── topo.cc ├── topo.h ├── trees.cc ├── tuning.cc ├── xml.cc └── xml.h ├── group.cc ├── include ├── alloc.h ├── argcheck.h ├── bitops.h ├── 
bootstrap.h ├── channel.h ├── checks.h ├── coll_net.h ├── collectives.h ├── comm.h ├── core.h ├── cpuset.h ├── cudawrap.h ├── debug.h ├── device.h ├── enqueue.h ├── gdrwrap.h ├── graph.h ├── group.h ├── ibvcore.h ├── ibvsymbols.h ├── ibvwrap.h ├── info.h ├── ipcsocket.h ├── msccl │ ├── msccl_kernel.h │ ├── msccl_lifecycle.h │ ├── msccl_parser.h │ ├── msccl_scheduler.h │ ├── msccl_setup.h │ ├── msccl_status.h │ └── msccl_struct.h ├── nccl_common.h ├── nccl_net.h ├── nccl_profiler.h ├── nccl_tuner.h ├── net.h ├── net_device.h ├── npkit │ ├── npkit.h │ ├── npkit_event.h │ └── npkit_struct.h ├── nvmlwrap.h ├── nvtx.h ├── nvtx3 │ ├── nvToolsExt.h │ ├── nvToolsExtCounters.h │ ├── nvToolsExtCuda.h │ ├── nvToolsExtCudaRt.h │ ├── nvToolsExtMem.h │ ├── nvToolsExtMemCudaRt.h │ ├── nvToolsExtOpenCL.h │ ├── nvToolsExtPayload.h │ ├── nvToolsExtPayloadHelper.h │ ├── nvToolsExtSemanticsCounters.h │ ├── nvToolsExtSemanticsScope.h │ ├── nvToolsExtSync.h │ ├── nvtx3.hpp │ └── nvtxDetail │ │ ├── nvtxExtHelperMacros.h │ │ ├── nvtxExtImpl.h │ │ ├── nvtxExtImplCounters_v1.h │ │ ├── nvtxExtImplMemCudaRt_v1.h │ │ ├── nvtxExtImplMem_v1.h │ │ ├── nvtxExtImplPayload_v1.h │ │ ├── nvtxExtInit.h │ │ ├── nvtxExtPayloadHelperInternal.h │ │ ├── nvtxExtPayloadTypeInfo.h │ │ ├── nvtxExtTypes.h │ │ ├── nvtxImpl.h │ │ ├── nvtxImplCore.h │ │ ├── nvtxImplCudaRt_v3.h │ │ ├── nvtxImplCuda_v3.h │ │ ├── nvtxImplOpenCL_v3.h │ │ ├── nvtxImplSync_v3.h │ │ ├── nvtxInit.h │ │ ├── nvtxInitDecls.h │ │ ├── nvtxInitDefs.h │ │ ├── nvtxLinkOnce.h │ │ └── nvtxTypes.h ├── p2p.h ├── param.h ├── profiler.h ├── proxy.h ├── register.h ├── shm.h ├── shmutils.h ├── socket.h ├── strongstream.h ├── timer.h ├── transport.h ├── trees.h ├── tuner.h └── utils.h ├── init.cc ├── init_nvtx.cc ├── misc ├── argcheck.cc ├── cudawrap.cc ├── gdrwrap.cc ├── ibvsymbols.cc ├── ibvwrap.cc ├── ipcsocket.cc ├── msccl │ ├── msccl_lifecycle.cc │ ├── msccl_parser.cc │ ├── msccl_setup.cc │ └── msccl_status.cc ├── npkit.cc ├── nvmlwrap.cc ├── param.cc 
├── profiler.cc ├── shmutils.cc ├── socket.cc ├── strongstream.cc ├── tuner.cc └── utils.cc ├── nccl.h.in ├── nccl.pc.in ├── net.cc ├── proxy.cc ├── register.cc ├── transport.cc └── transport ├── coll_net.cc ├── generic.cc ├── net.cc ├── net_ib.cc ├── net_socket.cc ├── nvls.cc ├── p2p.cc └── shm.cc /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 2 | /build 3 | *.gcov 4 | /coverage/ 5 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 14 | Laboratory, the U.S. Department of Energy, nor the names of their 15 | contributors may be used to endorse or promote products derived 16 | from this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 19 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 22 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 26 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The U.S. Department of Energy funded the development of this software 31 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 32 | 33 | 34 | This code also includes files from the NVIDIA Tools Extension SDK project. 35 | 36 | See: 37 | 38 | https://github.com/NVIDIA/NVTX 39 | 40 | for more information and license details. 41 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 14 | Laboratory, the U.S. Department of Energy, nor the names of their 15 | contributors may be used to endorse or promote products derived 16 | from this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 19 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 22 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 26 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The U.S. Department of Energy funded the development of this software 31 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 32 | 33 | 34 | This code also includes files from the NVIDIA Tools Extension SDK project. 35 | 36 | See: 37 | 38 | https://github.com/NVIDIA/NVTX 39 | 40 | for more information and license details. 
41 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses [GitHub Issues] to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new issue. 8 | 9 | For help and questions about using this project, please create a new post in [GitHub Discussions]. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | 15 | [GitHub Issues]: https://github.com/Azure/msccl-executor-nccl/issues 16 | [GitHub Discussions]: https://github.com/Azure/msccl-executor-nccl/discussions 17 | -------------------------------------------------------------------------------- /cgmanifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "registrations": [ 3 | { 4 | "component": { 5 | "type": "git", 6 | "git": { 7 | "repositoryUrl": "https://github.com/NVIDIA/nccl.git", 8 | "commitHash": "5d3ab08" 9 | } 10 | } 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /ext-net/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 12 | 13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /ext-net/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclInvalidUsage = 5, 15 | ncclRemoteError = 6 } ncclResult_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_H_ 6 | #define NCCL_NET_H_ 7 | 8 | #include <stdint.h> 9 | #include <stdlib.h> 10 | 11 | #include "common.h" 12 | #include "err.h" 13 | 14 | #define NCCL_NET_HANDLE_MAXSIZE 128 15 | 16 | #define NCCL_PTR_HOST 0x1 17 | #define NCCL_PTR_CUDA 0x2 18 | #define NCCL_PTR_DMABUF 0x4 19 | 20 | // Maximum number of requests per comm object 21 | #define NCCL_NET_MAX_REQUESTS 32 22 | 23 | #include "net_v8.h" 24 | #include "net_v7.h" 25 | #include "net_v6.h" 26 | #include "net_v5.h" 27 | #include "net_v4.h" 28 | #include "net_v3.h" 29 | #include "net_v2.h" 30 | 31 | #endif // end include guard 32 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_DEVICE_H_ 8 | #define NET_DEVICE_H_ 9 | 10 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 11 | #define NCCL_NET_MTU_SIZE 4096 12 | 13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 14 | // version.
NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 16 | 17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 18 | 19 | typedef struct { 20 | ncclNetDeviceType netDeviceType; // Network offload type 21 | int netDeviceVersion; // Version number for network offload 22 | void* handle; 23 | size_t size; 24 | int needsProxyProgress; 25 | } ncclNetDeviceHandle_v7_t; 26 | 27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 28 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V2_H_ 6 | #define NCCL_NET_V2_H_ 7 | 8 | typedef struct { 9 | // Name of the network (mainly for logs) 10 | const char* name; 11 | // Initialize the network. 12 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 13 | // Return the number of adapters. 14 | ncclResult_t (*devices)(int* ndev); 15 | // Return the device path in /sys. NCCL will call free on this path. 16 | ncclResult_t (*pciPath)(int dev, char** path); 17 | // Return whether this device supports host pointers and/or CUDA pointers 18 | // as data from the current GPU. Supported types should be composed with 19 | // NCCL_PTR_HOST and NCCL_PTR_CUDA. 20 | ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); 21 | // Create a receiving object and provide a handle to connect to it. The 22 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 23 | // between ranks to create a connection. 24 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 25 | // Connect to a handle and return a sending comm object for that peer. 
26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connect. 28 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 29 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v2_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
3 | */ 4 | 5 | #ifndef NCCL_NET_V3_H_ 6 | #define NCCL_NET_V3_H_ 7 | 8 | #define NCCL_NET_MAX_REQUESTS_V3 16 9 | 10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; 11 | typedef struct { 12 | // Name of the network (mainly for logs) 13 | const char* name; 14 | // Initialize the network. 15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 16 | // Return the number of adapters. 17 | ncclResult_t (*devices)(int* ndev); 18 | // Get various device properties. 19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); 20 | // Create a receiving object and provide a handle to connect to it. The 21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 22 | // between ranks to create a connection. 23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 24 | // Connect to a handle and return a sending comm object for that peer. 25 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 26 | // Finalize connection establishment after remote peer has called connect. 27 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 28 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 29 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer.
36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v3_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v4.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V4_H_ 6 | #define NCCL_NET_V4_H_ 7 | 8 | #define NCCL_NET_HANDLE_MAXSIZE_V4 64 9 | 10 | typedef struct { 11 | char* name; // Used mostly for logging. 12 | char* pciPath; // Path to the PCI device in /sys. 13 | uint64_t guid; // Unique identifier for the NIC chip. Important for 14 | // cards with multiple PCI functions (Physical or virtual). 15 | int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA 16 | int speed; // Port speed in Mbps. 17 | int port; // Port number. 18 | int maxComms; // Maximum number of comms we can create 19 | } ncclNetProperties_v4_t; 20 | 21 | // v4 struct for backwards compatibility 22 | typedef struct { 23 | // Name of the network (mainly for logs) 24 | const char* name; 25 | // Initialize the network. 26 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 27 | // Return the number of adapters. 
28 | ncclResult_t (*devices)(int* ndev); 29 | // Get various device properties. 30 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); 31 | // Create a receiving object and provide a handle to connect to it. The 32 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 33 | // between ranks to create a connection. 34 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 35 | // Connect to a handle and return a sending comm object for that peer. 36 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 37 | // Finalize connection establishment after remote peer has called connect. 38 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 39 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 40 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 41 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 42 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 43 | // Asynchronous send to a peer. 44 | // May return request == NULL if the call cannot be performed (or would block) 45 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 46 | // Asynchronous recv from a peer. 47 | // May return request == NULL if the call cannot be performed (or would block) 48 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 49 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 50 | // visible to the GPU 51 | ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); 52 | // Test whether a request is complete. If size is not NULL, it returns the 53 | // number of bytes sent/received.
54 | ncclResult_t (*test)(void* request, int* done, int* size); 55 | // Close and free send/recv comm objects 56 | ncclResult_t (*closeSend)(void* sendComm); 57 | ncclResult_t (*closeRecv)(void* recvComm); 58 | ncclResult_t (*closeListen)(void* listenComm); 59 | } ncclNet_v4_t; 60 | 61 | #endif // end include guard 62 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v5.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V5_H_ 6 | #define NCCL_NET_V5_H_ 7 | 8 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; 9 | typedef struct { 10 | // Name of the network (mainly for logs) 11 | const char* name; 12 | // Initialize the network. 13 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 14 | // Return the number of adapters. 15 | ncclResult_t (*devices)(int* ndev); 16 | // Get various device properties. 17 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); 18 | // Create a receiving object and provide a handle to connect to it. The 19 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 20 | // between ranks to create a connection. 21 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 22 | // Connect to a handle and return a sending comm object for that peer. 23 | // This call must not block for the connection to be established, and instead 24 | // should return successfully with sendComm == NULL with the expectation that 25 | // it will be called again until sendComm != NULL. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connect. 
28 | // This call must not block for the connection to be established, and instead 29 | // should return successfully with recvComm == NULL with the expectation that 30 | // it will be called again until recvComm != NULL. 31 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 32 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 33 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 34 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 35 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 36 | // Asynchronous send to a peer. 37 | // May return request == NULL if the call cannot be performed (or would block) 38 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 39 | // Asynchronous recv from a peer. 40 | // May return request == NULL if the call cannot be performed (or would block) 41 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 42 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 43 | // visible to the GPU 44 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 45 | // Test whether a request is complete. If size is not NULL, it returns the 46 | // number of bytes sent/received. 47 | ncclResult_t (*test)(void* request, int* done, int* sizes); 48 | // Close and free send/recv comm objects 49 | ncclResult_t (*closeSend)(void* sendComm); 50 | ncclResult_t (*closeRecv)(void* recvComm); 51 | ncclResult_t (*closeListen)(void* listenComm); 52 | } ncclNet_v5_t; 53 | 54 | #endif // end include guard 55 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v6.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/

#ifndef NCCL_NET_V6_H_
#define NCCL_NET_V6_H_

#define NCCL_NET_MAX_REQUESTS_V6 8

// Per-device properties filled in by getProperties() of the v6 interface.
typedef struct {
  char* name;      // Used mostly for logging.
  char* pciPath;   // Path to the PCI device in /sys.
  uint64_t guid;   // Unique identifier for the NIC chip. Important for
                   // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport;  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;       // Port speed in Mbps.
  int port;        // Port number.
  float latency;   // Network latency
  int maxComms;    // Maximum number of comms we can create
  int maxRecvs;    // Maximum number of grouped receives.
}ncclNetProperties_v6_t;

// Version 6 of the network interface, expressed as a table of function
// pointers. Compared with the v5 table it carries an extra regMrDmaBuf entry.
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If sizes is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;

#endif // end include guard

--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v7.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

#ifndef NCCL_NET_V7_H_
#define NCCL_NET_V7_H_

#include "net_device.h"

// Per-device properties for the v7 interface: the v6 fields followed by the
// two network-offload fields.
typedef struct {
  char* name;      // Used mostly for logging.
  char* pciPath;   // Path to the PCI device in /sys.
  uint64_t guid;   // Unique identifier for the NIC chip. Important for
                   // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport;  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;       // Port speed in Mbps.
  int port;        // Port number.
  float latency;   // Network latency
  int maxComms;    // Maximum number of comms we can create
  int maxRecvs;    // Maximum number of grouped receives.
  ncclNetDeviceType netDeviceType; // Network offload type
  int netDeviceVersion;            // Version number for network offload
} ncclNetProperties_v7_t;

typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If sizes is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

  // Notify the plugin that a recv has completed by the device
  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;

#endif // end include guard

--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v8.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

#ifndef NCCL_NET_V8_H_
#define NCCL_NET_V8_H_

#include "net_device.h"

// Per-device properties for the v8 interface. Relative to the v7 struct it
// adds the regIsGlobal field.
typedef struct {
  char* name;      // Used mostly for logging.
  char* pciPath;   // Path to the PCI device in /sys.
  uint64_t guid;   // Unique identifier for the NIC chip. Important for
                   // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport;  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int regIsGlobal; // regMr is not tied to a particular comm
  int speed;       // Port speed in Mbps.
int port;        // Port number.
  float latency;   // Network latency
  int maxComms;    // Maximum number of comms we can create
  int maxRecvs;    // Maximum number of grouped receives.
  ncclNetDeviceType netDeviceType; // Network offload type
  int netDeviceVersion;            // Version number for network offload
} ncclNetProperties_v8_t;

// Unversioned alias: the v8 property struct is the current one.
typedef ncclNetProperties_v8_t ncclNetProperties_t;

// Version 8 of the network interface, expressed as a table of function
// pointers. Here regMr takes its size as size_t.
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If sizes is not NULL, it returns the
  // number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);

  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

  // Notify the plugin that a recv has completed by the device
  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v8_t;

#endif // end include guard

--------------------------------------------------------------------------------
/ext-net/example/nccl/types.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_

/* Data types */
typedef enum { ncclInt8       = 0, ncclChar       = 0,
               ncclUint8      = 1,
               ncclInt32      = 2, ncclInt        = 2,
               ncclUint32     = 3,
               ncclInt64      = 4,
               ncclUint64     = 5,
               ncclFloat16    = 6, ncclHalf       = 6,
               ncclFloat32    = 7, ncclFloat      = 7,
               ncclFloat64    = 8, ncclDouble     = 8,
               ncclBfloat16   = 9,
} ncclDataType_t;

#endif

--------------------------------------------------------------------------------
/ext-net/google-fastsocket/Makefile:
--------------------------------------------------------------------------------
CUDA_HOME?=/usr/local/cuda
INC:=-I$(CUDA_HOME)/include
PLUGIN_SO:=libnccl-net.so

# None of these targets name a file that the rule creates.
.PHONY: default install clean

default: $(PLUGIN_SO)

# The plugin sources are C++ (.cc), so drive the build with the C++ compiler;
# this also links the C++ runtime into the shared object.
$(PLUGIN_SO): nccl-fastsocket/*.cc
	$(CXX) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

# Fetch the plugin sources when they are not present yet.
nccl-fastsocket/*.cc:
	git clone https://github.com/google/nccl-fastsocket.git

install: $(BUILDDIR)/lib/$(PLUGIN_SO)

$(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
	@printf "Grabbing %-35s > %s\n" $< $@
	mkdir -p $(BUILDDIR)/lib
	install -m 644 $< $@

clean:
	rm -f $(PLUGIN_SO)
	rm -Rf nccl-fastsocket

--------------------------------------------------------------------------------
/ext-profiler/example/Makefile:
--------------------------------------------------------------------------------
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
NCCL_HOME := ../../build
# INC below expands $(CUDA_HOME); give it a default so an unset variable does
# not turn into "-I/include". An environment or command-line value still wins.
CUDA_HOME ?= /usr/local/cuda
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO := libnccl-profiler.so

.PHONY: default clean

default: $(PLUGIN_SO)

$(PLUGIN_SO): plugin.c event.c print_event.c
	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

clean:
	rm -f $(PLUGIN_SO)

--------------------------------------------------------------------------------
/ext-profiler/example/event.c:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | #include "event.h" 9 | 10 | int taskEventQueueEmpty(struct group* g) { 11 | return g->eventHead == NULL; 12 | } 13 | 14 | void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) { 15 | event->next = NULL; 16 | if (g->eventHead) g->eventTail->next = event; 17 | else g->eventHead = event; 18 | g->eventTail = event; 19 | } 20 | 21 | struct taskEventBase* taskEventQueueHead(struct group* g) { 22 | return g->eventHead; 23 | } 24 | 25 | struct taskEventBase* taskEventQueueDequeue(struct group* g) { 26 | struct taskEventBase* tmp = g->eventHead; 27 | g->eventHead = g->eventHead->next; 28 | if (g->eventHead == NULL) g->eventTail = NULL; 29 | return tmp; 30 | } 31 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
*
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

// Log levels, numbered 0 (NCCL_LOG_NONE) through 5 (NCCL_LOG_TRACE).
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
// Subsystem identifiers. Each named subsystem is a distinct power of two;
// NCCL_ALL (~0) has every bit set.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;

// printf-style logging callback: level, flags, source file and line, then a
// format string with variadic arguments.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#endif

--------------------------------------------------------------------------------
/ext-profiler/example/nccl/err.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_

/* Error type for plugins */
typedef enum { ncclSuccess                 =  0,
               ncclUnhandledCudaError      =  1,
               ncclSystemError             =  2,
               ncclInternalError           =  3,
               ncclInvalidArgument         =  4,
               ncclInvalidUsage            =  5,
               ncclRemoteError             =  6 } ncclResult_t;

#endif

--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include "common.h" 14 | #include "err.h" 15 | 16 | #include "profiler_v1.h" 17 | 18 | #endif // end include guard 19 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_TYPES_H_ 6 | #define NCCL_TYPES_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | } ncclDataType_t; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext-profiler/example/print_event.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PRINT_EVENT_H_ 8 | #define PRINT_EVENT_H_ 9 | 10 | void debugEvent(void* eHandle, const char* tag); 11 | void printEvent(FILE* fh, void* handle); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /ext-tuner/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
#
# See LICENSE.txt for license information
#
NCCL_HOME:=../../build/
# "?=" lets a CUDA_HOME from the environment take effect, matching the
# google-fastsocket Makefile; the default location is unchanged.
CUDA_HOME?=/usr/local/cuda
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO:=libnccl-tuner.so

# Neither target names a file that its rule creates.
.PHONY: default clean

default: $(PLUGIN_SO)

$(PLUGIN_SO): plugin.c
	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

clean:
	rm -f $(PLUGIN_SO)

--------------------------------------------------------------------------------
/ext-tuner/example/nccl/common.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#endif

--------------------------------------------------------------------------------
/ext-tuner/example/nccl/err.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclInvalidUsage = 5, 15 | ncclRemoteError = 6 } ncclResult_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-tuner/example/nccl/tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_TUNER_H_ 9 | #define NCCL_TUNER_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | #include "err.h" 16 | 17 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 18 | typedef enum { 19 | ncclFuncBroadcast = 0, 20 | ncclFuncReduce = 1, 21 | ncclFuncAllGather = 2, 22 | ncclFuncReduceScatter = 3, 23 | ncclFuncAllReduce = 4, 24 | ncclFuncSendRecv = 5, 25 | ncclFuncSend = 6, 26 | ncclFuncRecv = 7, 27 | ncclNumFuncs = 8 28 | } ncclFunc_t; 29 | 30 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* 31 | #define NCCL_ALGO_UNDEF -1 32 | #define NCCL_ALGO_TREE 0 33 | #define NCCL_ALGO_RING 1 34 | #define NCCL_ALGO_COLLNET_DIRECT 2 35 | #define NCCL_ALGO_COLLNET_CHAIN 3 36 | #define NCCL_ALGO_NVLS 4 37 | #define NCCL_ALGO_NVLS_TREE 5 38 | #define NCCL_ALGO_PAT 6 39 | 40 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 41 | #define NCCL_PROTO_UNDEF -1 42 | #define NCCL_PROTO_LL 0 43 | #define NCCL_PROTO_LL128 1 44 | #define NCCL_PROTO_SIMPLE 2 45 | 46 | #define NCCL_ALGO_PROTO_IGNORE -1.0 47 | 48 | // API to be implemented by external tuner 49 | typedef struct { 50 | 
// Name of the tuner 51 | const char* name; 52 | 53 | // Initializes tuner states. 54 | // Inputs: 55 | // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. 56 | // - nNodes: number of nodes in current communicator. 57 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. 58 | // Outputs: 59 | // - context: tuner context object 60 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 61 | 62 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 63 | // Inputs: 64 | // - context: tuner context object 65 | // - collType: collective type , e.g., allreduce, allgather… 66 | // - nBytes: collective size in bytes 67 | // - numPipeOps: number of operations in the group 68 | // - numAlgo: number of algorithms in collCostTable 69 | // - numProto: number of protocols in collCostTable 70 | // 71 | // Outputs: 72 | // - nChannels: number of channels (hence SMs) to be used. 73 | // 74 | // InOut: 75 | // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. 76 | // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). 77 | // 78 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 79 | // default tuning for the given collective. 80 | // Also, the plugin is allowed to not set any output, or set only the 81 | // algorithm and protocol, but not only the algorithm or only the protocol. 82 | // Unset fields will be set automatically by NCCL. 83 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 84 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 85 | int* nChannels); 86 | 87 | // Terminates the plugin and cleans up any resources that the plugin allocated. 
88 | // context: tuner context object 89 | ncclResult_t (*destroy)(void* context); 90 | } ncclTuner_v3_t; 91 | 92 | typedef ncclTuner_v3_t ncclTuner_t; 93 | 94 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /ext-tuner/example/plugin.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "tuner.h" 8 | 9 | #define __hidden __attribute__ ((visibility("hidden"))) 10 | 11 | __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } 12 | 13 | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, 14 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 15 | int* nChannels) { 16 | // Update NCCL core generated cost table. 
Updated table will be evaluated by NCCL to pick the best algo/proto combo 17 | if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { 18 | collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; 19 | } 20 | *nChannels = 1; 21 | return ncclSuccess; 22 | } 23 | 24 | __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } 25 | 26 | #define PLUGIN_NAME "Example" 27 | 28 | const ncclTuner_v3_t ncclTunerPlugin_v3 = { 29 | .name = PLUGIN_NAME, 30 | .init = pluginInit, 31 | .getCollInfo = pluginGetCollInfo, 32 | .destroy = pluginDestroy 33 | }; 34 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 
# astyle options, download location and build paths used by the format target.
ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
ASTYLEDIR := $(BUILDDIR)/contrib
ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
ASTYLEVER := 3.1
ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"

$(ASTYLEDIR) :
	@mkdir -p $(ASTYLEDIR)

# The directory is an order-only prerequisite: it must exist before the
# download, but a change of its timestamp (which happens whenever something is
# created inside it) must not trigger a fresh download.
$(ASTYLETAR) : | $(ASTYLEDIR)
	@wget -q -O $(ASTYLETAR) $(ASTYLEURL)

$(ASTYLEBLD) : $(ASTYLETAR)
	@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)

$(ASTYLEBIN) : $(ASTYLEBLD)
	${MAKE} -C $(ASTYLEBLD)

.PHONY : format
format : $(ASTYLEBIN)
	@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)

--------------------------------------------------------------------------------
/makefiles/version.mk:
--------------------------------------------------------------------------------
##### version
NCCL_MAJOR   := 2
NCCL_MINOR   := 23
NCCL_PATCH   := 4
NCCL_SUFFIX  :=
PKG_REVISION := 1

--------------------------------------------------------------------------------
/pkg/Makefile:
--------------------------------------------------------------------------------
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Every top-level target here is a command, not a file.
.PHONY : all clean default build prep

default : build
build : debian.build txz.build

BUILDDIR ?= $(abspath ../build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := debian txz
all:   ${TARGETS:%=%.build}
prep:  ${TARGETS:%=%.prep}
build: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}

# Each pattern rule forwards to the sub-directory named by the stem.
%.prep:
	${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}

%.build:
	${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}

%.clean:
	${MAKE} -C $* clean

--------------------------------------------------------------------------------
/pkg/debian/.gitignore:
--------------------------------------------------------------------------------
/*.debhelper.log
/*.debhelper
/*.substvars
/tmp/
/files
/libnccl1/
/libnccl-dev/

--------------------------------------------------------------------------------
/pkg/debian/Makefile:
--------------------------------------------------------------------------------
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
DEBPREPDIR := $(BUILDDIR)/debian
PKGDIR  := $(BUILDDIR)/pkg/deb/

DEBGEN_IN  := $(wildcard *.in)
DEBGEN     := $(DEBGEN_IN:.in=)
DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))

PKG_TIMESTAMP  := $(shell date -R)
PKG_ARCH       ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
PKG_MULTIARCH  ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)

.PHONY : prep build clean

prep : $(DEBTARGETS)
	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)

# "cd ... &&" ensures debuild does not run in the wrong directory if the
# change of directory fails.
build : prep
	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
	@printf "Building Debian package\n"
	(cd $(BUILDDIR) && debuild -eLD_LIBRARY_PATH -uc -us -d -b)
	mkdir -p $(PKGDIR)
	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/

clean:
	rm -Rf $(DEBPREPDIR) $(PKGDIR)

# Generate a packaging file from its .in template by substituting the
# ${nccl:*}, ${cuda:*} and ${pkg:*} placeholders.
$(DEBPREPDIR)/% : %.in
	@printf "Generating %-35s > %s\n" $< $@
	mkdir -p $(DEBPREPDIR)
	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
	    $< > $@

# Files without a template are copied as they are.
$(DEBPREPDIR)/% : %
	@printf "Grabbing   %-35s > %s\n" $< $@
	mkdir -p $(DEBPREPDIR)
	cp -f $< $@

--------------------------------------------------------------------------------
/pkg/debian/changelog.in:
--------------------------------------------------------------------------------
nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium

  * Automatic Debian package from build

 -- cudatools ${pkg:Timestamp}

--------------------------------------------------------------------------------
/pkg/debian/compat:
--------------------------------------------------------------------------------
9

--------------------------------------------------------------------------------
/pkg/debian/control.in:
--------------------------------------------------------------------------------
Source: nccl
Section: libs
Maintainer: cudatools
Priority: optional
Build-depends: debhelper(>=9)
Standards-Version: 3.9.5

Package: libnccl${nccl:Major}
Section: libs
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collective Communication Library (NCCL) Runtime
 NCCL (pronounced "Nickel") is a stand-alone library of standard collective
 communication routines for GPUs, implementing all-reduce, all-gather, reduce,
 broadcast, and reduce-scatter.
 It has been optimized to achieve high bandwidth on any platform using PCIe,
 NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
 sockets.

Package: libnccl-dev
Section: libdevel
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collective Communication Library (NCCL) Development Files
 NCCL (pronounced "Nickel") is a stand-alone library of standard collective
 communication routines for GPUs, implementing all-reduce, all-gather, reduce,
 broadcast, and reduce-scatter.
 It has been optimized to achieve high bandwidth on any platform using PCIe,
 NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
 sockets.
31 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | ../../LICENSE.txt -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | include/nccl.h /usr/include 2 | include/nccl_net.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | lib/pkgconfig/nccl.pc /usr/lib/${pkg:MultiArch}/pkgconfig 6 | lib/msccl-algorithms /usr/share/nccl 7 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | -------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | 
-------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /pkg/redhat/nccl.spec.in: -------------------------------------------------------------------------------- 1 | Name: libnccl 2 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} 3 | Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} 4 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 5 | 6 | Group: Development/Libraries 7 | License: BSD 8 | URL: http://developer.nvidia.com/nccl 9 | Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz 10 | Requires(pre,preun): /sbin/ldconfig 11 | 12 | %description 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, 
all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | %package devel 21 | Summary: NVIDIA Collective Communication Library (NCCL) Development Files 22 | Group: Development/Libraries 23 | %description devel 24 | NCCL development files 25 | 26 | %package static 27 | Summary: NVIDIA Collective Communication Library (NCCL) Static Library 28 | Group: Development/Libraries 29 | %description static 30 | NCCL static library 31 | 32 | %define debug_package %{nil} 33 | 34 | %prep 35 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q 36 | 37 | %build 38 | 39 | %install 40 | rm -rf $RPM_BUILD_ROOT 41 | install -m 755 -d $RPM_BUILD_ROOT 42 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} 43 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} 44 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} 45 | 46 | # devel 47 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} 48 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} 49 | install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} 50 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so 51 | 52 | # static 53 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} 54 | 55 | %post -p /sbin/ldconfig 56 | %postun -p /sbin/ldconfig 57 | 58 | %post devel -p /sbin/ldconfig 59 | %postun devel -p /sbin/ldconfig 60 | 61 | %clean 62 | rm -rf $RPM_BUILD_ROOT 63 | 64 | %files devel 65 | %doc LICENSE.txt 66 | %defattr(-,root,root,-) 67 | %{_includedir}/nccl.h 68 | %{_includedir}/nccl_net.h 69 | %{_libdir}/libnccl.so 70 | 71 | %files static 72 | %doc LICENSE.txt 73 | %defattr(-,root,root,-) 74 | %{_libdir}/libnccl_static.a 75 | 76 | %files 77 |
%doc LICENSE.txt 78 | %defattr(-,root,root,-) 79 | %{_libdir}/libnccl.so.${nccl:Major} 80 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} 81 | 82 | %changelog 83 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. 
11 | NCCLDIR=`basename $PWD` 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommited files ..." 17 | if [ "`git status -s | wc -l`" != "0" ]; then 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read 21 | fi 22 | 23 | cd .. 24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR 36 | -------------------------------------------------------------------------------- /pkg/txz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/txz 11 | PKGDIR := $(BUILDDIR)/pkg/txz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_ARCH := $(shell uname -m) 18 | 19 | prep: $(TXZTARGETS) 20 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 21 | 22 | build: prep 23 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /src/device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
# Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
#
# See LICENSE.txt for license information
#

SHELL := /usr/bin/env bash
# Disable make's built-in implicit rules: every rule this Makefile relies on is
# written out explicitly below. (The variable was previously misspelled as
# MAKEFALGS, so the flag never reached make.)
MAKEFLAGS += -r
.SUFFIXES:
.SECONDARY:

NCCLDIR := ../..
include $(NCCLDIR)/makefiles/common.mk
include $(NCCLDIR)/makefiles/version.mk

BUILDDIR ?= $(abspath ../../build)
OBJDIR := $(BUILDDIR)/obj/device

MANIFEST := $(OBJDIR)/manifest
DEVGLUE_OBJ := $(OBJDIR)/device_glue.o

INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include
NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
CXXFLAGS += $(INCFLAGS)

# SAY <label> <path>: print one progress line. The path is printed relative to
# NCCLDIR when its real path lies under NCCLDIR, otherwise as given.
SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY

# COMPILE,<object>,<source>: announce the step, create the object's directory,
# then compile with the command selected by the source suffix (.cu or .cc).
COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
define COMPILE
@$(SAY) "Compiling" $2;\
	mkdir -p $(dir $1);\
	$(call COMPILE$(suffix $2),$1,$2)
endef

# DEPENDS,<depfile>,<source>: run the suffix-selected compiler with -M, keep the
# prerequisite list that follows the first ':', drop the line-continuation and
# tab tokens, keep only files whose real path is under NCCLDIR, and write a
# single rule that makes both the .o and the .d depend on them.
DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1
define DEPENDS
@$(SAY) "Dependencies" $2;\
	mkdir -p $(dir $1);\
	mk=$$($(call DEPENDS$(suffix $2),$2));\
	[[ $$mk =~ ^[^:]*:(.*)$$ ]];\
	files=$${BASH_REMATCH[1]};\
	files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\
	files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\
	echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1
endef

all: $(MANIFEST)

ifeq (1,1)
# Case if the directory is generated on-demand:
$(OBJDIR)/gensrc: generate.py
	@mkdir -p $@
	(which python3 >/dev/null || \
	  (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \
	   printf "\n$${bar}\nERROR: Building NCCL requires a Python 3
installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \ 59 | exit 1)) \ 60 | && ./generate.py $@ "$(ONLY_FUNCS)" 61 | else 62 | # Case if the directory is pre-generated and checked in the repo as ./gen: 63 | $(OBJDIR)/gensrc: 64 | @mkdir -p $(OBJDIR); ln -srfn ./gen $@ 65 | endif 66 | 67 | # The trailing ";" is necessary to make this an "empty recipe": 68 | # https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html 69 | $(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ; 70 | 71 | -include $(OBJDIR)/gensrc/rules.mk 72 | # "gensrc/rules.mk" populates $(LIB_OBJS_GEN) 73 | 74 | SRCS = common.cu onerank.cu msccl_kernel.cu 75 | 76 | LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) 77 | 78 | $(OBJDIR)/%.o: % $(OBJDIR)/%.d 79 | $(call COMPILE,$@,$<) 80 | 81 | $(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d 82 | $(call COMPILE,$@,$(OBJDIR)/gensrc/$*) 83 | 84 | $(OBJDIR)/%.d: % 85 | $(call DEPENDS,$@,$<) 86 | 87 | $(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/% 88 | $(call DEPENDS,$@,$<) 89 | 90 | $(DEVGLUE_OBJ): $(LIB_OBJS) 91 | $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ 92 | 93 | $(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ) 94 | @echo $^ > $@ 95 | 96 | -include $(wildcard $(OBJDIR)/*.d) 97 | -include $(wildcard $(OBJDIR)/genobj/*.d) 98 | 99 | .PHONY: clean 100 | clean: 101 | rm -rf $(OBJDIR) 102 | -------------------------------------------------------------------------------- /src/device/broadcast.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { 14 | ncclRing *ring = &ncclShmem.channel.ring; 15 | const int rank = ring->userRanks[0]; 16 | const int nextRank = ring->userRanks[1]; 17 | const int root = work->root; 18 | size_t chunkCount; 19 | size_t channelCount; 20 | size_t gridOffset; 21 | ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); 22 | size_t offset; 23 | int nelem; 24 | 25 | T *inputBuf = (T*)work->sendbuff; 26 | T *outputBuf = (T*)work->recvbuff; 27 | // Coverity reports that the callee treats &ring->next as an array. However, due to the use of 28 | // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
29 | // coverity[callee_ptr_arith:FALSE] 30 | Primitives, 1, Proto, 0> 31 | prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); 32 | 33 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 34 | offset = gridOffset + elemOffset; 35 | nelem = min(chunkCount, channelCount - elemOffset); 36 | 37 | if (rank == root) { 38 | if (inputBuf == outputBuf) { 39 | prims.directSend(offset, offset, nelem); 40 | } else { 41 | prims.directCopySend(offset, offset, nelem); 42 | } 43 | } else if (nextRank == root) { 44 | prims.directRecv(offset, offset, nelem); 45 | } else { 46 | prims.directRecvCopyDirectSend(offset, nelem); 47 | } 48 | } 49 | } 50 | } 51 | 52 | template 53 | struct RunWorkColl { 54 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 55 | using Proto = ProtoSimple; 56 | runRing(tid, nthreads, work); 57 | } 58 | }; 59 | 60 | template 61 | struct RunWorkColl { 62 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 63 | runRing(tid, nthreads, work); 64 | } 65 | }; 66 | 67 | template 68 | struct RunWorkColl { 69 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 70 | runRing(tid, nthreads, work); 71 | } 72 | }; 73 | -------------------------------------------------------------------------------- /src/device/common.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "common.h" 10 | 11 | __shared__ ncclShmemData ncclShmem; 12 | #if __CUDA_ARCH__ < 700 13 | __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; 14 | #endif 15 | 16 | struct RunWorkNop { 17 | __device__ void run() {} 18 | }; 19 | 20 | __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 21 | ncclKernelMain<-1, RunWorkNop>(&args4K.args); 22 | } 23 | 24 | __device__ void ncclDevFunc_Nop() {} 25 | -------------------------------------------------------------------------------- /src/device/network/unpack/unpack_defs.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, Google LLC. All rights reserved. 3 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | #ifndef NET_DEVICE_UNPACK_DEFS_H 8 | #define NET_DEVICE_UNPACK_DEFS_H 9 | 10 | #include 11 | 12 | #include "device.h" 13 | 14 | #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16 15 | 16 | union alignas(16) loadMeta { 17 | uint64_t r64[2]; 18 | struct { 19 | uint32_t src_off; 20 | uint32_t len; 21 | uint64_t dst_off; 22 | }; 23 | }; 24 | static_assert(sizeof(union loadMeta) == 16, "Must be 16-byte aligned"); 25 | 26 | /****** global memory ******/ 27 | 28 | #define NET_UNPACK_MAX_QUEUE_DEPTH 16 // MAX_REQUESTS 29 | #define NET_UNPACK_MAX_SLICE_SIZE 4194304 // 4MB per Irecv call 30 | #define SLICE_PAGE_SIZE 4096 31 | #define NET_UNPACK_MAX_SLICE_PAGES \ 32 | (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2) // * 2 for slack, wasteful.. 
33 | 34 | struct netUnpackMeta { 35 | loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES]; 36 | uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH]; 37 | }; 38 | 39 | struct unpackNetDeviceHandle { 40 | struct netUnpackMeta *meta; // mapped 41 | void* bounce_buf; 42 | uint64_t head; 43 | }; 44 | 45 | /****** shared memory ******/ 46 | 47 | #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h 48 | #define NET_UNPACK_MAX_NPEERS 2 // The most you should have is 2 network peers per-group (indexed by index) 49 | #define WARP_SHM_PAGE_CNT 4 50 | #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta)) 51 | struct unpackShmem { 52 | void* bounce_buf; 53 | }; 54 | 55 | struct unpackGroupShmem { 56 | int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv 57 | uint64_t head[NET_UNPACK_MAX_NPEERS]; 58 | struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy 59 | }; 60 | 61 | #endif // NET_DEVICE_UNPACK_DEFS_H 62 | -------------------------------------------------------------------------------- /src/device/onerank.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include "alloc.h" 9 | #include "collectives.h" 10 | #include "common_kernel.h" 11 | #include "common.h" 12 | #include 13 | 14 | namespace { 15 | template 16 | __global__ __launch_bounds__(512, 1) 17 | void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) { 18 | using T = typename RedOp::EltType; 19 | int tid = threadIdx.x; 20 | int tn = blockDim.x; 21 | int bid = blockIdx.x; 22 | int bn = gridDim.x; 23 | 24 | // each block/channel gets a roughly equal segment of 16 byte packs 25 | constexpr int EltPerPack = 16/sizeof(T); 26 | intptr_t i0 = (bid+0)*alignUp(nElts/bn, EltPerPack); 27 | intptr_t i1 = (bid+1)*alignUp(nElts/bn, EltPerPack); 28 | i0 = min(i0, nElts); 29 | i1 = min(i1, nElts); 30 | src = (T*)src + i0; 31 | dst = (T*)dst + i0; 32 | 33 | if (redOpArgIsPtr) { 34 | if (redOpArg%2 != 0) { 35 | redOpArg = *reinterpret_cast(redOpArg); 36 | } else if (redOpArg%4 != 0) { 37 | redOpArg = *reinterpret_cast(redOpArg); 38 | } else if (redOpArg%8 != 0) { 39 | redOpArg = *reinterpret_cast(redOpArg); 40 | } else { 41 | redOpArg = *reinterpret_cast(redOpArg); 42 | } 43 | } 44 | reduceCopy 45 | (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0); 46 | } 47 | } 48 | 49 | ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) { 50 | size_t eltSize = ncclTypeSize(eltType); 51 | if (redOp.op != ncclDevPreMulSum) { 52 | if (dst != src) { 53 | NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream)); 54 | } 55 | return ncclSuccess; 56 | } 57 | 58 | void const* kernel; 59 | switch (eltType) { 60 | case ncclInt8: kernel = (void const*)&oneRankReduce>; break; 61 | case ncclUint8: kernel = (void const*)&oneRankReduce>; break; 62 | case ncclInt32: kernel = (void const*)&oneRankReduce>; 
break; 63 | case ncclUint32: kernel = (void const*)&oneRankReduce>; break; 64 | case ncclInt64: kernel = (void const*)&oneRankReduce>; break; 65 | case ncclUint64: kernel = (void const*)&oneRankReduce>; break; 66 | case ncclFloat16: kernel = (void const*)&oneRankReduce>; break; 67 | #if defined(__CUDA_BF16_TYPES_EXIST__) 68 | case ncclBfloat16: kernel = (void const*)&oneRankReduce>; break; 69 | #endif 70 | #if defined(__CUDA_FP8_TYPES_EXIST__) 71 | case ncclFp8E4M3: kernel = (void const*)&oneRankReduce>; break; 72 | case ncclFp8E5M2: kernel = (void const*)&oneRankReduce>; break; 73 | #endif 74 | case ncclFloat32: kernel = (void const*)&oneRankReduce>; break; 75 | case ncclFloat64: kernel = (void const*)&oneRankReduce>; break; 76 | default: return ncclInvalidArgument; 77 | } 78 | dim3 grid = {0, 1, 1}; 79 | grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10)); 80 | dim3 block = {512, 1, 1}; 81 | void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr}; 82 | CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream)); 83 | return ncclSuccess; 84 | } 85 | -------------------------------------------------------------------------------- /src/device/reduce.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { 14 | ncclRing *ring = &ncclShmem.channel.ring; 15 | const int nranks = ncclShmem.comm.nRanks; 16 | const int rank = ncclShmem.comm.rank; 17 | const int prevRank = ring->userRanks[nranks-1]; 18 | const int root = work->root; 19 | size_t chunkCount; 20 | size_t channelCount; 21 | size_t gridOffset; 22 | ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); 23 | size_t offset; 24 | int nelem; 25 | 26 | // Coverity reports that the callee treats &ring->next as an array. However, due to the use of 27 | // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 28 | // coverity[callee_ptr_arith:FALSE] 29 | Primitives, 0, Proto, 0> 30 | prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); 31 | 32 | if (prevRank == root) { 33 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 34 | offset = gridOffset + elemOffset; 35 | nelem = min(chunkCount, channelCount - elemOffset); 36 | prims.send(offset, nelem); 37 | } 38 | } 39 | else if (rank == root) { 40 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 41 | offset = gridOffset + elemOffset; 42 | nelem = min(chunkCount, channelCount - elemOffset); 43 | prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); 44 | } 45 | } 46 | else { 47 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 48 | offset = gridOffset + elemOffset; 49 | nelem = min(chunkCount, channelCount - elemOffset); 50 | prims.recvReduceSend(offset, nelem); 51 | } 52 | } 53 | } 54 | } 55 
| 56 | template 57 | struct RunWorkColl { 58 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 59 | using Proto = ProtoSimple; 60 | runRing(tid, nthreads, work); 61 | } 62 | }; 63 | 64 | template 65 | struct RunWorkColl { 66 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 67 | runRing(tid, nthreads, work); 68 | } 69 | }; 70 | 71 | template 72 | struct RunWorkColl { 73 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 74 | runRing(tid, nthreads, work); 75 | } 76 | }; 77 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ 8 | 9 | enum cudaError_t { cudaErrorStubLibrary = 34 }; 10 | 11 | extern "C" { 12 | 13 | cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); 14 | cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } 15 | 16 | cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); 17 | cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } 18 | 19 | cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); 20 | cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } 21 | 22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); 23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) 
{ return cudaErrorStubLibrary; } 24 | 25 | cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); 26 | cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | void dumpLine(int* values, int nranks, const char* prefix) { 10 | constexpr int line_length = 128; 11 | char line[line_length]; 12 | int num_width = snprintf(nullptr, 0, "%d", nranks-1); // safe as per "man snprintf" 13 | int n = snprintf(line, line_length, "%s", prefix); 14 | for (int i = 0; i < nranks && n < line_length-1; i++) { 15 | n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]); 16 | // At this point n may be more than line_length-1, so don't use it 17 | // for indexing into "line". 18 | } 19 | if (n >= line_length) { 20 | // Sprintf wanted to write more than would fit in the buffer. Assume 21 | // line_length is at least 4 and replace the end with "..." to 22 | // indicate that it was truncated. 23 | snprintf(line+line_length-4, 4, "..."); 24 | } 25 | INFO(NCCL_INIT, "%s", line); 26 | } 27 | 28 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { 29 | for (int r=0; r root ? rank-1 : rank) 10 | 11 | /* Btree which alternates leaves and nodes. 12 | * Assumes root is 0, which conveniently builds a tree on powers of two, 13 | * (because we have pow2-1 ranks) which lets us manipulate bits. 
14 | * Find first non-zero bit, then : 15 | * Find the parent : 16 | * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) 17 | * xx11[0] -> xx10[0] (3,7,11 below) 18 | * Find the children : 19 | * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) 20 | * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) 21 | * 22 | * Illustration : 23 | * 0---------------8 24 | * ______/ \______ 25 | * 4 12 26 | * / \ / \ 27 | * 2 6 10 \ 28 | * / \ / \ / \ \ 29 | * 1 3 5 7 9 11 13 30 | */ 31 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { 32 | int up, down0, down1; 33 | int bit; 34 | for (bit=1; bit 0 so it has to be our child 1, not 0. 42 | *d1 = nranks > 1 ? bit >> 1 : -1; 43 | return ncclSuccess; 44 | } 45 | 46 | up = (rank ^ bit) | (bit << 1); 47 | // if smaller than the parent, we are his first child, otherwise we're his second 48 | if (up >= nranks) up = (rank ^ bit); 49 | *parentChildType = (rank < up) ? 0 : 1; 50 | *u = up; 51 | 52 | int lowbit = bit >> 1; 53 | // down0 is always within bounds 54 | down0 = lowbit == 0 ? -1 : rank-lowbit; 55 | 56 | down1 = lowbit == 0 ? -1 : rank+lowbit; 57 | // Make sure down1 is within bounds 58 | while (down1 >= nranks) { 59 | down1 = lowbit == 0 ? -1 : rank+lowbit; 60 | lowbit >>= 1; 61 | } 62 | *d0 = down0; *d1 = down1; 63 | 64 | return ncclSuccess; 65 | } 66 | 67 | /* Build a double binary tree. Take the previous tree for the first tree. 68 | * For the second tree, we use a mirror tree (if nranks is even) 69 | * 70 | * 0---------------8 3----------------11 71 | * ______/ \ / \______ 72 | * 4 \ / 7 73 | * / \ \ / / \ 74 | * 2 6 10 1 5 9 75 | * / \ / \ / \ / \ / \ / \ 76 | * 1 3 5 7 9 11 0 2 4 6 8 10 77 | * 78 | * or shift it by one rank (if nranks is odd). 
79 | * 80 | * 0---------------8 1---------------9 81 | * ______/ \______ ______/ \______ 82 | * 4 12 5 0 83 | * / \ / / \ / 84 | * 2 6 10 3 7 11 85 | * / \ / \ / \ / \ / \ / \ 86 | * 1 3 5 7 9 11 2 4 6 8 10 12 87 | */ 88 | ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) { 89 | // First tree ... use a btree 90 | ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0); 91 | // Second tree ... mirror or shift 92 | if (nranks % 2 == 1) { 93 | // shift 94 | int shiftrank = (rank-1+nranks) % nranks; 95 | int u, d0, d1; 96 | ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1); 97 | *s1 = u == -1 ? -1 : (u+1) % nranks; 98 | *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; 99 | *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; 100 | } else { 101 | // mirror 102 | int u, d0, d1; 103 | ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1); 104 | *s1 = u == -1 ? -1 : nranks-1-u; 105 | *d1_0 = d0 == -1 ? -1 : nranks-1-d0; 106 | *d1_1 = d1 == -1 ? -1 : nranks-1-d1; 107 | } 108 | return ncclSuccess; 109 | } 110 | -------------------------------------------------------------------------------- /src/include/argcheck.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ARGCHECK_H_ 8 | #define NCCL_ARGCHECK_H_ 9 | 10 | #include "core.h" 11 | #include "info.h" 12 | 13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); 14 | ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname); 15 | ncclResult_t ArgsCheck(struct ncclInfo* info); 16 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /src/include/bootstrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_BOOTSTRAP_H_ 8 | #define NCCL_BOOTSTRAP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | struct ncclBootstrapHandle { 14 | uint64_t magic; 15 | union ncclSocketAddress addr; 16 | }; 17 | static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); 18 | 19 | ncclResult_t bootstrapNetInit(); 20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); 21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); 22 | ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm); 23 | ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); 24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); 25 | ncclResult_t bootstrapSend(void* commState, 
int peer, int tag, void* data, int size); 26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); 27 | ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag); 28 | ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size); 29 | ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag); 30 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); 31 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); 32 | ncclResult_t bootstrapClose(void* commState); 33 | ncclResult_t bootstrapAbort(void* commState); 34 | #endif 35 | -------------------------------------------------------------------------------- /src/include/channel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHANNEL_H_ 8 | #define NCCL_CHANNEL_H_ 9 | #include "comm.h" 10 | #include "utils.h" 11 | 12 | #include 13 | 14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 15 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 16 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 17 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); 18 | 19 | inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { 20 | if (comm->nNodes > 1) { 21 | int nodeDelta = p2pRound/comm->maxLocalRanks; 22 | int localDelta = p2pRound%comm->maxLocalRanks; 23 | int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); 24 | base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; 25 | return base & 0xff; 26 | } else { 27 | return p2pRound & 0xff; 28 | } 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/include/coll_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COLL_NET_H_ 8 | #define COLL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | 13 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 14 | 15 | // Translation to external API 16 | static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } 17 | static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } 18 | static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } 19 | static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } 20 | static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } 21 | static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } 22 | static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } 23 | /* DMA-BUF support */ 24 | static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } 25 | static ncclResult_t 
collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } 26 | static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { 27 | NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } 28 | static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } 29 | static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } 30 | static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } 31 | static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } 32 | 33 | static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. 
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_CORE_H_ 9 | #define NCCL_CORE_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include // For std::min/std::max 16 | #include "nccl.h" 17 | 18 | #ifdef PROFAPI 19 | #define NCCL_API(ret, func, args...) \ 20 | __attribute__ ((visibility("default"))) \ 21 | __attribute__ ((alias(#func))) \ 22 | ret p##func (args); \ 23 | extern "C" \ 24 | __attribute__ ((visibility("default"))) \ 25 | __attribute__ ((weak)) \ 26 | ret func(args) 27 | #else 28 | #define NCCL_API(ret, func, args...) \ 29 | extern "C" \ 30 | __attribute__ ((visibility("default"))) \ 31 | ret func(args) 32 | #endif // end PROFAPI 33 | 34 | #include "debug.h" 35 | #include "checks.h" 36 | #include "cudawrap.h" 37 | #include "alloc.h" 38 | #include "utils.h" 39 | #include "param.h" 40 | #include "nvtx.h" 41 | 42 | #endif // end include guard 43 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t 11 |
/* Convert one lowercase hexadecimal digit to its value.
 *
 * Returns 0..9 for '0'..'9', 10..15 for 'a'..'f', and -1 for every other
 * character (including uppercase 'A'..'F', ',' and the string terminator).
 *
 * The ranges are tested explicitly: computing 10 + c - 'a' for any c > '9'
 * and only checking the result against 0..15 would also accept the ten
 * characters just below 'a' ('W' .. '`') as the digits 0..9.
 */
static int hexToInt(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
  return -1;
}
19 | 20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) 21 | 22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { 23 | uint32_t cpumasks[CPU_SET_N_U32]; 24 | int m = CPU_SET_N_U32-1; 25 | cpumasks[m] = 0; 26 | for (int o=0; o=0; o--) { 49 | if (c == 0 && m8[o] == 0) continue; 50 | sprintf(str+c, "%02x", m8[o]); 51 | c+=2; 52 | if (o && o%4 == 0) { 53 | sprintf(str+c, ","); 54 | c++; 55 | } 56 | } 57 | str[c] = '\0'; 58 | return ncclSuccess; 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/include/cudawrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CUDAWRAP_H_ 8 | #define NCCL_CUDAWRAP_H_ 9 | 10 | #include 11 | #include 12 | #include "checks.h" 13 | 14 | // Is cuMem API usage enabled 15 | extern int ncclCuMemEnable(); 16 | extern int ncclCuMemHostEnable(); 17 | 18 | #if CUDART_VERSION >= 11030 19 | #include 20 | 21 | // Handle type used for cuMemCreate() 22 | extern CUmemAllocationHandleType ncclCuMemHandleType; 23 | 24 | #endif 25 | 26 | #define CUPFN(symbol) pfn_##symbol 27 | 28 | // Check CUDA PFN driver calls 29 | #define CUCHECK(cmd) do { \ 30 | CUresult err = pfn_##cmd; \ 31 | if( err != CUDA_SUCCESS ) { \ 32 | const char *errStr; \ 33 | (void) pfn_cuGetErrorString(err, &errStr); \ 34 | WARN("Cuda failure %d '%s'", err, errStr); \ 35 | return ncclUnhandledCudaError; \ 36 | } \ 37 | } while(false) 38 | 39 | #define CUCHECKGOTO(cmd, res, label) do { \ 40 | CUresult err = pfn_##cmd; \ 41 | if( err != CUDA_SUCCESS ) { \ 42 | const char *errStr; \ 43 | (void) pfn_cuGetErrorString(err, &errStr); \ 44 | WARN("Cuda failure %d '%s'", err, errStr); \ 45 | res = ncclUnhandledCudaError; \ 46 | goto label; \ 47 | } \ 48 | } while(false) 49 | 50 | // Report failure but clear error and continue 51 | #define CUCHECKIGNORE(cmd) do { \ 52 | CUresult err = pfn_##cmd; \ 53 | if( err != CUDA_SUCCESS ) { \ 54 | const char *errStr; \ 55 | (void) pfn_cuGetErrorString(err, &errStr); \ 56 | INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ 57 | } \ 58 | } while(false) 59 | 60 | #define CUCHECKTHREAD(cmd, args) do { \ 61 | CUresult err = pfn_##cmd; \ 62 | if (err != CUDA_SUCCESS) { \ 63 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ 64 | args->ret = ncclUnhandledCudaError; \ 65 | return args; \ 66 | } \ 67 | } while(0) 68 | 69 | #define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol 70 | 71 | #if 
CUDART_VERSION >= 11030 72 | /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ 73 | DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); 74 | DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); 75 | DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); 76 | DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); 77 | DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); 78 | DECLARE_CUDA_PFN_EXTERN(cuCtxCreate); 79 | DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); 80 | DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent); 81 | DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); 82 | DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice); 83 | DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute); 84 | DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel); 85 | #if CUDART_VERSION >= 11080 86 | DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx); 87 | #endif 88 | // cuMem API support 89 | DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve); 90 | DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree); 91 | DECLARE_CUDA_PFN_EXTERN(cuMemCreate); 92 | DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity); 93 | DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle); 94 | DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle); 95 | DECLARE_CUDA_PFN_EXTERN(cuMemMap); 96 | DECLARE_CUDA_PFN_EXTERN(cuMemRelease); 97 | DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle); 98 | DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess); 99 | DECLARE_CUDA_PFN_EXTERN(cuMemUnmap); 100 | DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle); 101 | #if CUDA_VERSION >= 11070 102 | DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support 103 | #endif 104 | #if CUDA_VERSION >= 12010 105 | /* NVSwitch Multicast support */ 106 | DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice); 107 | DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem); 108 | DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr); 109 | DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate); 110 | DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity); 111 | DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind); 112 | #endif 113 | #endif 114 | 115 | ncclResult_t 
ncclCudaLibraryInit(void); 116 | 117 | extern int ncclCudaDriverVersionCache; 118 | extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() 119 | 120 | inline ncclResult_t ncclCudaDriverVersion(int* driver) { 121 | int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); 122 | if (version == -1) { 123 | CUDACHECK(cudaDriverGetVersion(&version)); 124 | __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); 125 | } 126 | *driver = version; 127 | return ncclSuccess; 128 | } 129 | #endif 130 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_DEBUG_H_ 8 | #define NCCL_INT_DEBUG_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_common.h" 12 | #include 13 | #include 14 | 15 | // Conform to pthread and NVTX standard 16 | #define NCCL_THREAD_NAMELEN 16 17 | 18 | extern int ncclDebugLevel; 19 | extern FILE *ncclDebugFile; 20 | 21 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 22 | 23 | // Let code temporarily downgrade WARN into INFO 24 | extern thread_local int ncclDebugNoWarn; 25 | extern char ncclLastError[]; 26 | 27 | #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 28 | #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 29 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 30 | #define TRACE_CALL(...) 
ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) 31 | 32 | #ifdef ENABLE_TRACE 33 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 34 | #else 35 | #define TRACE(...) 36 | #endif 37 | 38 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | #include "utils.h" 14 | 15 | #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t) 16 | #define NCCL_LL128_ALIGNMENT_PER_WARP 480 17 | #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) 18 | #define NCCL_BYTES_ALIGNMENT 16 19 | 20 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); 21 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 22 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); 23 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 24 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); 25 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 26 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm); 27 | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); 28 | 29 | #endif // End include guard 30 | -------------------------------------------------------------------------------- 
/src/include/ibvsymbols.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_IBV_SYMBOLS_H_ 2 | #define NCCL_IBV_SYMBOLS_H_ 3 | 4 | #ifdef NCCL_BUILD_RDMA_CORE 5 | #include 6 | #else 7 | #include "ibvcore.h" 8 | #endif 9 | 10 | #include "nccl.h" 11 | 12 | /* IB Verbs Function Pointers*/ 13 | struct ncclIbvSymbols { 14 | int (*ibv_internal_fork_init)(void); 15 | struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); 16 | void (*ibv_internal_free_device_list)(struct ibv_device **list); 17 | const char * (*ibv_internal_get_device_name)(struct ibv_device *device); 18 | struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); 19 | int (*ibv_internal_close_device)(struct ibv_context *context); 20 | int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); 21 | void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); 22 | int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); 23 | int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); 24 | int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); 25 | int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); 26 | struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); 27 | int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); 28 | struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); 29 | struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); 30 | /* DMA-BUF support */ 31 | struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 32 | int (*ibv_internal_dereg_mr)(struct 
ibv_mr *mr); 33 | struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); 34 | int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); 35 | struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); 36 | int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 37 | int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); 38 | const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); 39 | int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); 40 | int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); 41 | }; 42 | 43 | /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ 44 | ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); 45 | 46 | #endif // NCCL_IBV_SYMBOLS_H_ 47 | -------------------------------------------------------------------------------- /src/include/ibvwrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 3 | * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. 4 | * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. 5 | * Copyright (c) 2005 PathScale, Inc. All rights reserved. 6 | * 7 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
8 | * 9 | * See LICENSE.txt for license information 10 | ************************************************************************/ 11 | 12 | #ifndef NCCL_IBVWRAP_H_ 13 | #define NCCL_IBVWRAP_H_ 14 | 15 | #ifdef NCCL_BUILD_RDMA_CORE 16 | #include 17 | #else 18 | #include "ibvcore.h" 19 | #endif 20 | 21 | #include "core.h" 22 | #include 23 | #include 24 | 25 | typedef enum ibv_return_enum 26 | { 27 | IBV_SUCCESS = 0, //!< The operation was successful 28 | } ibv_return_t; 29 | 30 | ncclResult_t wrap_ibv_symbols(void); 31 | /* NCCL wrappers of IB verbs functions */ 32 | ncclResult_t wrap_ibv_fork_init(void); 33 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); 34 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); 35 | const char *wrap_ibv_get_device_name(struct ibv_device *device); 36 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); 37 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context); 38 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); 39 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); 40 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); 41 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); 42 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); 43 | ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); 44 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); 45 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); 46 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); 47 | struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t 
length, int access); 48 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); 49 | /* DMA-BUF support */ 50 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 51 | struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 52 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); 53 | ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); 54 | ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); 55 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); 56 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); 57 | static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { 58 | int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ 59 | if (done < 0) { 60 | WARN("Call to ibv_poll_cq() returned %d", done); 61 | return ncclSystemError; 62 | } 63 | *num_done = done; 64 | return ncclSuccess; 65 | } 66 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); 67 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 68 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); 69 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); 70 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); 71 | 72 | static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { 73 | int ret = 
qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 74 | if (ret != IBV_SUCCESS) { 75 | WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); 76 | return ncclSystemError; 77 | } 78 | return ncclSuccess; 79 | } 80 | 81 | static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { 82 | int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 83 | if (ret != IBV_SUCCESS) { 84 | WARN("ibv_post_recv() failed with error %s", strerror(ret)); 85 | return ncclSystemError; 86 | } 87 | return ncclSuccess; 88 | } 89 | 90 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); 91 | 92 | #endif //End include guard 93 | -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INFO_H_ 8 | #define NCCL_INFO_H_ 9 | 10 | #include "nccl.h" 11 | #include "collectives.h" 12 | #include "core.h" 13 | #include "utils.h" 14 | 15 | // Used to pass NCCL call information between functions 16 | struct ncclInfo { 17 | ncclFunc_t coll; 18 | const char* opName; 19 | // NCCL Coll Args 20 | const void* sendbuff; 21 | void* recvbuff; 22 | size_t count; 23 | ncclDataType_t datatype; 24 | ncclRedOp_t op; 25 | int root; // peer for p2p operations 26 | ncclComm_t comm; 27 | cudaStream_t stream; 28 | // Algorithm details 29 | int chunkSteps; 30 | int sliceSteps; 31 | }; 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/include/ipcsocket.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See COPYRIGHT for license information 5 | */ 6 | 7 | #ifndef NCCL_IPCSOCKET_H 8 | #define NCCL_IPCSOCKET_H 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define NCCL_IPC_SOCKNAME_LEN 64 24 | 25 | struct ncclIpcSocket { 26 | int fd; 27 | char socketName[NCCL_IPC_SOCKNAME_LEN]; 28 | volatile uint32_t* abortFlag; 29 | }; 30 | 31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); 32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); 33 | ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); 34 | 35 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); 36 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); 37 | 38 | ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash); 39 | ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd); 40 | 41 | #endif /* NCCL_IPCSOCKET_H */ 42 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_kernel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_KERNEL_H_ 7 | #define MSCCL_KERNEL_H_ 8 | 9 | #define MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto) mscclKernel_##devredop##_##type##_##proto 10 | 11 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, proto) \ 12 | __global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork work); 13 | 14 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type) \ 15 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL) \ 16 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL128) \ 17 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, Simple) 18 | 19 | #if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__) 20 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop) \ 21 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \ 22 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \ 23 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \ 24 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \ 25 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \ 26 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) \ 27 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half) \ 28 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float) \ 29 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double) \ 30 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_bfloat16) \ 31 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_fp8_e4m3) \ 32 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_fp8_e5m2) 33 | #elif defined(__CUDA_BF16_TYPES_EXIST__) 34 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop) \ 35 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \ 36 | 
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \ 37 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \ 38 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \ 39 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \ 40 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) \ 41 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half) \ 42 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float) \ 43 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double) \ 44 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, __nv_bfloat16) 45 | #else 46 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop) \ 47 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \ 48 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \ 49 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \ 50 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \ 51 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \ 52 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) \ 53 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half) \ 54 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float) \ 55 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double) 56 | #endif 57 | 58 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(devredop) \ 59 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t) \ 60 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t) \ 61 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t) \ 62 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t) \ 63 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t) \ 64 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t) 65 | 66 | #define MSCCL_DECL_KERNEL_ENTRY_FUNC() \ 67 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum) \ 68 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod) \ 69 | 
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(MinMax) \ 70 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum) \ 71 | MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv) 72 | 73 | MSCCL_DECL_KERNEL_ENTRY_FUNC() 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_lifecycle.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_LIFECYCLE_H_ 7 | #define MSCCL_LIFECYCLE_H_ 8 | 9 | #include "enqueue.h" 10 | 11 | #include "msccl/msccl_struct.h" 12 | 13 | bool mscclEnabled(); 14 | 15 | void mscclSetIsCallerFlag(); 16 | void mscclClearIsCallerFlag(); 17 | bool mscclIsCaller(); 18 | 19 | bool mscclAvailable(); 20 | 21 | int getEnvInt(const char* env, int64_t deftVal); 22 | 23 | ncclResult_t mscclSchedulerInit(ncclComm_t comm, int* numChannelsRequired); 24 | 25 | ncclResult_t mscclInit(ncclComm_t comm); 26 | 27 | ncclResult_t mscclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); 28 | 29 | ncclResult_t mscclGroupStart(); 30 | 31 | ncclResult_t mscclEnqueueCheck( 32 | const void* sendbuff, const size_t sendcounts[], const size_t sdispls[], 33 | void* recvbuff, const size_t recvcounts[], const size_t rdispls[], 34 | size_t count, ncclDataType_t datatype, int root, int peer, ncclRedOp_t op, 35 | mscclFunc_t mscclFunc, ncclComm_t comm, cudaStream_t stream); 36 | 37 | ncclResult_t mscclGroupEnd(); 38 | 39 | ncclResult_t mscclTeardown(); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_parser.h: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 4 | * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef MSCCL_PARSER_H_ 10 | #define MSCCL_PARSER_H_ 11 | 12 | #include "nccl.h" 13 | #include "debug.h" 14 | #include "checks.h" 15 | #include <stdlib.h> /* NOTE(review): header name was lost in extraction; <stdlib.h> assumed because strtol/strtoll are used below - confirm */ 16 | 17 | #include "msccl/msccl_struct.h" 18 | 19 | // A few constraints to make the implementation easy 20 | #define MAX_STR_LEN 255 21 | #define MAX_ATTR_COUNT 16 22 | #define MAX_SUBS 1024 23 | #define MAX_NODES 4096 24 | 25 | #define NODE_TYPE_NONE 0 26 | #define NODE_TYPE_OPEN 1 27 | #define NODE_TYPE_CLOSE 2 28 | #define NODE_TYPE_SINGLE 3 29 | 30 | struct mscclXmlNode { 31 | char name[MAX_STR_LEN+1]; 32 | struct { 33 | char key[MAX_STR_LEN+1]; 34 | char value[MAX_STR_LEN+1]; 35 | } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params 36 | int nAttrs; 37 | int type; 38 | struct mscclXmlNode* parent; 39 | struct mscclXmlNode* subs[MAX_SUBS]; 40 | int nSubs; 41 | }; 42 | 43 | struct mscclXml { 44 | struct mscclXmlNode nodes[MAX_NODES]; 45 | int maxIndex; 46 | }; 47 | 48 | /* Sets *index to the position of the first attribute of node whose key equals attrName (compared over at most MAX_STR_LEN chars), or to -1 when there is none; always returns ncclSuccess. */ static ncclResult_t mscclXmlGetAttrIndex(struct mscclXmlNode* node, const char* attrName, int* index) { 49 | *index = -1; 50 | const int nAttrs = node->nAttrs; 51 | for (int a=0; a<nAttrs; a++) { 52 | if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) { 53 | *index = a; 54 | return ncclSuccess; 55 | } 56 | } 57 | return ncclSuccess; 58 | } 59 | 60 | /* Sets *value to the value string of attribute attrName, or to NULL when the attribute is absent. */ static ncclResult_t mscclXmlGetAttr(struct mscclXmlNode* node, const char* attrName, const char** value) { 61 | int index; 62 | NCCLCHECK(mscclXmlGetAttrIndex(node, attrName, &index)); 63 | *value = index == -1 ? 
NULL : node->attrs[index].value; 64 | return ncclSuccess; 65 | } 66 | 67 | /* Same lookup as mscclXmlGetAttr, but a missing attribute is reported with WARN and ncclInternalError. */ static ncclResult_t mscclXmlGetAttrStr(struct mscclXmlNode* node, const char* attrName, const char** value) { 68 | NCCLCHECK(mscclXmlGetAttr(node, attrName, value)); 69 | if (*value == NULL) { 70 | WARN("Attribute %s of node %s not found", attrName, node->name); 71 | return ncclInternalError; 72 | } 73 | return ncclSuccess; 74 | } 75 | /* Parses a required attribute with strtol (base 0, so the prefix selects the radix); the long result is stored into an int with no range or syntax check. */ static ncclResult_t mscclXmlGetAttrInt(struct mscclXmlNode* node, const char* attrName, int* value) { 76 | const char* str; 77 | NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &str)); 78 | *value = strtol(str, NULL, 0); 79 | return ncclSuccess; 80 | } 81 | 82 | /* Parses a required attribute with strtoll (base 0); no syntax check is performed. */ static ncclResult_t mscclXmlGetAttrInt64(struct mscclXmlNode* node, const char* attrName, int64_t* value) { 83 | const char* str; 84 | NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &str)); 85 | *value = strtoll(str, NULL, 0); 86 | return ncclSuccess; 87 | } 88 | 89 | /* Sets *node to the first of the xml->maxIndex nodes whose name equals tagName, or to NULL when there is none; always returns ncclSuccess. */ static ncclResult_t mscclXmlFindTag(struct mscclXml* xml, const char* tagName, struct mscclXmlNode** node) { 90 | *node = NULL; 91 | for (int i=0; i<xml->maxIndex; i++) { 92 | struct mscclXmlNode* n = xml->nodes+i; 93 | if (strcmp(n->name, tagName) == 0) { 94 | *node = n; 95 | return ncclSuccess; 96 | } 97 | } 98 | return ncclSuccess; 99 | } 100 | 101 | ncclResult_t mscclGetAlgoFromXmlFile(const char* xmlGraphFile, struct mscclAlgo* algo, int rank); 102 | 103 | ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* xmlGraphFile, struct mscclAlgoMeta* algoMeta); 104 | 105 | #endif 106 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_scheduler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_SCHEDULER_H_ 7 | #define MSCCL_SCHEDULER_H_ 8 | 9 | typedef enum { mscclFuncReduce = 0, 10 | mscclFuncBroadcast = 1, 11 | mscclFuncAllReduce = 2, 12 | mscclFuncReduceScatter = 3, 13 | mscclFuncAllGather = 4, 14 | mscclFuncSend = 5, 15 | mscclFuncRecv = 6, 16 | mscclFuncGather = 7, 17 | mscclFuncScatter = 8, 18 | mscclFuncAllToAll = 9, 19 | mscclFuncAllToAllv = 10, 20 | mscclNumFuncs = 11 } mscclFunc_t; 21 | 22 | struct mscclSchedulerParam { 23 | const void* sendBuff; 24 | const size_t* sendCounts; 25 | const size_t* sDisPls; 26 | void* recvBuff; 27 | const size_t* recvCounts; 28 | const size_t* rDisPls; 29 | size_t count; 30 | ncclDataType_t dataType; 31 | int root; 32 | int peer; 33 | ncclRedOp_t op; 34 | mscclFunc_t func; 35 | int rank; 36 | int nRanks; 37 | bool scheduled; 38 | mscclAlgoHandle_t handle; 39 | }; 40 | 41 | typedef struct { 42 | // Name of the scheduler (mainly for logs) 43 | const char* name; 44 | // Load all algorithms 45 | ncclResult_t (*init)(); 46 | // Select an algorithm 47 | ncclResult_t (*selectAlgo)(struct mscclSchedulerParam* param); 48 | // Unload all algorithms 49 | ncclResult_t (*teardown)(); 50 | } mscclSchedulerInterface; 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_setup.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_SETUP_H_ 7 | #define MSCCL_SETUP_H_ 8 | 9 | #include "comm.h" 10 | #include "msccl/msccl_struct.h" 11 | 12 | ncclResult_t mscclGetCaptureStatus(cudaStream_t stream); 13 | 14 | ncclResult_t mscclSetupScratch(struct mscclAlgo* hostAlgo, cudaStream_t stream); 15 | 16 | ncclResult_t mscclSetupSyncFlags(cudaStream_t stream); 17 | 18 | ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm); 19 | 20 | ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType); 21 | 22 | ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, cudaStream_t stream); 23 | 24 | ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count, 25 | ncclDataType_t dataType, ncclRedOp_t op, struct mscclAlgo* hostAlgo, struct mscclAlgo* devAlgo, 26 | ncclComm_t comm, cudaStream_t stream); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_status.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_STATUS_H_ 7 | #define MSCCL_STATUS_H_ 8 | 9 | #include "msccl/msccl_struct.h" 10 | 11 | mscclStatus& mscclGetStatus(); 12 | 13 | mscclSavedProxyArgs& mscclGetSavedProxyArgs(); 14 | 15 | mscclThreadLocalStatus& mscclGetThreadLocalStatus(); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/include/nccl_common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | 10 | typedef enum { 11 | NCCL_LOG_NONE = 0, 12 | NCCL_LOG_VERSION = 1, 13 | NCCL_LOG_WARN = 2, 14 | NCCL_LOG_INFO = 3, 15 | NCCL_LOG_ABORT = 4, 16 | NCCL_LOG_TRACE = 5 17 | } ncclDebugLogLevel; 18 | 19 | typedef enum { 20 | NCCL_INIT = 0x1, 21 | NCCL_COLL = 0x2, 22 | NCCL_P2P = 0x4, 23 | NCCL_SHM = 0x8, 24 | NCCL_NET = 0x10, 25 | NCCL_GRAPH = 0x20, 26 | NCCL_TUNING = 0x40, 27 | NCCL_ENV = 0x80, 28 | NCCL_ALLOC = 0x100, 29 | NCCL_CALL = 0x200, 30 | NCCL_PROXY = 0x400, 31 | NCCL_NVLS = 0x800, 32 | NCCL_BOOTSTRAP = 0x1000, 33 | NCCL_REG = 0x2000, 34 | NCCL_PROFILE = 0x4000, 35 | NCCL_ALL = ~0 36 | } ncclDebugLogSubSys; 37 | 38 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 39 | 40 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 41 | typedef enum { 42 | ncclFuncBroadcast = 0, 43 | ncclFuncReduce = 1, 44 | ncclFuncAllGather = 2, 45 | ncclFuncReduceScatter = 3, 46 | ncclFuncAllReduce = 4, 47 | ncclFuncSendRecv = 5, 48 | ncclFuncSend = 6, 49 | ncclFuncRecv = 7, 50 | ncclNumFuncs = 8 51 | } 
ncclFunc_t; 52 | 53 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* 54 | #define NCCL_ALGO_UNDEF -1 55 | #define NCCL_ALGO_TREE 0 56 | #define NCCL_ALGO_RING 1 57 | #define NCCL_ALGO_COLLNET_DIRECT 2 58 | #define NCCL_ALGO_COLLNET_CHAIN 3 59 | #define NCCL_ALGO_NVLS 4 60 | #define NCCL_ALGO_NVLS_TREE 5 61 | #define NCCL_ALGO_PAT 6 62 | 63 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 64 | #define NCCL_PROTO_UNDEF -1 65 | #define NCCL_PROTO_LL 0 66 | #define NCCL_PROTO_LL128 1 67 | #define NCCL_PROTO_SIMPLE 2 68 | 69 | #define NCCL_ALGO_PROTO_IGNORE -1.0 70 | #endif 71 | -------------------------------------------------------------------------------- /src/include/nccl_tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_TUNER_H_ 9 | #define NCCL_TUNER_H_ 10 | 11 | #include "nccl.h" 12 | #include "nccl_common.h" 13 | 14 | // API to be implemented by external tuner 15 | typedef struct { 16 | // Name of the tuner 17 | const char* name; 18 | 19 | // Initializes tuner states. 20 | // Inputs: 21 | // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. 22 | // - nNodes: number of nodes in current communicator. 23 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. 24 | // Outputs: 25 | // - context: tuner context object 26 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 27 | 28 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
29 | // Inputs: 30 | // - context: tuner context object 31 | // - collType: collective type , e.g., allreduce, allgather… 32 | // - nBytes: collective size in bytes 33 | // - numPipeOps: number of operations in the group 34 | // - numAlgo: number of algorithms in collCostTable 35 | // - numProto: number of protocols in collCostTable 36 | // 37 | // Outputs: 38 | // - nChannels: number of channels (hence SMs) to be used. 39 | // 40 | // InOut: 41 | // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. 42 | // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). 43 | // 44 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 45 | // default tuning for the given collective. 46 | // Also, the plugin is allowed to not set any output, or set only the 47 | // algorithm and protocol, but not only the algorithm or only the protocol. 48 | // Unset fields will be set automatically by NCCL. 49 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 50 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 51 | int* nChannels); 52 | 53 | // Terminates the plugin and cleans up any resources that the plugin allocated. 54 | // context: tuner context object 55 | ncclResult_t (*destroy)(void* context); 56 | } ncclTuner_v3_t; 57 | 58 | typedef ncclTuner_v3_t ncclTuner_t; 59 | 60 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" 61 | 62 | // API to be implemented by external tuner 63 | typedef struct { 64 | // Name of the tuner 65 | const char* name; 66 | 67 | // Initializes tuner states. 68 | // Inputs: 69 | // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. 70 | // - nNodes: number of nodes in current communicator. 71 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. 
72 | // Outputs: 73 | // - context: tuner context object 74 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 75 | 76 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 77 | // Inputs: 78 | // - context: tuner context object 79 | // - collType: collective type, e.g., allreduce, allgather… 80 | // - nBytes: collective size in bytes 81 | // - collNetSupport: whether collnet supports this type 82 | // - nvlsSupport: whether nvlink sharp supports this type 83 | // - numPipeOps: number of operations in the group 84 | // 85 | // Outputs: 86 | // - algorithm: selected algorithm to be used for the given collective 87 | // - protocol: selected protocol to be used for the given collective 88 | // - nChannels: number of channels (hence SMs) to be used. 89 | // 90 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 91 | // default tuning for the given collective. 92 | // Also, the plugin is allowed to not set any output, or set only the 93 | // algorithm and protocol, but not only the algorithm or only the protocol. 94 | // Unset fields will be set automatically by NCCL. 95 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 96 | int collNetSupport, int nvlsSupport, int numPipeOps, 97 | int* algorithm, int* protocol, int* nChannels); 98 | 99 | // Terminates the plugin and cleans up any resources that the plugin allocated. 100 | // context: tuner context object 101 | ncclResult_t (*destroy)(void* context); 102 | } ncclTuner_v2_t; 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "comm.h" 13 | #include "checks.h" 14 | 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); 18 | ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); 19 | ncclResult_t ncclNetInit(struct ncclComm* comm); 20 | ncclResult_t ncclNetFinalize(struct ncclComm* comm); 21 | int ncclNetVersion(struct ncclComm* comm); 22 | 23 | // Test whether the current GPU support GPU Direct RDMA. 24 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); 25 | 26 | extern ncclNet_t ncclNetIb; 27 | extern ncclNet_t ncclNetSocket; 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NET_DEVICE_H_ 8 | #define NCCL_NET_DEVICE_H_ 9 | 10 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 11 | #define NCCL_NET_MTU_SIZE 4096 12 | 13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 16 | 17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 18 | 19 | typedef struct { 20 | ncclNetDeviceType netDeviceType; // Network offload type 21 | int netDeviceVersion; // Version number for network offload 22 | void* handle; 23 | size_t size; 24 | int needsProxyProgress; 25 | } ncclNetDeviceHandle_v7_t; 26 | 27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/include/npkit/npkit.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_H_ 2 | #define NPKIT_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "npkit/npkit_event.h" 10 | #include "npkit/npkit_struct.h" 11 | 12 | class NpKit { 13 | public: 14 | static const uint64_t kNumGpuEventBuffers = 512; 15 | 16 | static const uint64_t kNumCpuEventBuffers = 32; 17 | 18 | static ncclResult_t Init(int rank); 19 | 20 | static ncclResult_t Dump(const std::string& dump_dir); 21 | 22 | static ncclResult_t Shutdown(); 23 | 24 | static NpKitEventCollectContext* GetGpuEventCollectContexts(); 25 | 26 | static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, 27 | NpKitEventCollectContext* ctx) { 28 | uint64_t event_buffer_head = ctx->event_buffer_head; 29 | if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { 30 | NpKitEvent& event = ctx->event_buffer[event_buffer_head]; 31 | event.fields.type = type; 32 | event.fields.size = size; 33 | event.fields.rsvd = rsvd; 34 | event.fields.timestamp = timestamp; 35 | ctx->event_buffer_head++; 36 | } 37 | } 38 | 39 | static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); 40 | 41 | static uint64_t* GetCpuTimestamp(); 42 | 43 | private: 44 | static void 
CpuTimestampUpdateThread(); 45 | 46 | // 64K * 512 * 16B = 512MB per GPU 47 | static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16; 48 | 49 | // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU 50 | static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21; 51 | 52 | static NpKitEvent** gpu_event_buffers_; 53 | static NpKitEvent** cpu_event_buffers_; 54 | 55 | static NpKitEventCollectContext* gpu_collect_contexts_; 56 | static NpKitEventCollectContext* cpu_collect_contexts_; 57 | static uint64_t* cpu_timestamp_; 58 | 59 | static uint64_t rank_; 60 | 61 | static std::thread* cpu_timestamp_update_thread_; 62 | static volatile bool cpu_timestamp_update_thread_should_stop_; 63 | }; 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_struct.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_STRUCT_H_ 2 | #define NPKIT_STRUCT_H_ 3 | 4 | #include 5 | 6 | #pragma pack(push, 1) 7 | 8 | union NpKitEvent { 9 | uint64_t bits[2]; 10 | struct { 11 | uint64_t type : 8; 12 | uint64_t size : 32; 13 | uint64_t rsvd : 24; 14 | uint64_t timestamp; 15 | } fields; 16 | }; 17 | 18 | struct NpKitEventCollectContext { 19 | NpKitEvent* event_buffer; 20 | uint64_t event_buffer_head; 21 | }; 22 | 23 | #pragma pack(pop) 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/include/nvtx.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NVTX_H_ 8 | #define NCCL_NVTX_H_ 9 | 10 | #include "nvtx3/nvtx3.hpp" 11 | 12 | #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) 13 | #define NVTX3_CONSTEXPR_IF_CPP14 constexpr 14 | #else 15 | #define NVTX3_CONSTEXPR_IF_CPP14 16 | #endif 17 | 18 | // Define all NCCL-provided static schema IDs here (avoid duplicates). 19 | #define NVTX_SID_CommInitRank 0 20 | #define NVTX_SID_CommInitAll 1 21 | #define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank 22 | #define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank 23 | #define NVTX_SID_AllGather 4 24 | #define NVTX_SID_AllReduce 5 25 | #define NVTX_SID_Broadcast 6 26 | #define NVTX_SID_ReduceScatter 7 27 | #define NVTX_SID_Reduce 8 28 | #define NVTX_SID_Send 9 29 | #define NVTX_SID_Recv 10 30 | #define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank 31 | #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank 32 | #define NVTX_SID_CommSplit 13 33 | 34 | // Define static schema ID for the reduction operation. 
35 | #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP (14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START) 36 | 37 | extern const nvtxDomainHandle_t ncclNvtxDomainHandle; 38 | 39 | struct nccl_domain{static constexpr char const* name{"NCCL"};}; 40 | 41 | /* Holds one static payload-schema description; the constructor fills it in and registers it with the nccl_domain NVTX domain. */ class payload_schema { 42 | public: 43 | explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept 44 | { 45 | schema_attr.name = schemaName; 46 | schema_attr.entries = entries; 47 | schema_attr.numEntries = numEntries; 48 | schema_attr.schemaId = schemaId; 49 | nvtxPayloadSchemaRegister(nvtx3::domain::get<nccl_domain>(), &schema_attr); 50 | } 51 | 52 | payload_schema() = delete; 53 | ~payload_schema() = default; 54 | payload_schema(payload_schema const&) = default; 55 | payload_schema& operator=(payload_schema const&) = default; 56 | payload_schema(payload_schema&&) = default; 57 | payload_schema& operator=(payload_schema&&) = default; 58 | 59 | private: 60 | nvtxPayloadSchemaAttr_t schema_attr{ 61 | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | 62 | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | 63 | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | 64 | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | 65 | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, 66 | nullptr, 67 | NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, 68 | NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 69 | nullptr, 0, 0, 0, 0, nullptr}; 70 | }; 71 | 72 | // Create NVTX push/pop range with parameters 73 | // @param ID name of the operation (see `NVTX_SID_*`); selects the schema ID 74 | // and is also stringified (#ID) as the schema name 75 | // @param S schema (array of entries; its length is taken with std::extent) 76 | // @param P payload (struct) 77 | #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ 78 | static const payload_schema schema{S, std::extent<decltype(S)>::value, \ 79 | NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ 80 | static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \ 81 | nvtxPayloadData_t nvtx3_bpl__[] = { \ 82 | {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ 83 | 
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ 84 | ::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__}; 85 | 86 | extern void initNvtxRegisteredEnums(); 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCuda.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | 13 | #ifndef NVTOOLSEXT_CUDA_V3 14 | #define NVTOOLSEXT_CUDA_V3 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif /* __cplusplus */ 19 | 20 | /* ========================================================================= */ 21 | /** \name Functions for CUDA Resource Naming 22 | */ 23 | /** \addtogroup RESOURCE_NAMING 24 | * \section RESOURCE_NAMING_CUDA CUDA Resource Naming 25 | * 26 | * This section covers the API functions that allow to annotate CUDA resources 27 | * with user-provided names. 
28 | * 29 | * @{ 30 | */ 31 | 32 | /* ------------------------------------------------------------------------- */ 33 | /* \cond SHOW_HIDDEN 34 | * \brief Resource class id used to build resource type values that do not collide with those of other classes 35 | * \version \NVTX_VERSION_2 36 | */ 37 | #define NVTX_RESOURCE_CLASS_CUDA 4 38 | /** \endcond */ 39 | 40 | /* ------------------------------------------------------------------------- */ 41 | /** \brief Resource types for CUDA (the trailing comment on each enumerator names the handle type it refers to) 42 | */ 43 | typedef enum nvtxResourceCUDAType_t 44 | { 45 | NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */ 46 | NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */ 47 | NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */ 48 | NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */ 49 | } nvtxResourceCUDAType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The handle of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA context. 68 | * 69 | * Allows the user to associate a CUDA context with a user-provided name. 70 | * 71 | * \param context - The handle of the CUDA context to name. 72 | * \param name - The name of the CUDA context. 
73 | * 74 | * \par Example: 75 | * \code 76 | * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice ); 77 | * if ( CUDA_SUCCESS != status ) 78 | * goto Error; 79 | * nvtxNameCuContext(cuContext, "CTX_NAME"); 80 | * \endcode 81 | * 82 | * \version \NVTX_VERSION_1 83 | * @{ */ 84 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name); 85 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name); 86 | /** @} */ 87 | 88 | /* ------------------------------------------------------------------------- */ 89 | /** \brief Annotates a CUDA stream. 90 | * 91 | * Allows the user to associate a CUDA stream with a user-provided name. 92 | * 93 | * \param stream - The handle of the CUDA stream to name. 94 | * \param name - The name of the CUDA stream. 95 | * 96 | * \version \NVTX_VERSION_1 97 | * @{ */ 98 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name); 99 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name); 100 | /** @} */ 101 | 102 | /* ------------------------------------------------------------------------- */ 103 | /** \brief Annotates a CUDA event. 104 | * 105 | * Allows the user to associate a CUDA event with a user-provided name. 106 | * 107 | * \param event - The handle of the CUDA event to name. 108 | * \param name - The name of the CUDA event. 
109 | * 110 | * \version \NVTX_VERSION_1 111 | * @{ */ 112 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name); 113 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name); 114 | /** @} */ 115 | 116 | /** @} */ /* END RESOURCE_NAMING */ 117 | /* The unsuffixed nvtxNameCu* names select the wchar_t (W) functions when UNICODE is defined and the char (A) functions otherwise. */ 118 | /* ========================================================================= */ 119 | #ifdef UNICODE 120 | #define nvtxNameCuDevice nvtxNameCuDeviceW 121 | #define nvtxNameCuContext nvtxNameCuContextW 122 | #define nvtxNameCuStream nvtxNameCuStreamW 123 | #define nvtxNameCuEvent nvtxNameCuEventW 124 | #else 125 | #define nvtxNameCuDevice nvtxNameCuDeviceA 126 | #define nvtxNameCuContext nvtxNameCuContextA 127 | #define nvtxNameCuStream nvtxNameCuStreamA 128 | #define nvtxNameCuEvent nvtxNameCuEventA 129 | #endif 130 | 131 | #ifdef __cplusplus 132 | } 133 | #endif /* __cplusplus */ 134 | 135 | #ifndef NVTX_NO_IMPL 136 | #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */ 137 | #include "nvtxDetail/nvtxImplCuda_v3.h" 138 | #undef NVTX_IMPL_GUARD_CUDA 139 | #endif /*NVTX_NO_IMPL*/ 140 | 141 | #endif /* NVTOOLSEXT_CUDA_V3 */ 142 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCudaRt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | #include "driver_types.h" 13 | 14 | #ifndef NVTOOLSEXT_CUDART_V3 15 | #define NVTOOLSEXT_CUDART_V3 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | /* ========================================================================= */ 22 | /** \name Functions for CUDA Resource Naming 23 | */ 24 | /** \addtogroup RESOURCE_NAMING 25 | * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming 26 | * 27 | * This section covers the API functions that allow annotating CUDA resources 28 | * with user-provided names. 29 | * 30 | * @{ 31 | */ 32 | 33 | /* ------------------------------------------------------------------------- */ 34 | /* \cond SHOW_HIDDEN 35 | * \brief Resource class id used to build resource type values that do not collide with those of other classes 36 | * \version \NVTX_VERSION_2 37 | */ 38 | #define NVTX_RESOURCE_CLASS_CUDART 5 39 | /** \endcond */ 40 | 41 | /* ------------------------------------------------------------------------- */ 42 | /** \brief Resource types for CUDART (the trailing comment on each enumerator names the type it refers to) 43 | */ 44 | typedef enum nvtxResourceCUDARTType_t 45 | { 46 | NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ 47 | NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ 48 | NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ 49 | } nvtxResourceCUDARTType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The id of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 
59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA stream. 68 | * 69 | * Allows the user to associate a CUDA stream with a user-provided name. 70 | * 71 | * \param stream - The handle of the CUDA stream to name. 72 | * \param name - The name of the CUDA stream. 73 | * 74 | * \version \NVTX_VERSION_1 75 | * @{ */ 76 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); 77 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); 78 | /** @} */ 79 | 80 | /* ------------------------------------------------------------------------- */ 81 | /** \brief Annotates a CUDA event. 82 | * 83 | * Allows the user to associate a CUDA event with a user-provided name. 84 | * 85 | * \param event - The handle of the CUDA event to name. 86 | * \param name - The name of the CUDA event. 
87 | * 88 | * \version \NVTX_VERSION_1 89 | * @{ */ 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); 91 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); 92 | /** @} */ 93 | 94 | /** @} */ /* END RESOURCE_NAMING */ 95 | /* The unsuffixed nvtxNameCuda* names select the wchar_t (W) functions when UNICODE is defined and the char (A) functions otherwise. */ 96 | /* ========================================================================= */ 97 | #ifdef UNICODE 98 | #define nvtxNameCudaDevice nvtxNameCudaDeviceW 99 | #define nvtxNameCudaStream nvtxNameCudaStreamW 100 | #define nvtxNameCudaEvent nvtxNameCudaEventW 101 | #else 102 | #define nvtxNameCudaDevice nvtxNameCudaDeviceA 103 | #define nvtxNameCudaStream nvtxNameCudaStreamA 104 | #define nvtxNameCudaEvent nvtxNameCudaEventA 105 | #endif 106 | 107 | #ifdef __cplusplus 108 | } 109 | #endif /* __cplusplus */ 110 | 111 | #ifndef NVTX_NO_IMPL 112 | #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */ 113 | #include "nvtxDetail/nvtxImplCudaRt_v3.h" 114 | #undef NVTX_IMPL_GUARD_CUDART 115 | #endif /*NVTX_NO_IMPL*/ 116 | 117 | #endif /* NVTOOLSEXT_CUDART_V3 */ 118 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtSemanticsCounters.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /** 10 | * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. 11 | */ 12 | 13 | #ifndef NVTX_SEMANTIC_ID_COUNTERS_V1 14 | #define NVTX_SEMANTIC_ID_COUNTERS_V1 2 15 | 16 | /** 17 | * Flags to extend the semantics of counters. 
18 | */ 19 | #define NVTX_COUNTERS_FLAGS_NONE 0 20 | 21 | /** 22 | * Convert the fixed point value to a normalized floating point value. 
23 | * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type 24 | * this flag is applied to. 25 | */ 26 | #define NVTX_COUNTERS_FLAG_NORMALIZE (1 << 1) 27 | 28 | /** 29 | * Visual tools should apply scale and limits when graphing. 30 | */ 31 | #define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) 32 | #define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) 33 | #define NVTX_COUNTERS_FLAG_LIMITS \ 34 | (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) 35 | 36 | /** 37 | * Counter time scopes. 38 | */ 39 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT (1 << 5) 40 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST (2 << 5) 41 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT (3 << 5) 42 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START (4 << 5) 43 | 44 | /** 45 | * Counter value types. 46 | */ 47 | #define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10) 48 | /** Delta to previous value of same counter type. */ 49 | #define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA (2 << 10) 50 | 51 | /** 52 | * Datatypes for the `limits` union (values for the `limitType` field). 53 | */ 54 | #define NVTX_COUNTERS_LIMIT_I64 0 55 | #define NVTX_COUNTERS_LIMIT_U64 1 56 | #define NVTX_COUNTERS_LIMIT_F64 2 57 | 58 | /** 59 | *\brief Specify counter semantics. 60 | */ 61 | typedef struct nvtxSemanticsCounter_v1 { 62 | /** Header of the semantic extensions (with identifier, version, etc.). */ 63 | struct nvtxSemanticsHeader_v1 header; 64 | 65 | /** Flags to provide more context about the counter value (see the `NVTX_COUNTERS_FLAG_*` defines). */ 66 | uint64_t flags; 67 | 68 | /** Unit of the counter value (case-insensitive). */ 69 | const char* unit; 70 | 71 | /** Should be 1 if not used. */ 72 | uint64_t unitScaleNumerator; 73 | 74 | /** Should be 1 if not used. */ 75 | uint64_t unitScaleDenominator; 76 | 77 | /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */ 78 | int64_t limitType; 79 | 80 | /** Graph limits {minimum, maximum}; the member to read is selected by `limitType`. 
*/ 81 | union limits_t { 82 | int64_t i64[2]; 83 | uint64_t u64[2]; 84 | double d[2]; 85 | } limits; 86 | } nvtxSemanticsCounter_t; 87 | 88 | #endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtSemanticsScope.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /** 10 | * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. 11 | */ 12 | 13 | #ifndef NVTX_SEMANTIC_ID_SCOPE_V1 14 | #define NVTX_SEMANTIC_ID_SCOPE_V1 1 15 | 16 | /** 17 | * \brief Specify the NVTX scope for a payload entry. 18 | * 19 | * This allows the scope to be set for a specific value or counter in a payload. 20 | * The scope must be known at schema registration time. 21 | */ 22 | typedef struct nvtxSemanticsScope_v1 23 | { 24 | /** Header of the semantic extension. */ struct nvtxSemanticsHeader_v1 header; 25 | 26 | /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */ 27 | uint64_t scopeId; 28 | } nvtxSemanticsScope_t; 29 | 30 | #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_HELPER_MACROS_H 10 | #define NVTX_EXT_HELPER_MACROS_H 11 | 12 | /* Combine tokens */ 13 | #define _NVTX_EXT_CONCAT(a, b) a##b 14 | #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b) 15 | 16 | /* Resolves to the number of arguments passed. */ 17 | #define NVTX_EXT_NUM_ARGS(...) \ 18 | NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway) 19 | #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16 20 | 21 | /* Cast argument(s) to void to prevent unused variable warnings. */ 22 | #define _NVTX_EXT_VOIDIFY1(a1) (void)a1; 23 | #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2; 24 | #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3; 25 | #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4; 26 | 27 | /* Mark function arguments as unused. */ 28 | #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \ 29 | NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) 30 | 31 | #endif /* NVTX_EXT_HELPER_MACROS_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtImpl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifndef NVTX_EXT_IMPL_H 14 | #define NVTX_EXT_IMPL_H 15 | /* ---- Include required platform headers ---- */ 16 | /* NOTE(review): the header names after each #include below are missing in this copy (the angle-bracket operands appear to have been stripped); restore them from the upstream NVTX sources before building. */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #if defined(_WIN32) 23 | 24 | #include 25 | 26 | #else 27 | #include 28 | 29 | #if defined(__ANDROID__) 30 | #include 31 | #endif 32 | 33 | #if defined(__linux__) || defined(__CYGWIN__) 34 | #include 35 | #endif 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #endif 45 | 46 | /* ---- Define macros used in this file ---- */ 47 | /* NVTX_ERR and NVTX_INFO print diagnostics only when NVTX_DEBUG_PRINT is defined; otherwise they expand to nothing. */ 48 | #ifdef NVTX_DEBUG_PRINT 49 | #ifdef __ANDROID__ 50 | #include 51 | #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); 52 | #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); 53 | #else 54 | #include 55 | #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) 56 | #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) 57 | #endif 58 | #else /* !defined(NVTX_DEBUG_PRINT) */ 59 | #define NVTX_ERR(...) 60 | #define NVTX_INFO(...) 
61 | #endif 62 | 63 | #ifdef __cplusplus 64 | extern "C" { 65 | #endif /* __cplusplus */ 66 | /* 67 | #ifdef __GNUC__ 68 | #pragma GCC visibility push(hidden) 69 | #endif 70 | */ 71 | #define NVTX_EXTENSION_FRESH 0 72 | #define NVTX_EXTENSION_DISABLED 1 73 | #define NVTX_EXTENSION_STARTING 2 74 | #define NVTX_EXTENSION_LOADED 3 75 | /* The four values above are extension state values. */ 76 | /* Function slots are local to each extension */ 77 | typedef struct nvtxExtGlobals1_t 78 | { 79 | NvtxExtInitializeInjectionFunc_t injectionFnPtr; 80 | } nvtxExtGlobals1_t; 81 | /* Link-once global holding the injection initializer pointer; it starts out null. */ 82 | NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) = 83 | { 84 | (NvtxExtInitializeInjectionFunc_t)0 85 | }; 86 | /* NOTE(review): presumably nvtxExtInit.h rejects inclusion unless NVTX_EXT_INIT_GUARD is defined, like the other guarded headers -- confirm. */ 87 | #define NVTX_EXT_INIT_GUARD 88 | #include "nvtxExtInit.h" 89 | #undef NVTX_EXT_INIT_GUARD 90 | /* 91 | #ifdef __GNUC__ 92 | #pragma GCC visibility pop 93 | #endif 94 | */ 95 | #ifdef __cplusplus 96 | } /* extern "C" */ 97 | #endif /* __cplusplus */ 98 | 99 | #endif /* NVTX_EXT_IMPL_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | #ifdef NVTX_DISABLE 18 | 19 | #include "nvtxExtHelperMacros.h" 20 | /* NVTX disabled: each generated function marks its arguments as unused and returns (ret_val)-1. */ 21 | #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ 22 | ret_val fn_name signature { \ 23 | NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ 24 | return ((ret_val)(intptr_t)-1); \ 25 | } 26 | 27 | #else /* NVTX_DISABLE */ 28 | /* NVTX enabled: each generated function dispatches through its function slot. A FRESH slot triggers the init-once routine, after which the slot is re-read; if the slot is DISABLED, or still FRESH after initialization, control reaches NVTX_EXT_FN_RETURN_INVALID. */ 29 | #define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ 30 | typedef ret_type ( * fn_name##_impl_fntype )signature; \ 31 | NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ 32 | intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 33 | if (slot != NVTX_EXTENSION_DISABLED) { \ 34 | if (slot != NVTX_EXTENSION_FRESH) { \ 35 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 36 | } else { \ 37 | NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ 38 | /* Re-read function slot after extension initialization. */ \ 39 | slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 40 | if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ 41 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 42 | } \ 43 | } \ 44 | } \ 45 | NVTX_EXT_FN_RETURN_INVALID(ret_type) \ 46 | } 47 | 48 | #endif /*NVTX_DISABLE*/ 49 | 50 | /* Non-void functions: the fallback result is (rtype)-1. */ 51 | #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); 52 | 53 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain)) 54 | 55 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device)) 56 | 57 | #undef NVTX_EXT_FN_RETURN_INVALID 58 | /* END: Non-void functions. */ 59 | 60 | /* void functions. 
*/ 61 | #define NVTX_EXT_FN_RETURN_INVALID(rtype) 62 | #define return 63 | /* `return` is defined to nothing here so that the `return <call>;` statements inside NVTX_EXT_FN_IMPL become plain call statements for void functions; it is undefined again right after the expansion. */ 64 | NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags)) 65 | 66 | #undef return 67 | #undef NVTX_EXT_FN_RETURN_INVALID 68 | /* END: void functions. */ 69 | 70 | #undef NVTX_EXT_FN_IMPL 71 | 72 | #ifdef __cplusplus 73 | } /* extern "C" */ 74 | #endif /* __cplusplus */ 75 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtTypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /* This header defines types which are used by the internal implementation 10 | * of NVTX and callback subscribers. API clients do not use these types, 11 | * so they are defined here instead of in nvToolsExt.h to clarify they are 12 | * not part of the NVTX client API. */ 13 | 14 | #ifndef NVTXEXTTYPES_H 15 | #define NVTXEXTTYPES_H 16 | 17 | #ifndef NVTX_EXT_TYPES_GUARD 18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. 
19 | #endif 20 | /* Callback type: takes an export function id and returns an intptr_t. */ 21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); 22 | /* One segment of function slots. NOTE(review): presumably slotCount is the number of entries in functionSlots -- confirm. */ 23 | typedef struct nvtxExtModuleSegment_t 24 | { 25 | size_t segmentId; 26 | size_t slotCount; 27 | intptr_t* functionSlots; 28 | } nvtxExtModuleSegment_t; 29 | /* Module description passed to an injection initializer (see NvtxExtInitializeInjectionFunc_t below). */ 30 | typedef struct nvtxExtModuleInfo_t 31 | { 32 | uint16_t nvtxVer; 33 | uint16_t structSize; 34 | uint16_t moduleId; 35 | uint16_t compatId; 36 | size_t segmentsCount; 37 | nvtxExtModuleSegment_t* segments; 38 | NvtxExtGetExportFunction_t getExportFunction; 39 | const void* extInfo; 40 | } nvtxExtModuleInfo_t; 41 | /* Injection initializer type: receives the module description and returns an int. */ 42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); 43 | 44 | #endif /* NVTXEXTTYPES_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDART 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | /* Function-pointer types of the implementations that the wrappers below forward to. */ 17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); 18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); 19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); 20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); 21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); 22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); 23 | /* Each wrapper reads its implementation pointer from the NVTX globals and calls it only when it is non-null; with NVTX_DISABLE defined the body is empty. */ 24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) 25 | { 26 | #ifndef NVTX_DISABLE 27 | nvtxNameCudaDeviceA_impl_fntype impl = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; 28 | if (!impl) return; 29 | impl(device, name); 30 | #endif /*NVTX_DISABLE*/ 31 | } 32 | 33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) 34 | { 35 | #ifndef NVTX_DISABLE 36 | nvtxNameCudaDeviceW_impl_fntype impl = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; 37 | if (!impl) return; 38 | impl(device, name); 39 | #endif /*NVTX_DISABLE*/ 40 | } 41 | 42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) 43 | { 44 | #ifndef NVTX_DISABLE 45 | nvtxNameCudaStreamA_impl_fntype impl = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; 46 | if (!impl) return; 47 | impl(stream, name); 48 | #endif /*NVTX_DISABLE*/ 49 | } 50 | 51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) 52 | { 53 | #ifndef NVTX_DISABLE 54 | nvtxNameCudaStreamW_impl_fntype impl = 
(nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; 55 | if (!impl) return; 56 | impl(stream, name); 57 | #endif /*NVTX_DISABLE*/ 58 | } 59 | 60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) 61 | { 62 | #ifndef NVTX_DISABLE 63 | nvtxNameCudaEventA_impl_fntype impl = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; 64 | if (!impl) return; 65 | impl(event, name); 66 | #endif /*NVTX_DISABLE*/ 67 | } 68 | 69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) 70 | { 71 | #ifndef NVTX_DISABLE 72 | nvtxNameCudaEventW_impl_fntype impl = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; 73 | if (!impl) return; 74 | impl(event, name); 75 | #endif /*NVTX_DISABLE*/ 76 | } 77 | 78 | #ifdef __cplusplus 79 | } /* extern "C" */ 80 | #endif /* __cplusplus */ 81 | 82 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDA 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | /* Function-pointer types of the implementations that the wrappers below forward to. */ 18 | typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name); 19 | typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name); 20 | typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name); 21 | typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name); 22 | typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name); 23 | typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name); 24 | typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name); 25 | typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name); 26 | /* Each wrapper reads its implementation pointer from the NVTX globals and calls it only when it is non-null; with NVTX_DISABLE defined the body is empty. */ 27 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name) 28 | { 29 | #ifndef NVTX_DISABLE 30 | nvtxNameCuDeviceA_impl_fntype impl = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; 31 | if (!impl) return; 32 | impl(device, name); 33 | #endif /*NVTX_DISABLE*/ 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxNameCuDeviceW_impl_fntype impl = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; 40 | if (!impl) return; 41 | impl(device, name); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxNameCuContextA_impl_fntype impl = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; 49 | if (!impl) return; 50 | impl(context, name); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API 
nvtxNameCuContextW(CUcontext context, const wchar_t* name) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxNameCuContextW_impl_fntype impl = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; 58 | if (!impl) return; 59 | impl(context, name); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxNameCuStreamA_impl_fntype impl = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; 67 | if (!impl) return; 68 | impl(stream, name); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxNameCuStreamW_impl_fntype impl = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; 76 | if (!impl) return; 77 | impl(stream, name); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) 82 | { 83 | #ifndef NVTX_DISABLE 84 | nvtxNameCuEventA_impl_fntype impl = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; 85 | if (!impl) return; 86 | impl(event, name); 87 | #endif /*NVTX_DISABLE*/ 88 | } 89 | 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) 91 | { 92 | #ifndef NVTX_DISABLE 93 | nvtxNameCuEventW_impl_fntype impl = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; 94 | if (!impl) return; 95 | impl(event, name); 96 | #endif /*NVTX_DISABLE*/ 97 | } 98 | 99 | #ifdef __cplusplus 100 | } /* extern "C" */ 101 | #endif /* __cplusplus */ 102 | 103 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_SYNC 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | 18 | typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); 19 | typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); 20 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); 21 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); 22 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); 23 | typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); 24 | 25 | NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs) 26 | { 27 | #ifndef NVTX_DISABLE 28 | nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; 29 | if(local!=0) 30 | return (*local)(domain, attribs); 31 | else 32 | #endif /*NVTX_DISABLE*/ 33 | return (nvtxSyncUser_t)0; 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxDomainSyncUserDestroy_impl_fntype local = 
(nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; 40 | if(local!=0) 41 | (*local)(handle); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; 49 | if(local!=0) 50 | (*local)(handle); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; 58 | if(local!=0) 59 | (*local)(handle); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; 67 | if(local!=0) 68 | (*local)(handle); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; 76 | if(local!=0) 77 | (*local)(handle); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | #ifdef __cplusplus 82 | } /* extern "C" */ 83 | #endif /* __cplusplus */ 84 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef __NVTX_LINKONCE_H__ 10 | #define __NVTX_LINKONCE_H__ 11 | 12 | /* This header defines macros to permit making definitions of global variables 13 | * and functions in C/C++ header files which may be included multiple times in 14 | * a translation unit or linkage unit. It allows authoring header-only libraries 15 | * which can be used by multiple other header-only libraries (either as the same 16 | * copy or multiple copies), and does not require any build changes, such as 17 | * adding another .c file, linking a static library, or deploying a dynamic 18 | * library. Globals defined with these macros have the property that they have 19 | * the same address, pointing to a single instance, for the entire linkage unit. 20 | * It is expected but not guaranteed that each linkage unit will have a separate 21 | * instance. 22 | * 23 | * In some situations it is desirable to declare a variable without initializing 24 | * it, refer to it in code or other variables' initializers, and then initialize 25 | * it later. Similarly, functions can be prototyped, have their address taken, 26 | * and then have their body defined later. In such cases, use the FWDDECL macros 27 | * when forward-declaring LINKONCE global variables without initializers and 28 | * function prototypes, and then use the DEFINE macros when later defining them. 29 | * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, 30 | * following this pattern makes code maximally portable. 
31 | */ 32 | 33 | #if defined(__MINGW32__) /* MinGW */ 34 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 35 | #if defined(__cplusplus) 36 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 37 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK 38 | #else 39 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 40 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 41 | #endif 42 | #elif defined(_MSC_VER) /* MSVC */ 43 | #if defined(__cplusplus) 44 | #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) 45 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 46 | #else 47 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 48 | #define NVTX_LINKONCE_DEFINE_FUNCTION __inline 49 | #endif 50 | #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ 51 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 52 | #if defined(__cplusplus) 53 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 54 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK 55 | #else 56 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 57 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 58 | #endif 59 | #elif defined(__CYGWIN__) /* Assume GCC or compatible */ 60 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 61 | #if defined(__cplusplus) 62 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 63 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 64 | #else 65 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 66 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 67 | #endif 68 | #else /* All others: Assume GCC, clang, or compatible */ 69 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 70 | #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) 71 | #if defined(__cplusplus) 72 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 73 | #define 
NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline 74 | #else 75 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 76 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 77 | #endif 78 | #endif 79 | 80 | #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern 81 | #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION 82 | 83 | #endif /* __NVTX_LINKONCE_H__ */ 84 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include "core.h" 16 | 17 | #if CUDART_VERSION < 12030 18 | // MNNVL: FABRIC handle support lifted from CUDA 12.3 19 | #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128) 20 | #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL) 21 | #define CU_IPC_HANDLE_SIZE 64 22 | typedef struct CUmemFabricHandle_st { 23 | unsigned char data[CU_IPC_HANDLE_SIZE]; 24 | } CUmemFabricHandle_v1; 25 | typedef CUmemFabricHandle_v1 CUmemFabricHandle; 26 | #endif 27 | 28 | typedef union { 29 | uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support 30 | CUmemFabricHandle handle; 31 | } ncclCuDesc; 32 | 33 | typedef union { 34 | // Legacy CUDA IPC 35 | cudaIpcMemHandle_t devIpc; 36 | // cuMem API support 37 | struct { 38 | ncclCuDesc cuDesc; 39 | CUmemGenericAllocationHandle memHandle; 40 | }; 41 | } ncclIpcDesc; 42 | 43 | enum ncclIpcRegType { 44 | NCCL_IPC_SENDRECV = 0, 45 | 
NCCL_IPC_COLLECTIVE = 1 46 | }; 47 | 48 | struct ncclIpcImpInfo { 49 | void* rmtRegAddr; 50 | bool legacyIpcCap; 51 | uintptr_t offset; 52 | }; 53 | 54 | struct ncclIpcRegInfo { 55 | int peerRank; 56 | void* baseAddr; 57 | struct ncclProxyConnector* ipcProxyconn; 58 | struct ncclIpcImpInfo impInfo; 59 | }; 60 | 61 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr); 62 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); 63 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); 64 | ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut); 65 | ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts); 66 | 67 | ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo); 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PARAM_H_ 8 | #define NCCL_PARAM_H_ 9 | 10 | #include 11 | 12 | const char* userHomeDir(); 13 | void setEnvFile(const char* fileName); 14 | void initEnv(); 15 | const char *ncclGetEnv(const char *name); 16 | 17 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); 18 | 19 | #define NCCL_PARAM(name, env, deftVal) \ 20 | int64_t ncclParam##name() { \ 21 | constexpr int64_t uninitialized = INT64_MIN; \ 22 | static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ 23 | static int64_t cache = uninitialized; \ 24 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ 25 | ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ 26 | } \ 27 | return cache; \ 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_H_ 8 | #define PROFILER_H_ 9 | 10 | #include 11 | #include "nccl_profiler.h" 12 | 13 | struct ncclProxyArgs; 14 | struct ncclKernelPlan; 15 | struct ncclTaskColl; 16 | struct ncclTaskP2p; 17 | struct ncclInfo; 18 | struct ncclComm; 19 | struct ncclProxyOp; 20 | 21 | // Plugin Init/Finalize Wrappers 22 | ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); 23 | ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm); 24 | 25 | // Profiler Start/Stop Group Wrappers 26 | ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan); 27 | ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan); 28 | 29 | // Profiler Start/Stop Task Events Wrappers 30 | ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan); 31 | ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan); 32 | 33 | // Proxy Op Start/Stop Event Wrappers 34 | ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args); 35 | ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args); 36 | ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); 37 | 38 | // Proxy Step Start/Stop Event Wrappers 39 | ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); 40 | ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); 41 | ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); 42 | 43 | // Proxy Control Start/Stop Events Wrappers 44 | ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); 45 | ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); 46 | 47 | // Record Event Wrappers 48 | ncclResult_t 
ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); 49 | ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState); 50 | ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); 51 | 52 | // Profiler utility functions 53 | ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/include/register.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_REGISTER_H_ 2 | #define NCCL_REGISTER_H_ 3 | 4 | #include "device.h" 5 | 6 | #include 7 | #include 8 | 9 | enum { 10 | NET_REG_COMPLETE = 0x01, 11 | NVLS_REG_COMPLETE = 0x02, 12 | NVLS_REG_POSSIBLE = 0x04, 13 | NVLS_REG_NO_SUPPORT = 0x08, 14 | COLLNET_REG_COMPLETE = 0x10, 15 | IPC_REG_COMPLETE = 0x20 16 | }; 17 | 18 | struct ncclPeerRegIpcAddr { 19 | uintptr_t* devPeerRmtAddrs; 20 | uintptr_t* hostPeerRmtAddrs; 21 | }; 22 | 23 | struct ncclReg { 24 | // common attributes 25 | size_t pages; 26 | int refs; 27 | uintptr_t addr; 28 | uint32_t state; 29 | // net reg 30 | int nDevs; 31 | int devs[MAXCHANNELS]; 32 | void** handles; 33 | // nvls reg 34 | uintptr_t baseAddr; 35 | size_t baseSize; 36 | CUdeviceptr regAddr; 37 | size_t regSize; 38 | int dev; 39 | CUmemGenericAllocationHandle mcHandle; 40 | uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ 41 | // collnet reg 42 | void* collnetHandle; 43 | struct ncclProxyConnector* collnetProxyconn; 44 | // general ipc reg 45 | struct ncclPeerRegIpcAddr regIpcAddrs; 46 | struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; 47 | }; 48 | 49 | struct ncclRegCache { 50 | struct ncclReg **slots; 51 | int capacity, population; 52 | 
uintptr_t pageSize; 53 | void* sComms[MAXCHANNELS]; 54 | void* rComms[MAXCHANNELS]; 55 | }; 56 | 57 | ncclResult_t ncclRegCleanup(struct ncclComm* comm); 58 | ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_SHM_H_ 2 | #define NCCL_SHM_H_ 3 | 4 | #include "comm.h" 5 | 6 | struct shmLegacyIpc { 7 | char shmSuffix[7]; 8 | ncclShmHandle_t handle; 9 | size_t shmSize; 10 | }; 11 | 12 | struct shmCuIpc { 13 | union { 14 | CUmemFabricHandle handle; 15 | CUmemGenericAllocationHandle data; 16 | }; 17 | int tpProxyRank; 18 | void *ptr; 19 | size_t size; 20 | }; 21 | 22 | struct shmIpcDesc { 23 | union 24 | { 25 | struct shmLegacyIpc shmli; 26 | struct shmCuIpc shmci; 27 | }; 28 | bool legacy; 29 | }; 30 | 31 | typedef struct shmIpcDesc ncclShmIpcDesc_t; 32 | 33 | ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); 34 | ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); 35 | ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/include/shmutils.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHMUTILS_H_ 8 | #define NCCL_SHMUTILS_H_ 9 | 10 | #include "nccl.h" 11 | 12 | typedef void* ncclShmHandle_t; 13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); 14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle); 15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); 16 | 17 | struct ncclShmemCollBuff { 18 | volatile size_t *cnt[2]; 19 | volatile void *ptr[2]; 20 | int round; 21 | size_t maxTypeSize; 22 | }; 23 | 24 | ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/include/socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_H_ 8 | #define NCCL_SOCKET_H_ 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define MAX_IFS 16 19 | #define MAX_IF_NAME_SIZE 16 20 | #define SLEEP_INT 1000 // connection retry sleep interval in usec 21 | #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) 22 | #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) 23 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) 24 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL 25 | 26 | /* Common socket address storage structure for IPv4/IPv6 */ 27 | union ncclSocketAddress { 28 | struct sockaddr sa; 29 | struct sockaddr_in sin; 30 | struct sockaddr_in6 sin6; 31 | }; 32 | 33 | enum ncclSocketState { 34 | ncclSocketStateNone = 0, 35 | ncclSocketStateInitialized = 1, 36 | ncclSocketStateAccepting = 2, 37 | ncclSocketStateAccepted = 3, 38 | ncclSocketStateConnecting = 4, 39 | ncclSocketStateConnectPolling = 5, 40 | ncclSocketStateConnected = 6, 41 | ncclSocketStateReady = 7, 42 | ncclSocketStateClosed = 8, 43 | ncclSocketStateError = 9, 44 | ncclSocketStateNum = 10 45 | }; 46 | 47 | enum ncclSocketType { 48 | ncclSocketTypeUnknown = 0, 49 | ncclSocketTypeBootstrap = 1, 50 | ncclSocketTypeProxy = 2, 51 | ncclSocketTypeNetSocket = 3, 52 | ncclSocketTypeNetIb = 4 53 | }; 54 | 55 | struct ncclSocket { 56 | int fd; 57 | int acceptFd; 58 | int timedOutRetries; 59 | int refusedRetries; 60 | union ncclSocketAddress addr; 61 | volatile uint32_t* abortFlag; 62 | int asyncFlag; 63 | enum ncclSocketState state; 64 | int salen; 65 | uint64_t magic; 66 | enum ncclSocketType type; 67 | }; 68 | 69 | const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); 70 | ncclResult_t 
ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); 71 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); 72 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); 73 | 74 | // Initialize a socket 75 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); 76 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call 77 | ncclResult_t ncclSocketListen(struct ncclSocket* sock); 78 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); 79 | // Connect to sock->addr. sock->fd is set after a successful call. 80 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock); 81 | // Return socket connection state. 82 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); 83 | // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
84 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); 85 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); 86 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); 87 | 88 | #define NCCL_SOCKET_SEND 0 89 | #define NCCL_SOCKET_RECV 1 90 | 91 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 92 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 93 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); 94 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); 95 | ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize); 96 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); 97 | ncclResult_t ncclSocketClose(struct ncclSocket* sock); 98 | #endif 99 | -------------------------------------------------------------------------------- /src/include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TIMER_H_ 8 | #define NCCL_TIMER_H_ 9 | #if ENABLE_TIMER 10 | #include 11 | #include 12 | #include 13 | static double freq = -1; 14 | static void calibrate() { 15 | struct timeval tv; 16 | gettimeofday(&tv, NULL); 17 | uint64_t timeCycles = __rdtsc(); 18 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 19 | uint64_t total = 0ULL; 20 | for (int i=0; i<10000; i++) total += __rdtsc(); 21 | gettimeofday(&tv, NULL); 22 | timeCycles = __rdtsc() - timeCycles; 23 | time += tv.tv_sec*1E6 + tv.tv_usec; 24 | freq = timeCycles/time; 25 | } 26 | static inline double gettime() { 27 | if (freq == -1) calibrate(); 28 | return __rdtsc()/freq; 29 | } 30 | static uint64_t counts[8]; 31 | static double times[8]; 32 | static double startTimes[8]; 33 | #define TIME_START(index) do { \ 34 | counts[index]++; \ 35 | startTimes[index] = gettime(); \ 36 | } while (0) 37 | 38 | #define TIME_STOP(index) do { \ 39 | times[index] += gettime() - startTimes[index]; \ 40 | } while (0) 41 | 42 | #define TIME_CANCEL(index) do { \ 43 | counts[index]--; \ 44 | } while (0) 45 | 46 | #define TIME_PRINT(name) do { \ 47 | printf("%s stats", name); \ 48 | for (int i=0; i<8; i++) { \ 49 | if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ 50 | counts[i] = 0; \ 51 | } \ 52 | printf("\n"); \ 53 | } while (0) 54 | #else 55 | #define TIME_START(index) do {} while(0) 56 | #define TIME_STOP(index) do {} while(0) 57 | #define TIME_CANCEL(index) do {} while(0) 58 | #define TIME_PRINT(name) 59 | #endif 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/trees.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TREES_H_ 8 | #define NCCL_TREES_H_ 9 | 10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); 11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/include/tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INT_TUNER_H_ 9 | #define NCCL_INT_TUNER_H_ 10 | 11 | #include "nccl_tuner.h" 12 | #include "comm.h" 13 | 14 | // Tuning plugin to override NCCL's default algorithm/protocol tuning. 15 | 16 | // Attempts to load NCCL tuner from an environment variable. 17 | // Returns ncclSuccess if the correct tuner symbol has been found and 18 | // successfully loaded. Otherwise returns an error and also logs the error. 19 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm); 20 | 21 | // Cleans up NCCL tuner plugin.
22 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm); 23 | #endif 24 | -------------------------------------------------------------------------------- /src/init_nvtx.cc: -------------------------------------------------------------------------------- 1 | #include "nccl.h" 2 | #include "nvtx.h" 3 | 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { 5 | {"Sum", ncclSum, 0}, 6 | {"Product", ncclProd, 0}, 7 | {"Max", ncclMax, 0}, 8 | {"Min", ncclMin, 0}, 9 | {"Avg", ncclAvg, 0} 10 | }; 11 | 12 | // Must be called before the first call to any reduction operation. 13 | void initNvtxRegisteredEnums() { 14 | // Register schemas and strings 15 | constexpr const nvtxPayloadEnumAttr_t eAttr { 16 | .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | 17 | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, 18 | .name = NULL, 19 | .entries = NvtxEnumRedSchema, 20 | .numEntries = std::extent::value, 21 | .sizeOfEnum = sizeof(ncclRedOp_t), 22 | .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP, 23 | .extension = nullptr 24 | }; 25 | 26 | nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); 27 | } 28 | -------------------------------------------------------------------------------- /src/misc/argcheck.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "argcheck.h" 8 | #include "comm.h" 9 | 10 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { 11 | cudaPointerAttributes attr; 12 | cudaError_t err = cudaPointerGetAttributes(&attr, pointer); 13 | if (err != cudaSuccess || attr.devicePointer == NULL) { 14 | WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); 15 | return ncclInvalidArgument; 16 | } 17 | #if CUDART_VERSION >= 10000 18 | if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 19 | #else 20 | if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 21 | #endif 22 | WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); 23 | return ncclInvalidArgument; 24 | } 25 | return ncclSuccess; 26 | } 27 | 28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { 29 | if (ptr == NULL) { 30 | WARN("%s : %s argument is NULL", opname, ptrname); 31 | return ncclInvalidArgument; 32 | } 33 | return ncclSuccess; 34 | } 35 | 36 | ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) { 37 | NCCLCHECK(PtrCheck(comm, opname, ptrname)); 38 | if (comm->startMagic != NCCL_MAGIC || comm->endMagic != NCCL_MAGIC) { 39 | WARN("Error: corrupted comm object detected"); 40 | return ncclInvalidArgument; 41 | } 42 | return ncclSuccess; 43 | } 44 | 45 | ncclResult_t ArgsCheck(struct ncclInfo* info) { 46 | // First, the easy ones 47 | if (info->root < 0 || info->root >= info->comm->nRanks) { 48 | WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); 49 | return ncclInvalidArgument; 50 | } 51 | if (info->datatype < 0 || info->datatype >= ncclNumTypes) { 52 | WARN("%s : invalid type %d", info->opName, 
info->datatype); 53 | return ncclInvalidArgument; 54 | } 55 | 56 | // ncclMaxRedOp < info->op will always be false due to the sizes of 57 | // the datatypes involved, and that's by design. We keep the check though 58 | // just as a reminder. 59 | // coverity[result_independent_of_operands] 60 | if (info->op < 0 || ncclMaxRedOp < info->op) { 61 | WARN("%s : invalid reduction operation %d", info->opName, info->op); 62 | return ncclInvalidArgument; 63 | } 64 | int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); 65 | if (ncclNumOps <= info->op && 66 | (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { 67 | WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); 68 | return ncclInvalidArgument; 69 | } 70 | 71 | if (info->comm->checkPointers) { 72 | if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { 73 | if (info->count >0) 74 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); 75 | } else { 76 | // Check CUDA device pointers 77 | if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { 78 | NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); 79 | } 80 | if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { 81 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); 82 | } 83 | } 84 | } 85 | return ncclSuccess; 86 | } 87 | -------------------------------------------------------------------------------- /src/misc/msccl/msccl_status.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 
4 | ************************************************************************/ 5 | 6 | #include "msccl/msccl_status.h" 7 | #include "msccl/msccl_struct.h" 8 | 9 | mscclStatus& mscclGetStatus() { 10 | static mscclStatus status; 11 | return status; 12 | } 13 | 14 | mscclThreadLocalStatus& mscclGetThreadLocalStatus() { 15 | static thread_local mscclThreadLocalStatus threadLocalStatus; 16 | return threadLocalStatus; 17 | } 18 | 19 | mscclSavedProxyArgs& mscclGetSavedProxyArgs() { 20 | static mscclSavedProxyArgs savedProxyArgs; 21 | return savedProxyArgs; 22 | } -------------------------------------------------------------------------------- /src/misc/param.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "param.h" 8 | #include "debug.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const char* userHomeDir() { 21 | struct passwd *pwUser = getpwuid(getuid()); 22 | return pwUser == NULL ? 
/*
 * Load environment variable defaults from a configuration file.
 *
 * Every line is expected to look like NAME=VALUE: the text before the first
 * '=' is the variable name, and everything after it up to the end of the line
 * is the value (so the value may itself contain '='). Lines that contain no
 * '=' are ignored.
 *
 * Variables are installed with setenv(..., overwrite=0), so a variable that is
 * already present in the environment keeps its current value.
 *
 * Names and values are passed to setenv() straight from the line buffer, so
 * they are not limited to a fixed length and are never truncated.
 *
 * A NULL path or a file that cannot be opened is silently ignored.
 *
 * fileName: path of the configuration file to read.
 */
void setEnvFile(const char* fileName) {
  if (fileName == NULL) return;
  FILE* file = fopen(fileName, "r");
  if (file == NULL) return;

  char* line = NULL;     // Allocated and grown by getline(); freed below.
  size_t capacity = 0;
  ssize_t len;
  while ((len = getline(&line, &capacity, file)) != -1) {
    // Drop the trailing newline; the last line of a file may not have one.
    if (len > 0 && line[len-1] == '\n') line[len-1] = '\0';

    // Split the line in place at the first '=': name on the left, value on the right.
    char* sep = strchr(line, '=');
    if (sep == NULL) continue;
    *sep = '\0';

    // overwrite=0: values already set in the environment win over the file.
    setenv(line, sep+1, 0);
  }
  free(line);  // free(NULL) is a no-op, no guard needed.
  fclose(file);
}
value %s for %s, using default %lld.", str, env, (long long)deftVal); 85 | } else { 86 | INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); 87 | } 88 | } 89 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 90 | } 91 | pthread_mutex_unlock(&mutex); 92 | } 93 | 94 | const char* ncclGetEnv(const char* name) { 95 | initEnv(); 96 | return getenv(name); 97 | } 98 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /src/transport/generic.cc: -------------------------------------------------------------------------------- 1 | #include "comm.h" 2 | #include "transport.h" 3 | 4 | ncclResult_t ncclTransportRingConnect(struct ncclComm* comm, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) { 5 | ncclResult_t ret = ncclSuccess; 6 | if (comm && comm->nRanks > 1) { 7 | for (int c = 0; c < comm->nChannels; c++) { 8 | struct ncclChannel* channel = comm->channels + c; 9 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); 10 | } 11 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0, highestTransportType, needsProxy), ret, fail); 12 | INFO(NCCL_INIT, "Connected all rings"); 13 | } 14 | exit: 15 | return ret; 16 | fail: 17 | goto exit; 18 | } 19 | 20 | ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) { 21 | ncclResult_t ret = ncclSuccess; 22 | if (comm && 
comm->nRanks > 1) { 23 | // Connect Trees 24 | for (int c = 0; c < comm->nChannels; c++) { 25 | struct ncclChannel* channel = comm->channels + c; 26 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); 27 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); 28 | } 29 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0, highestTransportType, needsProxy), ret, fail); 30 | INFO(NCCL_INIT, "Connected all trees"); 31 | } 32 | exit: 33 | return ret; 34 | fail: 35 | goto exit; 36 | } 37 | 38 | ncclResult_t ncclTransportPatConnect(struct ncclComm* comm, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) { 39 | ncclResult_t ret = ncclSuccess; 40 | if (comm && comm->nRanks > 1) { 41 | for (int mask=1; masknRanks; mask<<=1) { 42 | int prevPeer = (comm->rank + mask) % comm->nRanks; 43 | int nextPeer = (comm->rank + comm->nRanks - mask) % comm->nRanks; 44 | for (int c = 0; c < comm->nChannels; c++) { 45 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &prevPeer, 1, &nextPeer, 0), ret, fail); // ReduceScatter 46 | } 47 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); 48 | for (int c = 0; c < comm->nChannels; c++) { 49 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &nextPeer, 1, &prevPeer, 0), ret, fail); // AllGather 50 | } 51 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0, highestTransportType, needsProxy), ret, fail); 52 | } 53 | INFO(NCCL_INIT, "Connected binomial trees"); 54 | } 55 | exit: 56 | return ret; 57 | fail: 58 | goto exit; 59 | } 60 | --------------------------------------------------------------------------------