├── .github └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── SECURITY.md ├── dockerfiles └── Dockerfile ├── ext-net ├── dummy │ ├── Makefile │ └── plugin.c └── google-fastsocket │ └── Makefile ├── makefiles ├── common.mk ├── formatting.mk └── version.mk ├── patches ├── nccl.cpp.patch ├── torch1.12.nccl.cpp.patch └── torch1.13.nccl.cpp.patch ├── pkg ├── Makefile ├── debian │ ├── .gitignore │ ├── Makefile │ ├── changelog.in │ ├── compat │ ├── control.in │ ├── copyright │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── libnccl2.install.in │ ├── rules │ └── source │ │ └── format ├── redhat │ ├── Makefile │ └── nccl.spec.in ├── srctxz │ ├── Makefile │ └── create_srctxz.sh.in └── txz │ ├── Makefile │ └── create_txz.sh.in ├── src ├── Makefile ├── bootstrap.cc ├── channel.cc ├── collectives │ ├── all_gather.cc │ ├── all_reduce.cc │ ├── all_to_all.cc │ ├── broadcast.cc │ ├── custom_collective.cc │ ├── device │ │ ├── Makefile │ │ ├── all_gather.cu │ │ ├── all_gather.h │ │ ├── all_reduce.cu │ │ ├── all_reduce.h │ │ ├── all_to_all.cu │ │ ├── all_to_all.h │ │ ├── broadcast.cu │ │ ├── broadcast.h │ │ ├── common.h │ │ ├── common_kernel.h │ │ ├── custom_collective.cu │ │ ├── custom_collective.h │ │ ├── functions.cu │ │ ├── gen_rules.sh │ │ ├── msccl_interpreter.h │ │ ├── onerank_reduce.cu │ │ ├── op128.h │ │ ├── primitives.h │ │ ├── prims_ll.h │ │ ├── prims_ll128.h │ │ ├── prims_simple.h │ │ ├── reduce.cu │ │ ├── reduce.h │ │ ├── reduce_kernel.h │ │ ├── reduce_scatter.cu │ │ ├── reduce_scatter.h │ │ ├── sendrecv.cu │ │ ├── sendrecv.h │ │ └── stride_copy.cu │ ├── reduce.cc │ ├── reduce_scatter.cc │ └── sendrecv.cc ├── debug.cc ├── enhcompat.cc ├── enqueue.cc ├── graph │ ├── connect.cc │ ├── paths.cc │ ├── rings.cc │ ├── rings.h │ ├── search.cc │ ├── topo.cc │ ├── topo.h │ ├── trees.cc │ ├── tuning.cc │ ├── xml.cc │ └── xml.h ├── group.cc ├── include │ ├── align.h │ ├── alloc.h │ ├── argcheck.h │ ├── bootstrap.h │ ├── channel.h │ 
├── checks.h │ ├── coll_net.h │ ├── collectives.h │ ├── comm.h │ ├── core.h │ ├── cpuset.h │ ├── debug.h │ ├── devcomm.h │ ├── enqueue.h │ ├── gdrwrap.h │ ├── graph.h │ ├── group.h │ ├── ibvwrap.h │ ├── info.h │ ├── msccl.h │ ├── nccl_net.h │ ├── net.h │ ├── npkit │ │ ├── npkit.h │ │ ├── npkit_event.h │ │ └── npkit_struct.h │ ├── nvmlwrap.h │ ├── nvtx.h │ ├── nvtx3.hpp │ ├── nvtx3 │ │ ├── nvToolsExt.h │ │ ├── nvToolsExtCuda.h │ │ ├── nvToolsExtCudaRt.h │ │ ├── nvToolsExtOpenCL.h │ │ ├── nvToolsExtSync.h │ │ └── nvtxDetail │ │ │ ├── nvtxImpl.h │ │ │ ├── nvtxImplCore.h │ │ │ ├── nvtxImplCudaRt_v3.h │ │ │ ├── nvtxImplCuda_v3.h │ │ │ ├── nvtxImplOpenCL_v3.h │ │ │ ├── nvtxImplSync_v3.h │ │ │ ├── nvtxInit.h │ │ │ ├── nvtxInitDecls.h │ │ │ ├── nvtxInitDefs.h │ │ │ ├── nvtxLinkOnce.h │ │ │ └── nvtxTypes.h │ ├── p2p.h │ ├── param.h │ ├── profiler.h │ ├── proxy.h │ ├── shm.h │ ├── socket.h │ ├── timer.h │ ├── transport.h │ ├── trees.h │ └── utils.h ├── init.cc ├── misc │ ├── argcheck.cc │ ├── gdrwrap.cc │ ├── ibvwrap.cc │ ├── npkit.cc │ ├── nvmlwrap.cc │ ├── param.cc │ ├── profiler.cc │ ├── shmutils.cc │ ├── socket.cc │ └── utils.cc ├── nccl.h.in ├── nccl.pc.in ├── net.cc ├── proxy.cc ├── transport.cc └── transport │ ├── coll_net.cc │ ├── net.cc │ ├── net_ib.cc │ ├── net_socket.cc │ ├── p2p.cc │ └── shm.cc └── tools └── npkit_trace_generator.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '30 12 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-18.04 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'cpp' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | # - name: Autobuild 57 | # uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | - run: | 67 | sudo wget -O /etc/apt/preferences.d/cuda-repository-pin-600 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin 68 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub 69 | sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" 70 | sudo apt install cuda -y 71 | export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} 72 | make src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" 73 | 74 | - name: Perform CodeQL Analysis 75 | uses: github/codeql-action/analyze@v1 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 2 | /build 3 | *.gcov 4 | /coverage/ 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Contains contributions from NVIDIA. 3 | 4 | Copyright (c) 2020-2022, Microsoft Corporation. All rights reserved. 5 | 6 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions 10 | are met: 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 
13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 17 | Laboratory, the U.S. Department of Energy, nor the names of their 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 29 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | The U.S. Department of Energy funded the development of this software 34 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 35 | 36 | 37 | This code also includes files from the NVIDIA Tools Extension SDK project. 38 | 39 | See: 40 | 41 | https://github.com/NVIDIA/NVTX 42 | 43 | for more information and license details. 44 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 
14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel 2 | 3 | ############################################################################## 4 | # Temporary Installation Directory 5 | ############################################################################## 6 | ENV STAGE_DIR=/tmp 7 | RUN mkdir -p ${STAGE_DIR} 8 | 9 | 10 | ############################################################################## 11 | # Installation/Basic Utilities 12 | ############################################################################## 13 | RUN apt-get update && \ 14 | apt-get install -y --allow-change-held-packages --no-install-recommends \ 15 | software-properties-common \ 16 | build-essential autotools-dev cmake g++ gcc \ 17 | openssh-client openssh-server \ 18 | nfs-common pdsh curl sudo net-tools \ 19 | vim iputils-ping wget perl unzip 20 | 21 | ############################################################################## 22 | # Installation Latest Git 23 | ############################################################################## 24 | RUN add-apt-repository ppa:git-core/ppa -y && \ 25 | apt-get update && \ 26 | apt-get install -y git && \ 27 | git --version 28 | 29 | ############################################################################## 30 | # Pip 31 | ############################################################################## 32 | # pip version <= 20.1.1 is needed for the ruamel.yaml installation conflict 33 | # between conda and pip. ruamel.yaml is needed by azureml. 34 | # https://github.com/Azure/MachineLearningNotebooks/issues/1110 for more info. 
35 | ENV PIP_VERSION=20.1.1 36 | RUN conda install -y pip=${PIP_VERSION} && \ 37 | # Print python an pip version 38 | python -V && pip -V 39 | 40 | ############################################################################## 41 | # MPI 42 | ############################################################################## 43 | RUN cd ${STAGE_DIR} && mkdir openmpi/ && cd openmpi && wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.1.tar.gz && \ 44 | tar zxf openmpi-4.0.1.tar.gz && \ 45 | cd openmpi-4.0.1 && \ 46 | ./configure --enable-orterun-prefix-by-default && \ 47 | make -j $(nproc) all && \ 48 | make install && \ 49 | ldconfig && \ 50 | rm -rf ${STAGE_DIR}/openmpi/ 51 | 52 | ############################################################################## 53 | # SCCL 54 | ############################################################################## 55 | 56 | # update NCCL in pytorch, install SCCL interpreter 57 | RUN pip uninstall torch -y 58 | 59 | RUN pip install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses 60 | 61 | RUN conda install -c pytorch magma-cuda111 -y 62 | 63 | ENV CMAKE_PREFIX_PATH=/opt/conda 64 | 65 | # Change NCCL to SCCL Runtime 66 | RUN cd ${STAGE_DIR} && \ 67 | git clone https://github.com/pytorch/pytorch.git && \ 68 | cd pytorch && \ 69 | git checkout tags/v1.9.0 -b v1.9.0_sccl && \ 70 | perl -p -i -e 's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules && \ 71 | git submodule sync third_party/nccl && \ 72 | git submodule update --init --recursive && \ 73 | git submodule update --init --recursive --remote third_party/nccl && \ 74 | cd third_party/nccl/nccl/ && \ 75 | git checkout master && \ 76 | cd ../../../ && \ 77 | git apply third_party/nccl/nccl/patches/nccl.cpp.patch && \ 78 | python setup.py install && \ 79 | cd ${STAGE_DIR} && \ 80 | rm -rf ${STAGE_DIR}/pytorch 81 | 82 | # Install SCCL 83 | RUN cd 
${STAGE_DIR}/ && \ 84 | git clone https://github.com/microsoft/sccl.git && \ 85 | cd sccl/ && python setup.py install && \ 86 | cd ${STAGE_DIR} && \ 87 | rm -rf ${STAGE_DIR}/sccl/ 88 | 89 | ############################################################################## 90 | # inspector-topo 91 | ############################################################################## 92 | 93 | RUN apt-get install libibverbs-dev libnuma-dev -y 94 | RUN cd ${STAGE_DIR}/ && git clone https://github.com/microsoft/inspector-topo.git && \ 95 | cd inspector-topo/ && make && make install 96 | -------------------------------------------------------------------------------- /ext-net/dummy/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-net/dummy/plugin.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | #include 9 | 10 | #define __hidden __attribute__ ((visibility("hidden"))) 11 | 12 | __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } 13 | __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } 14 | __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } 15 | __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } 16 | __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } 17 | __hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } 18 | __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } 19 | __hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } 20 | __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} 21 | __hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; } 22 | __hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; } 23 | __hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { return ncclInternalError; } 24 | __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } 25 | __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } 26 | __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } 27 | __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } 28 | 29 | ncclNet_t NCCL_PLUGIN_SYMBOL = { 30 | "Dummy", 
31 | pluginInit, 32 | pluginDevices, 33 | pluginPciPath, 34 | pluginPtrSupport, 35 | pluginListen, 36 | pluginConnect, 37 | pluginAccept, 38 | pluginRegMr, 39 | pluginDeregMr, 40 | pluginIsend, 41 | pluginIrecv, 42 | pluginFlush, 43 | pluginTest, 44 | pluginCloseSend, 45 | pluginCloseRecv, 46 | pluginCloseListen 47 | }; 48 | 49 | __hidden ncclResult_t pluginCollNetInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } 50 | __hidden ncclResult_t pluginCollNetDevices(int* ndev) { *ndev = 0; return ncclSuccess; } 51 | __hidden ncclResult_t pluginCollNetPciPath(int dev, char** path) { return ncclInternalError; } 52 | __hidden ncclResult_t pluginCollNetPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } 53 | __hidden ncclResult_t pluginCollNetListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } 54 | __hidden ncclResult_t pluginCollNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { return ncclInternalError; } 55 | __hidden ncclResult_t pluginCollNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { return ncclInternalError; } 56 | __hidden ncclResult_t pluginCollNetRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } 57 | __hidden ncclResult_t pluginCollNetDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} 58 | __hidden ncclResult_t pluginCollNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { return ncclInternalError; } 59 | __hidden ncclResult_t pluginCollNetFlush(void* collComm, void* data, int size, void* mhandle) { return ncclInternalError; } 60 | __hidden ncclResult_t pluginCollNetTest(void* request, int* done, int* size) { return ncclInternalError; } 61 | __hidden ncclResult_t pluginCollNetCloseColl(void* collComm) { return ncclInternalError; } 62 | __hidden ncclResult_t 
pluginCollNetCloseListen(void* listenComm) { return ncclInternalError; } 63 | 64 | ncclCollNet_t NCCL_COLLNET_PLUGIN_SYMBOL = { 65 | "Dummy", 66 | pluginCollNetInit, 67 | pluginCollNetDevices, 68 | pluginCollNetPciPath, 69 | pluginCollNetPtrSupport, 70 | pluginCollNetListen, 71 | pluginCollNetConnect, 72 | pluginCollNetReduceSupport, 73 | pluginCollNetRegMr, 74 | pluginCollNetDeregMr, 75 | pluginCollNetIallreduce, 76 | pluginCollNetFlush, 77 | pluginCollNetTest, 78 | pluginCollNetCloseColl, 79 | pluginCollNetCloseListen 80 | }; 81 | -------------------------------------------------------------------------------- /ext-net/google-fastsocket/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME?=/usr/local/cuda 2 | INC:=-I$(CUDA_HOME)/include 3 | PLUGIN_SO:=libnccl-net.so 4 | 5 | default: $(PLUGIN_SO) 6 | 7 | $(PLUGIN_SO): nccl-fastsocket/net_fastsocket.cc nccl-fastsocket/compat.cc nccl-fastsocket/utilities.cc 8 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 9 | 10 | nccl-fastsocket/%.cc: 11 | git clone https://github.com/google/nccl-fastsocket.git 12 | 13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) 14 | 15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) 16 | @printf "Grabbing %-35s > %s\n" $< $@ 17 | mkdir -p $(BUILDDIR)/lib 18 | install -m 644 $< $@ 19 | 20 | clean: 21 | rm -f $(PLUGIN_SO) 22 | rm -Rf nccl-fastsocket 23 | -------------------------------------------------------------------------------- /makefiles/common.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | CUDA_HOME ?= /usr/local/cuda 8 | PREFIX ?= /usr/local 9 | VERBOSE ?= 0 10 | KEEP ?= 0 11 | DEBUG ?= 0 12 | TRACE ?= 0 13 | PROFAPI ?= 0 14 | NVTX ?= 1 15 | NPKIT ?= 0 16 | 17 | NVCC = $(CUDA_HOME)/bin/nvcc 18 | 19 | CUDA_LIB ?= $(CUDA_HOME)/lib64 20 | CUDA_INC ?= $(CUDA_HOME)/include 21 | CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) 22 | #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) 23 | CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) 24 | CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) 25 | #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) 26 | 27 | # You should define NVCC_GENCODE in your environment to the minimal set 28 | # of archs to reduce compile time. 29 | CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \ 30 | -gencode=arch=compute_50,code=sm_50 \ 31 | -gencode=arch=compute_60,code=sm_60 \ 32 | -gencode=arch=compute_61,code=sm_61 33 | CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 34 | CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 35 | 36 | CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 37 | CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 38 | CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 39 | 40 | # Include Ampere support if we're using CUDA11 or above 41 | ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) 42 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) 43 | # Include Volta support if we're using CUDA9 or above 44 | else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) 45 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) 46 | else 47 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) 48 | endif 49 | #$(info NVCC_GENCODE is ${NVCC_GENCODE}) 50 | 51 | CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC 
-fvisibility=hidden \ 52 | -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ 53 | -I $(CUDA_INC) \ 54 | $(CXXFLAGS) 55 | # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) 56 | # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 57 | # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 58 | NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all 59 | # Use addprefix so that we can specify more than one path 60 | NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt 61 | 62 | ########## GCOV ########## 63 | GCOV ?= 0 # disable by default. 64 | GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 65 | CXXFLAGS += ${GCOV_FLAGS} 66 | NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} 67 | LDFLAGS += ${GCOV_FLAGS} 68 | NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} 69 | # $(warning GCOV_FLAGS=${GCOV_FLAGS}) 70 | ########## GCOV ########## 71 | 72 | ifeq ($(DEBUG), 0) 73 | NVCUFLAGS += -O3 74 | CXXFLAGS += -O3 -g 75 | else 76 | NVCUFLAGS += -O0 -G -g 77 | CXXFLAGS += -O0 -g -ggdb3 78 | endif 79 | 80 | ifneq ($(VERBOSE), 0) 81 | NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter 82 | CXXFLAGS += -Wall -Wextra 83 | else 84 | .SILENT: 85 | endif 86 | 87 | ifneq ($(TRACE), 0) 88 | CXXFLAGS += -DENABLE_TRACE 89 | endif 90 | 91 | ifeq ($(NVTX), 0) 92 | CXXFLAGS += -DNVTX_DISABLE 93 | endif 94 | 95 | ifeq ($(NPKIT), 1) 96 | CXXFLAGS += -DENABLE_NPKIT 97 | NVCUFLAGS += -DENABLE_NPKIT 98 | endif 99 | 100 | ifneq ($(KEEP), 0) 101 | NVCUFLAGS += -keep 102 | endif 103 | 104 | ifneq ($(PROFAPI), 0) 105 | CXXFLAGS += -DPROFAPI 106 | endif 107 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA 
CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 10 | 11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none 12 | ASTYLEDIR := $(BUILDDIR)/contrib 13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz 14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle 15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ 16 | ASTYLEVER := 3.1 17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" 18 | 19 | $(ASTYLEDIR) : 20 | @mkdir -p $(ASTYLEDIR) 21 | 22 | $(ASTYLETAR) : $(ASTYLEDIR) 23 | @wget -q -O $(ASTYLETAR) $(ASTYLEURL) 24 | 25 | $(ASTYLEBLD) : $(ASTYLETAR) 26 | @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) 27 | 28 | $(ASTYLEBIN) : $(ASTYLEBLD) 29 | ${MAKE} -C $(ASTYLEBLD) 30 | 31 | .PHONY : format 32 | format : $(ASTYLEBIN) 33 | @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) 34 | -------------------------------------------------------------------------------- /makefiles/version.mk: -------------------------------------------------------------------------------- 1 | ##### version 2 | NCCL_MAJOR := 2 3 | NCCL_MINOR := 12 4 | NCCL_PATCH := 12 5 | NCCL_SUFFIX := 6 | PKG_REVISION := 1 7 | MSCCL_VERSION := 0.1 8 | -------------------------------------------------------------------------------- /patches/nccl.cpp.patch: -------------------------------------------------------------------------------- 1 | diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp 2 | index 0248e81496..275154c5ce 100644 3 | --- a/torch/csrc/cuda/nccl.cpp 4 | +++ b/torch/csrc/cuda/nccl.cpp 5 | @@ 
-640,12 +640,22 @@ void all2all_single_equal_split(at::Tensor& input, 6 | #if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && (NCCL_MAJOR * 10 + NCCL_MINOR) >= 27 7 | using namespace torch::cuda::nccl::detail; 8 | 9 | - int numranks; 10 | + //int numranks; 11 | auto type = to_nccl_data_type(input); 12 | size_t count = input.numel() / size; 13 | - size_t rankdiff = input.nbytes() / size; 14 | const auto* sendbuff = reinterpret_cast(input.data_ptr()); 15 | auto* recvbuff = reinterpret_cast(output.data_ptr()); 16 | + NCCL_CHECK(ncclAllToAll( 17 | + sendbuff, 18 | + recvbuff, 19 | + count, 20 | + type, 21 | + to_nccl_comm(_comm), 22 | + stream)); 23 | + 24 | + /* 25 | + //size_t rankdiff = input.nbytes() / size; 26 | + 27 | auto comm = to_nccl_comm(_comm); 28 | NCCL_CHECK(ncclCommCount(comm, &numranks)); 29 | NCCL_CHECK(ncclGroupStart()); 30 | @@ -658,6 +668,7 @@ void all2all_single_equal_split(at::Tensor& input, 31 | } 32 | } 33 | NCCL_CHECK(ncclGroupEnd()); 34 | + */ 35 | #else 36 | AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); 37 | #endif 38 | -------------------------------------------------------------------------------- /patches/torch1.12.nccl.cpp.patch: -------------------------------------------------------------------------------- 1 | diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp 2 | index 5817449c1a..edc4f7781a 100644 3 | --- a/torch/csrc/cuda/nccl.cpp 4 | +++ b/torch/csrc/cuda/nccl.cpp 5 | @@ -650,7 +650,7 @@ void all2all_single_equal_split(at::Tensor& input, 6 | const auto* sendbuff = reinterpret_cast(input.data_ptr()); 7 | auto* recvbuff = reinterpret_cast(output.data_ptr()); 8 | auto comm = to_nccl_comm(_comm); 9 | -#if defined(USE_ROCM) && ROCM_VERSION >= 50000 10 | +#if 1 11 | NCCL_CHECK(ncclAllToAll(sendbuff , recvbuff , count, type, comm, stream)); 12 | #else 13 | NCCL_CHECK(ncclCommCount(comm, &numranks)); 14 | -------------------------------------------------------------------------------- 
/patches/torch1.13.nccl.cpp.patch: -------------------------------------------------------------------------------- 1 | diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp 2 | index 83729084ae..934bf24ea4 100644 3 | --- a/torch/csrc/cuda/nccl.cpp 4 | +++ b/torch/csrc/cuda/nccl.cpp 5 | @@ -655,7 +655,7 @@ void all2all_single_equal_split( 6 | const auto* sendbuff = reinterpret_cast(input.data_ptr()); 7 | auto* recvbuff = reinterpret_cast(output.data_ptr()); 8 | auto comm = to_nccl_comm(_comm); 9 | -#if defined(USE_ROCM) && ROCM_VERSION >= 50000 10 | +#if 1 11 | NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); 12 | #else 13 | NCCL_CHECK(ncclCommCount(comm, &numranks)); 14 | -------------------------------------------------------------------------------- /pkg/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : build 9 | build : debian.build txz.build 10 | 11 | BUILDDIR ?= $(abspath ../build) 12 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 13 | TARGETS := debian txz 14 | all: ${TARGETS:%=%.build} 15 | prep: ${TARGETS:%=%.prep} 16 | build: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.prep: 20 | ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} 21 | 22 | %.build: 23 | ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} 24 | 25 | %.clean: 26 | ${MAKE} -C $* clean 27 | -------------------------------------------------------------------------------- /pkg/debian/.gitignore: -------------------------------------------------------------------------------- 1 | /*.debhelper.log 2 | /*.debhelper 3 | /*.substvars 4 | /tmp/ 5 | /files 6 | /libnccl1/ 7 | /libnccl-dev/ 8 | -------------------------------------------------------------------------------- /pkg/debian/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | DEBPREPDIR := $(BUILDDIR)/debian 11 | PKGDIR := $(BUILDDIR)/pkg/deb/ 12 | 13 | DEBGEN_IN := $(wildcard *.in) 14 | DEBGEN := $(DEBGEN_IN:.in=) 15 | DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) 16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) 20 | PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) 21 | 22 | prep : $(DEBTARGETS) 23 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 24 | 25 | build : prep 26 | $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) 27 | @printf "Building Debian package\n" 28 | (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) 29 | mkdir -p $(PKGDIR) 30 | mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ 31 | 32 | clean: 33 | rm -Rf $(DEBPREPDIR) $(PKGDIR) 34 | 35 | $(DEBPREPDIR)/% : %.in 36 | @printf "Generating %-35s > %s\n" $< $@ 37 | mkdir -p $(DEBPREPDIR) 38 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 39 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 40 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 41 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 42 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 43 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 44 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 45 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 46 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 47 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 48 | $< > $@ 49 | 50 | $(DEBPREPDIR)/% : % 51 | @printf "Grabbing %-35s > %s\n" $< $@ 52 | mkdir -p $(DEBPREPDIR) 53 | cp -f $< $@ 54 | -------------------------------------------------------------------------------- /pkg/debian/changelog.in: 
-------------------------------------------------------------------------------- 1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium 2 | 3 | * Automatic Debian package from build 4 | 5 | -- cudatools ${pkg:Timestamp} 6 | -------------------------------------------------------------------------------- /pkg/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /pkg/debian/control.in: -------------------------------------------------------------------------------- 1 | Source: nccl 2 | Section: libs 3 | Maintainer: cudatools 4 | Priority: optional 5 | Build-depends: debhelper(>=9) 6 | Standards-Version: 3.9.5 7 | 8 | Package: libnccl${nccl:Major} 9 | Section: libs 10 | Architecture: ${pkg:Arch} 11 | Depends: ${misc:Depends}, ${shlibs:Depends} 12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | Package: libnccl-dev 21 | Section: libdevel 22 | Architecture: ${pkg:Arch} 23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) 24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files 25 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 26 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 27 | broadcast, and reduce-scatter. 
28 | It has been optimized to achieve high bandwidth on any platform using PCIe, 29 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 30 | sockets. 31 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 13 | Laboratory, the U.S. Department of Energy, nor the names of their 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | The U.S. Department of Energy funded the development of this software 30 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 31 | 32 | 33 | This code also includes files from the NVIDIA Tools Extension SDK project. 34 | 35 | See: 36 | 37 | https://github.com/NVIDIA/NVTX 38 | 39 | for more information and license details. 40 | -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | include/nccl.h /usr/include 2 | include/nccl_net.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | 
-------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /pkg/redhat/nccl.spec.in: -------------------------------------------------------------------------------- 1 | Name: libnccl 2 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} 3 | Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} 4 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 5 | 6 | Group: Development/Libraries 7 | License: BSD 8 | URL: http://developer.nvidia.com/nccl 9 | Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz 10 | Requires(pre,preun): /sbin/ldconfig 11 | 12 | %description 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, 
all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | %package devel 21 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 22 | Group: Development/Libraries 23 | %description devel 24 | NCCL development files 25 | 26 | %package static 27 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 28 | Group: Development/Libraries 29 | %description static 30 | NCCL static library 31 | 32 | %define debug_package %{nil} 33 | 34 | %prep 35 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q 36 | 37 | %build 38 | 39 | %install 40 | rm -rf $RPM_BUILD_ROOT 41 | install -m 755 -d $RPM_BUILD_ROOT 42 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} 43 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} 44 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} 45 | 46 | # devel 47 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} 48 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} 49 | install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} 50 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so 51 | 52 | # static 53 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} 54 | 55 | %post -p /sbin/ldconfig 56 | %postun -p /sbin/ldconfig 57 | 58 | %post devel -p /sbin/ldconfig 59 | %postun devel -p /sbin/ldconfig 60 | 61 | %clean 62 | rm -rf $RPM_BUILD_ROOT 63 | 64 | %files devel 65 | %doc LICENSE.txt 66 | %defattr(-,root,root,-) 67 | %{_includedir}/nccl.h 68 | %{_includedir}/nccl_net.h 69 | %{_libdir}/libnccl.so 70 | 71 | %files static 72 | %doc LICENSE.txt 73 | %defattr(-,root,root,-) 74 | %{_libdir}/libnccl_static.a 75 | 76 | %files 77 | 
%doc LICENSE.txt 78 | %defattr(-,root,root,-) 79 | %{_libdir}/libnccl.so.${nccl:Major} 80 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} 81 | 82 | %changelog 83 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. 
11 | NCCLDIR=`basename $PWD` 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommited files ..." 17 | if [ "`git status -s | wc -l`" != "0" ]; then 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read 21 | fi 22 | 23 | cd .. 24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR 36 | -------------------------------------------------------------------------------- /pkg/txz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/txz 11 | PKGDIR := $(BUILDDIR)/pkg/txz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_ARCH := $(shell uname -m) 18 | 19 | prep: $(TXZTARGETS) 20 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 21 | 22 | build: prep 23 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../makefiles/common.mk 8 | include ../makefiles/version.mk 9 | 10 | ##### src files 11 | INCEXPORTS := nccl.h nccl_net.h 12 | LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \ 13 | misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/npkit.cc \ 14 | transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ 15 | collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc collectives/all_to_all.cc collectives/custom_collective.cc \ 16 | graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc 17 | 18 | ##### lib files 19 | LIBNAME := libnccl.so 20 | STATICLIBNAME := libnccl_static.a 21 | ##### pkgconfig files 22 | PKGCONFIGFILE := nccl.pc 23 | ##### dirs 24 | BUILDDIR ?= $(abspath ../build) 25 | INCDIR := $(BUILDDIR)/include 26 | LIBDIR := $(BUILDDIR)/lib 27 | OBJDIR := $(BUILDDIR)/obj 28 | PKGDIR := $(BUILDDIR)/lib/pkgconfig 29 | ##### target files 30 | CUDARTLIB ?= cudart_static 31 | INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) 32 | LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) 33 | LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) 34 | STATICLIBTARGET := $(STATICLIBNAME) 35 | PKGTARGET := $(PKGCONFIGFILE) 36 | LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) 37 | DEPFILES := $(LIBOBJ:%.o=%.d) 38 | LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl 39 | 40 | DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a 41 | 42 | ##### rules 43 | build : lib staticlib 44 | 45 | lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) 46 | 47 | staticlib : $(LIBDIR)/$(STATICLIBTARGET) 48 | 49 | 
$(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS) 50 | $(MAKE) -C collectives/device 51 | 52 | # Empty target to force rebuild 53 | ALWAYS_REBUILD: 54 | 55 | -include $(DEPFILES) 56 | $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) 57 | 58 | $(INCDIR)/nccl.h : nccl.h.in 59 | # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z)) 60 | @$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) 61 | mkdir -p $(INCDIR) 62 | @printf "Generating %-35s > %s\n" $< $@ 63 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 64 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 65 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 66 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 67 | -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ 68 | $< > $@ 69 | 70 | $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) 71 | @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ 72 | mkdir -p $(LIBDIR) 73 | $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS) 74 | ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) 75 | ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) 76 | 77 | null := 78 | space := $(null) # 79 | comma := , 80 | 81 | $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) 82 | @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ 83 | mkdir -p $(LIBDIR) 84 | printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M 85 | 86 | $(PKGDIR)/nccl.pc : nccl.pc.in 87 | mkdir -p $(PKGDIR) 88 | @printf "Generating %-35s > %s\n" $< $@ 89 | sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ 90 | -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 91 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 92 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 93 | $< > $@ 94 | 95 | $(INCDIR)/%.h : %.h 96 | @printf "Grabbing %-35s > %s\n" $< $@ 97 | mkdir -p $(INCDIR) 98 | install -m 644 $< $@ 99 | 100 | $(INCDIR)/nccl_%.h : include/nccl_%.h 101 | @printf "Grabbing %-35s > %s\n" $< $@ 102 | mkdir -p $(INCDIR) 103 | install -m 644 
$< $@ 104 | 105 | $(PKGDIR)/%.pc : %.pc 106 | @printf "Grabbing %-35s > %s\n" $< $@ 107 | mkdir -p $(PKGDIR) 108 | install -m 644 $< $@ 109 | 110 | $(OBJDIR)/%.o : %.cc $(INCTARGETS) 111 | @printf "Compiling %-35s > %s\n" $< $@ 112 | mkdir -p `dirname $@` 113 | $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ 114 | @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) 115 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) 116 | @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ 117 | sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) 118 | @rm -f $(@:%.o=%.d.tmp) 119 | 120 | clean : 121 | $(MAKE) -C collectives/device clean 122 | rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} 123 | 124 | install : build 125 | mkdir -p $(PREFIX)/lib 126 | mkdir -p $(PREFIX)/lib/pkgconfig 127 | mkdir -p $(PREFIX)/include 128 | cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ 129 | cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ 130 | cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ 131 | 132 | FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') 133 | # Note that formatting.mk defines a new target so in order to not overwrite the default target, 134 | # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well 135 | # as the BUILDDIR variable. 136 | include ../makefiles/formatting.mk 137 | -------------------------------------------------------------------------------- /src/channel.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "channel.h" 8 | #include "param.h" 9 | #include "gdrwrap.h" 10 | 11 | // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory 12 | NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); 13 | 14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid) { 15 | struct ncclChannel* channel = comm->channels+channelid; 16 | if (channel->id != -1) return ncclSuccess; 17 | channel->id = channelid; 18 | 19 | // Ring index to user rank table. 20 | NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); 21 | NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); 22 | 23 | // Communication structures with peers. 24 | NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) 25 | NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); 26 | for (size_t i=0; inRanks+1; ++i) { 27 | for (int b=0; bpeers[i].send[b].comm = comm; 29 | channel->peers[i].recv[b].comm = comm; 30 | } 31 | } 32 | 33 | // Per-channel operation list. 
34 | NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); 35 | if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { 36 | // GDRCOPY support 37 | // We allocate a workFifo in GDR mapped CUDA memory 38 | // But we still allocate the Host workFifo so that we 39 | // can copy the work elements to CUDA memory on kernel launch 40 | NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); 41 | } else { 42 | // The device workFifo is the Host one 43 | channel->workFifoDev = channel->workFifo; 44 | } 45 | 46 | return ncclSuccess; 47 | } 48 | 49 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { 50 | if (channel->id == -1) return ncclSuccess; 51 | // Operation list 52 | NCCLCHECK(ncclCudaHostFree(channel->workFifo)); 53 | if (channel->gdrMemDesc) { 54 | // GDRCOPY support 55 | NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc)); 56 | } 57 | 58 | // Free Ring index to rank tables 59 | free(channel->ring.userRanks); 60 | CUDACHECK(cudaFree(channel->ring.devUserRanks)); 61 | 62 | // Free transport proxy resources 63 | // Note: free all send resources first due to CollNet arrangement 64 | for (int r=0; rpeers+r; 66 | for (int b=0; bsend[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); 68 | } 69 | } 70 | for (int r=0; rpeers+r; 72 | for (int b=0; brecv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); 74 | } 75 | } 76 | 77 | // Free the peer structures. 78 | CUDACHECK(cudaFree(channel->devPeers)); 79 | free(channel->peers); 80 | 81 | return ncclSuccess; 82 | } 83 | -------------------------------------------------------------------------------- /src/collectives/all_gather.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, 11 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, 13 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncAllGather, "AllGather", 16 | sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ 17 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/all_reduce.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | 9 | NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, 10 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 11 | ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, 12 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 13 | NVTX3_FUNC_RANGE_IN(nccl_domain); 14 | struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", 15 | sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ 16 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; 17 | info.algorithm = -1; 18 | return ncclEnqueueCheck(&info); 19 | } 20 | -------------------------------------------------------------------------------- /src/collectives/broadcast.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 11 | ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 13 | ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", 16 | sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ 17 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | /* Deprecated original "in place" function, similar to MPI */ 22 | NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, 23 | ncclComm_t comm, cudaStream_t stream); 24 | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, 25 | ncclComm_t comm, cudaStream_t stream) { 26 | return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/collectives/custom_collective.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclCustomCollective, const void* sendbuff, void* recvbuff, size_t count, 11 | ncclDataType_t datatype, ncclRedOp_t op, int mscclAlgorithmIndex, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclCustomCollective(const void* sendbuff, void* recvbuff, size_t count, 13 | ncclDataType_t datatype, ncclRedOp_t op, int mscclAlgorithmIndex, ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncCustomCollective, "CustomCollective", 16 | sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ 17 | MSCCL_CHUNKSTEPS, MSCCL_SLICESTEPS }; 18 | info.mscclInfo.mscclAlgoIndex = mscclAlgorithmIndex; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../../makefiles/common.mk 8 | include ../../../makefiles/version.mk 9 | 10 | BUILDDIR ?= $(abspath ../../../build) 11 | OBJDIR := $(BUILDDIR)/obj/collectives/device 12 | 13 | LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu all_to_all.cu custom_collective.cu 14 | 15 | LIBSRCFILES += functions.cu 16 | 17 | DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) 18 | DEPENDFILES:= $(DEPFILES:%.d=%.dep) 19 | STATICLIB := $(OBJDIR)/colldevice.a 20 | DEVOBJ := $(OBJDIR)/devlink.o 21 | RULESFILE := $(OBJDIR)/Makefile.rules 22 | 23 | NVCUFLAGS += -I. -I.. 
-I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" 24 | 25 | 26 | all: $(STATICLIB) 27 | 28 | # Dummy rule so that the extra dependency (%.dep) files are preserved by make 29 | all_deps: $(DEPENDFILES) 30 | 31 | # Auto-generating the rules per op/reduction/datatype/algorithm 32 | $(RULESFILE) : 33 | @printf "Generating %-35s > %s\n" rules $@ 34 | @mkdir -p $(OBJDIR) 35 | @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ 36 | 37 | -include $(RULESFILE) 38 | 39 | LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o 40 | 41 | CUSTOMLIBOBJ := $(OBJDIR)/stride_copy_lib.o 42 | CUSTOMDEVOBJ := $(OBJDIR)/stride_copy_dev.o 43 | 44 | -include $(DEPFILES) 45 | 46 | $(STATICLIB): $(LIBOBJ) $(DEVOBJ) $(CUSTOMLIBOBJ) $(CUSTOMDEVOBJ) 47 | @printf "Archiving %-35s > %s\n" objects $@ 48 | ar cr $@ $^ 49 | 50 | # We do not want make to build *.d when running make clean. 51 | # So we only provide targets for .dep which will produce .dep and .d, 52 | # with only .d being included, and .dep keeping track of what needs to 53 | # be regenerated. 54 | $(OBJDIR)/%.dep : %.cu 55 | @mkdir -p $(OBJDIR) 56 | @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp 57 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ 58 | @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ 59 | sed -e 's/^ *//' -e 's/$$/:/' >> $@ 60 | @rm -f $@.tmp 61 | @cp $@ $(@:.dep=.d) 62 | 63 | # Compiled kernels and collectives with relocatable device code ... 
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

$(OBJDIR)/%_lib.o : %.cu
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

$(OBJDIR)/%_dev.o : $(OBJDIR)/%_lib.o
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dlink $< -o $@

# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@

# Fix: also remove $(CUSTOMLIBOBJ) and $(CUSTOMDEVOBJ) (the stride_copy
# objects).  They are built by this Makefile and archived into $(STATICLIB),
# but were previously left behind by "make clean".
clean:
	rm -f $(LIBOBJ) $(DEVOBJ) $(CUSTOMLIBOBJ) $(CUSTOMDEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
--------------------------------------------------------------------------------
/src/collectives/device/all_gather.cu:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "all_gather.h"
#include "common.h"
#include "collectives.h"

IMPL_COLL_C(AllGather);
--------------------------------------------------------------------------------
/src/collectives/device/all_gather.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | const int *ringRanks = ring->devUserRanks; 21 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); 22 | // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 23 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); 24 | const int nranks = ncclShmem.comm.nRanks; 25 | const ssize_t loopSize = nChannels*int(chunkSize); 26 | const ssize_t size = args->count; 27 | 28 | T *inputBuf = (T*)args->sendbuff; 29 | T *outputBuf = (T*)args->recvbuff; 30 | Primitives, 1, Proto, 0> prims 31 | (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); 32 | 33 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 34 | ssize_t realChunkSize; 35 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 36 | realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels)); 37 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 38 | } 39 | else if (Proto::Id == NCCL_PROTO_LL) 40 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 41 | else if (Proto::Id == NCCL_PROTO_LL128) 42 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); 43 | realChunkSize = int(realChunkSize); 44 | 45 | ssize_t chunkOffset = gridOffset + int(bid*realChunkSize); 46 | 47 | /////////////// begin AllGather steps /////////////// 48 | ssize_t offset; 49 | int nelem = min(realChunkSize, size-chunkOffset); 50 | int rankDest; 51 | 52 | // step 0: push data to next GPU 53 | rankDest = ringRanks[0]; 54 | offset = chunkOffset + rankDest * size; 55 | 56 | if (inputBuf + chunkOffset == outputBuf + offset) { // In place 57 | prims.directSend(chunkOffset, offset, nelem); 58 | } else { 59 | prims.directCopySend(chunkOffset, offset, offset, nelem); 60 | } 61 | 62 | // k-2 steps: copy to next GPU 63 | for (int j=1; j 81 | struct RunWorkElement { 82 | __device__ __forceinline__ void run(ncclWorkElem *args) { 83 | using Proto = ProtoSimple; 84 | runRing(args); 85 | } 86 | }; 87 | 88 | template 89 | struct RunWorkElement { 90 | __device__ __forceinline__ void run(ncclWorkElem *args) { 91 | runRing(args); 92 | } 93 | }; 94 | 95 | template 96 | struct RunWorkElement { 97 | __device__ __forceinline__ void run(ncclWorkElem *args) { 98 | runRing(args); 99 | } 100 | }; 101 | 102 | template 103 | struct RunWorkElement { 104 | __device__ __forceinline__ void run(ncclWorkElem *args) { 105 | using Proto = ProtoSimple; 106 | runInterpreter(args, ncclShmem.comm.nRanks); 107 | } 108 | }; 109 | 110 | template 111 | struct RunWorkElement { 112 | __device__ __forceinline__ void run(ncclWorkElem *args) { 113 | runInterpreter(args, ncclShmem.comm.nRanks); 114 | } 115 | }; 116 | 117 | template 118 | struct RunWorkElement { 119 | __device__ __forceinline__ void run(ncclWorkElem *args) { 120 | runInterpreter(args, ncclShmem.comm.nRanks); 121 | } 122 | }; -------------------------------------------------------------------------------- /src/collectives/device/all_reduce.cu: 
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "all_reduce.h"
#include "common.h"
#include "collectives.h"

// Instantiate the AllReduce device kernels; the concrete op/datatype pair is
// presumably selected per translation unit via -DNCCL_OP/-DNCCL_TYPE (see
// gen_rules.sh) -- confirm against the IMPL_COLL_R definition in common.h.
IMPL_COLL_R(AllReduce);
--------------------------------------------------------------------------------
/src/collectives/device/all_to_all.cu:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "all_to_all.h"
#include "common.h"
#include "collectives.h"

// AllToAll performs no reduction, hence the IMPL_COLL_C (copy) instantiation
// used here rather than IMPL_COLL_R.
IMPL_COLL_C(AllToAll);
--------------------------------------------------------------------------------
/src/collectives/device/all_to_all.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "primitives.h" 9 | #include "collectives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | template 13 | struct RunWorkElement { 14 | __device__ __forceinline__ void run(ncclWorkElem *args) { 15 | using Proto = ProtoSimple; 16 | runInterpreter(args, ncclShmem.comm.nRanks); 17 | } 18 | }; 19 | 20 | template 21 | struct RunWorkElement { 22 | __device__ __forceinline__ void run(ncclWorkElem *args) { 23 | runInterpreter(args, ncclShmem.comm.nRanks); 24 | } 25 | }; 26 | 27 | template 28 | struct RunWorkElement { 29 | __device__ __forceinline__ void run(ncclWorkElem *args) { 30 | runInterpreter(args, ncclShmem.comm.nRanks); 31 | } 32 | }; 33 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "broadcast.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_C(Broadcast); 12 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); 21 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); 22 | const ssize_t loopSize = nChannels*chunkSize; 23 | const ssize_t size = args->count; 24 | const int rank = ring->devUserRanks[0]; 25 | const int nextRank = ring->devUserRanks[1]; 26 | const int root = args->root; 27 | 28 | T *inputBuf = (T*)args->sendbuff; 29 | T *outputBuf = (T*)args->recvbuff; 30 | Primitives, 0, Proto, 0> 31 | prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); 32 | 33 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 34 | ssize_t realChunkSize; 35 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 36 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 37 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 38 | } 39 | else if (Proto::Id == NCCL_PROTO_LL) 40 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 41 | else if (Proto::Id == NCCL_PROTO_LL128) 42 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); 43 | realChunkSize = int(realChunkSize); 44 | 45 | ssize_t offset = gridOffset + int(bid*realChunkSize); 46 | int nelem = min(realChunkSize, size-offset); 47 | 48 | if (rank == root) { 49 | if (inputBuf == outputBuf) { 50 | prims.send(offset, nelem); 51 | } else { 52 | prims.copySend(offset, offset, nelem); 53 | } 54 | } else if (nextRank == root) { 55 | prims.recv(offset, nelem); 56 | } else { 57 | prims.recvCopySend(offset, nelem); 58 | } 59 | } 60 | } 61 | } 62 | 63 | template 64 | struct RunWorkElement { 65 | __device__ __forceinline__ void run(ncclWorkElem *args) { 66 | using Proto = ProtoSimple; 67 | runRing(args); 68 | } 69 | }; 70 | 71 | template 72 | struct RunWorkElement { 73 | __device__ __forceinline__ void run(ncclWorkElem *args) { 74 | runRing(args); 75 | } 76 | }; 77 | 78 | template 79 | struct RunWorkElement { 80 | __device__ __forceinline__ void run(ncclWorkElem *args) { 81 | runRing(args); 82 | } 83 | }; 84 | 85 | template 86 | struct RunWorkElement { 87 | __device__ __forceinline__ void run(ncclWorkElem *args) { 88 | using Proto = ProtoSimple; 89 | runInterpreter(args, 1); 90 | } 91 | }; 92 | 93 | template 94 | struct RunWorkElement { 95 | __device__ __forceinline__ void run(ncclWorkElem *args) { 96 | runInterpreter(args, 1); 97 | } 98 | }; 99 | 100 | template 101 | struct RunWorkElement { 102 | __device__ __forceinline__ void run(ncclWorkElem *args) { 103 | runInterpreter(args, 1); 104 | } 105 | }; -------------------------------------------------------------------------------- /src/collectives/device/custom_collective.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "custom_collective.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(CustomCollective); 12 | -------------------------------------------------------------------------------- /src/collectives/device/custom_collective.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "collectives.h" 8 | #include "primitives.h" 9 | #include "msccl_interpreter.h" 10 | 11 | template 12 | struct RunWorkElement { 13 | __device__ __forceinline__ void run(ncclWorkElem *args) { 14 | using Proto = ProtoSimple; 15 | runInterpreter(args, 1); 16 | } 17 | }; 18 | 19 | template 20 | struct RunWorkElement { 21 | __device__ __forceinline__ void run(ncclWorkElem *args) { 22 | runInterpreter(args, 1); 23 | } 24 | }; 25 | 26 | template 27 | struct RunWorkElement { 28 | __device__ __forceinline__ void run(ncclWorkElem *args) { 29 | runInterpreter(args, 1); 30 | } 31 | }; -------------------------------------------------------------------------------- /src/collectives/device/functions.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "common.h" 10 | 11 | __shared__ ncclShmemData ncclShmem; 12 | 13 | #define NCCL_FUNC5(func, algo, devredop, type, nullify) \ 14 | MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ 15 | MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ 16 | MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) 17 | 18 | #define NCCL_FUNC4(func, devredop, type, nullify) \ 19 | NCCL_FUNC5(func, TREE, devredop, type, nullify), \ 20 | NCCL_FUNC5(func, RING, devredop, type, nullify), \ 21 | NCCL_FUNC5(func, MSCCL, devredop, type, nullify), \ 22 | NCCL_FUNC5(func, COLLNET, devredop, type, nullify) 23 | 24 | #if defined(__CUDA_BF16_TYPES_EXIST__) 25 | // Must be consistent with ncclDataType_t 26 | #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ 27 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 28 | NCCL_FUNC4(func, devredop, uint8_t, 0), \ 29 | NCCL_FUNC4(func, devredop, int32_t, 0), \ 30 | NCCL_FUNC4(func, devredop, uint32_t, 0), \ 31 | NCCL_FUNC4(func, devredop, int64_t, 0), \ 32 | NCCL_FUNC4(func, devredop, uint64_t, 0), \ 33 | NCCL_FUNC4(func, devredop, half, nullForFloat), \ 34 | NCCL_FUNC4(func, devredop, float, nullForFloat), \ 35 | NCCL_FUNC4(func, devredop, double, nullForFloat), \ 36 | NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat) 37 | #define NCCL_FUNCS3B(func, devredop) \ 38 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 39 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 40 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 41 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 42 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 43 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 44 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 45 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 46 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 47 | 
NCCL_FUNC4(func, devredop, int8_t, 0) 48 | #else 49 | // Must be consistent with ncclDataType_t 50 | #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ 51 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 52 | NCCL_FUNC4(func, devredop, uint8_t, 0), \ 53 | NCCL_FUNC4(func, devredop, int32_t, 0), \ 54 | NCCL_FUNC4(func, devredop, uint32_t, 0), \ 55 | NCCL_FUNC4(func, devredop, int64_t, 0), \ 56 | NCCL_FUNC4(func, devredop, uint64_t, 0), \ 57 | NCCL_FUNC4(func, devredop, half, nullForFloat), \ 58 | NCCL_FUNC4(func, devredop, float, nullForFloat), \ 59 | NCCL_FUNC4(func, devredop, double, nullForFloat) 60 | #define NCCL_FUNCS3B(func, devredop) \ 61 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 62 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 63 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 64 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 65 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 66 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 67 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 68 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 69 | NCCL_FUNC4(func, devredop, int8_t, 0) 70 | #endif 71 | 72 | // Must be consistent with ncclRedOp_t 73 | #define NCCL_FUNCS2A(func) \ 74 | NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \ 75 | NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \ 76 | NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \ 77 | NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \ 78 | NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \ 79 | NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1) 80 | 81 | #define NCCL_FUNCS2B(func) \ 82 | NCCL_FUNCS3B(func, Sum), \ 83 | NCCL_FUNCS3B(func, Sum), \ 84 | NCCL_FUNCS3B(func, Sum), \ 85 | NCCL_FUNCS3B(func, Sum), \ 86 | NCCL_FUNCS3B(func, Sum), \ 87 | NCCL_FUNCS3B(func, Sum) 88 | 89 | // Must be consistent with the ncclFuncSet enum 90 | __device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { 91 | // Don't try to initialize the host shadow copy of this device-side global 92 | // 
variable. There is no host pointer to a device-side function, which 93 | // confuses clang. This will be fixed in the next clang release. 94 | #if __CUDA_ARCH__ 95 | NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), 96 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t), 97 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t), 98 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t), 99 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t), 100 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t), 101 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t), 102 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, half), 103 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, float), 104 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, double), 105 | #if defined(__CUDA_BF16_TYPES_EXIST__) 106 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16), 107 | #endif 108 | NCCL_FUNCS2B(Broadcast), 109 | NCCL_FUNCS2A(Reduce), 110 | NCCL_FUNCS2B(AllGather), 111 | NCCL_FUNCS2A(ReduceScatter), 112 | NCCL_FUNCS2A(AllReduce), 113 | NCCL_FUNCS2B(AllToAll), 114 | NCCL_FUNCS2A(CustomCollective) 115 | #endif 116 | }; 117 | 118 | // Workaround for https://reviews.llvm.org/D55580 119 | __device__ void ncclWorkaroundClangD55580() {} 120 | -------------------------------------------------------------------------------- /src/collectives/device/gen_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | dir=$1 9 | 10 | datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64" 11 | if [ "$CUDA_MAJOR" -ge 11 ] 12 | then 13 | datatypes+=" bf16" 14 | fi 15 | 16 | targets="GENOBJS := \\\\\n" 17 | 18 | for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter all_to_all custom_collective; do 19 | opn=0 20 | for op in sum prod min max premulsum sumpostdiv; do 21 | dtn=0 22 | # Order must match that of the ncclDataType_t enum 23 | for dt in ${datatypes}; do 24 | echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" 25 | echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" 26 | echo " mkdir -p ${dir}" 27 | echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" 28 | echo "" 29 | targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" 30 | dtn=$(($dtn + 1)) 31 | done 32 | opn=$(($opn + 1)) 33 | done 34 | done 35 | echo -e "$targets" 36 | -------------------------------------------------------------------------------- /src/collectives/device/onerank_reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "reduce_kernel.h" 10 | #include "common.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void oneRankReduce() { 15 | ncclWork *w = &ncclShmem.work; 16 | int tid = threadIdx.x; 17 | int tn = blockDim.x; 18 | #pragma unroll 1 19 | for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) { 20 | ncclWorkElem *we = &w->elems[e]; 21 | intptr_t eltN = we->count; 22 | int bid = we->bid; 23 | int bn = we->nChannels; 24 | T const *src = (T const*)we->sendbuff; 25 | T *dst = (T*)we->recvbuff; 26 | 27 | // each block/channel gets a roughly equal segment of 16 byte packs 28 | constexpr int EltPerPack = 16/sizeof(T); 29 | intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; 30 | intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); 31 | intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); 32 | i0 *= EltPerPack; 33 | i0 = i0 < eltN ? i0 : eltN; 34 | i1 *= EltPerPack; 35 | i1 = i1 < eltN ? 
i1 : eltN; 36 | src += i0; 37 | dst += i0; 38 | ReduceOrCopyMulti 39 | (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0); 40 | } 41 | } 42 | } 43 | 44 | #define INSTANTIATE(devredop, type) \ 45 | __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ 46 | oneRankReduce>(); \ 47 | } 48 | 49 | INSTANTIATE(PreMulSum, int8_t) 50 | INSTANTIATE(PreMulSum, uint8_t) 51 | INSTANTIATE(PreMulSum, int32_t) 52 | INSTANTIATE(PreMulSum, uint32_t) 53 | INSTANTIATE(PreMulSum, int64_t) 54 | INSTANTIATE(PreMulSum, uint64_t) 55 | INSTANTIATE(PreMulSum, half) 56 | #if defined(__CUDA_BF16_TYPES_EXIST__) 57 | INSTANTIATE(PreMulSum, __nv_bfloat16) 58 | #endif 59 | INSTANTIATE(PreMulSum, float) 60 | INSTANTIATE(PreMulSum, double) 61 | -------------------------------------------------------------------------------- /src/collectives/device/op128.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef OP128_H_ 8 | #define OP128_H_ 9 | 10 | inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { 11 | asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" 12 | : "=l"(v0), "=l"(v1) : "l"(ptr)); 13 | } 14 | 15 | inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { 16 | asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" 17 | :: "l"(v0), "l"(v1), "l"(ptr)); 18 | } 19 | 20 | inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { 21 | uint64_t* shmemAsmPtr; 22 | asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); 23 | return shmemAsmPtr; 24 | } 25 | 26 | inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { 27 | asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" 28 | : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); 29 | } 30 | 31 | inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { 32 | asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" 33 | :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); 34 | } 35 | 36 | template 37 | inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) { 38 | union { 39 | uint32_t tmp4[4]; 40 | uint64_t tmp8[2]; 41 | }; 42 | if(sizeof(T) < 4) { 43 | uint32_t *ptr4 = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); 44 | #pragma unroll 45 | for(int e=0; e < 4; e++) { 46 | // Produce 4 bytes of sub-register type by reading 2 4-byte 47 | // aligned values and shifting. 
48 | uint32_t lo, hi; 49 | asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0)); 50 | asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1)); 51 | tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast(ptr))%4)); 52 | } 53 | } 54 | else if(sizeof(T) == 4) { 55 | #pragma unroll 56 | for(int e=0; e < 4; e++) 57 | asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e)); 58 | } 59 | else /*sizeof(T)==8*/ { 60 | #pragma unroll 61 | for(int e=0; e < 2; e++) 62 | asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e)); 63 | } 64 | v0 = tmp8[0]; 65 | v1 = tmp8[1]; 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(Reduce); 12 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1)); 21 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); 22 | const int nranks = ncclShmem.comm.nRanks; 23 | const ssize_t loopSize = nChannels*chunkSize; 24 | const ssize_t size = args->count; 25 | const int rank = ncclShmem.comm.rank; 26 | const int prevRank = ring->devUserRanks[nranks-1]; 27 | const int root = args->root; 28 | 29 | Primitives, 0, Proto, 0> 30 | prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); 31 | 32 | auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { 33 | int realChunkSize; 34 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 35 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 36 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 37 | } 38 | else if (Proto::Id == NCCL_PROTO_LL) 39 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 40 | else if (Proto::Id == NCCL_PROTO_LL128) 41 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); 42 | return realChunkSize; 43 | }; 44 | 45 | if (prevRank == root) { 46 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 47 | int realChunkSize = calcChunkSize(gridOffset); 48 | ssize_t offset = gridOffset + bid*realChunkSize; 49 | int nelem = min(realChunkSize, size-offset); 50 | prims.send(offset, nelem); 51 | } 52 | } 53 | else if (rank == root) { 54 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 55 | int realChunkSize = calcChunkSize(gridOffset); 56 | ssize_t offset = gridOffset + bid*realChunkSize; 57 | int nelem = min(realChunkSize, size-offset); 58 | prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); 59 | } 60 | } 61 | else { 62 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 63 | int realChunkSize = calcChunkSize(gridOffset); 64 | ssize_t offset = gridOffset + bid*realChunkSize; 65 | int nelem = min(realChunkSize, size-offset); 66 | prims.recvReduceSend(offset, nelem); 67 | } 68 | } 69 | } 70 | } 71 | 72 | template 73 | struct RunWorkElement { 74 | __device__ __forceinline__ void run(ncclWorkElem *args) { 75 | using Proto = ProtoSimple; 76 | runRing(args); 77 | } 78 | }; 79 | 80 | template 81 | struct RunWorkElement { 82 | __device__ __forceinline__ void run(ncclWorkElem *args) { 83 | runRing(args); 84 | } 85 | }; 86 | 87 | template 88 | struct RunWorkElement { 89 | __device__ __forceinline__ void run(ncclWorkElem *args) { 90 | runRing(args); 91 | } 92 | }; 93 | 94 | template 95 | struct RunWorkElement { 96 | __device__ __forceinline__ void run(ncclWorkElem *args) { 97 | using Proto = ProtoSimple; 98 | runInterpreter(args, 1); 99 | } 100 | }; 101 | 102 | template 103 | struct RunWorkElement { 104 | __device__ __forceinline__ void run(ncclWorkElem *args) { 105 | runInterpreter(args, 
1); 106 | } 107 | }; 108 | 109 | template 110 | struct RunWorkElement { 111 | __device__ __forceinline__ void run(ncclWorkElem *args) { 112 | runInterpreter(args, 1); 113 | } 114 | }; -------------------------------------------------------------------------------- /src/collectives/device/reduce_scatter.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce_scatter.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(ReduceScatter); 12 | -------------------------------------------------------------------------------- /src/collectives/device/reduce_scatter.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | int const *ringRanks = ring->devUserRanks; 21 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1)); 22 | // We should not need the final /2 but it makes performance much, much smoother. 
Might be a bug somewhere. 23 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); 24 | const int nranks = ncclShmem.comm.nRanks; 25 | const ssize_t loopSize = nChannels*chunkSize; 26 | const ssize_t size = args->count; 27 | 28 | Primitives, 0, Proto, 0> 29 | prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); 30 | 31 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 32 | ssize_t realChunkSize; 33 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 34 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 35 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 36 | } 37 | else if (Proto::Id == NCCL_PROTO_LL) 38 | realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; 39 | else if (Proto::Id == NCCL_PROTO_LL128) 40 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); 41 | realChunkSize = int(realChunkSize); 42 | 43 | ssize_t chunkOffset = gridOffset + bid*int(realChunkSize); 44 | 45 | /////////////// begin ReduceScatter steps /////////////// 46 | ssize_t offset; 47 | int nelem = min(realChunkSize, size-chunkOffset); 48 | int rankDest; 49 | 50 | // step 0: push data to next GPU 51 | rankDest = ringRanks[nranks-1]; 52 | offset = chunkOffset + rankDest * size; 53 | prims.send(offset, nelem); 54 | 55 | // k-2 steps: reduce and copy to next GPU 56 | for (int j=2; j 71 | struct RunWorkElement { 72 | __device__ __forceinline__ void run(ncclWorkElem *args) { 73 | using Proto = ProtoSimple; 74 | runRing(args); 75 | } 76 | }; 77 | 78 | template 79 | struct RunWorkElement { 80 | __device__ __forceinline__ void run(ncclWorkElem *args) { 81 | runRing(args); 82 | } 83 | }; 84 | 85 | template 86 | struct RunWorkElement { 87 | __device__ __forceinline__ void run(ncclWorkElem *args) { 88 | runRing(args); 89 | } 90 | }; 91 | 92 | template 93 | struct RunWorkElement 
{ 94 | __device__ __forceinline__ void run(ncclWorkElem *args) { 95 | using Proto = ProtoSimple; 96 | runInterpreter(args, ncclShmem.comm.nRanks); 97 | } 98 | }; 99 | 100 | template 101 | struct RunWorkElement { 102 | __device__ __forceinline__ void run(ncclWorkElem *args) { 103 | runInterpreter(args, ncclShmem.comm.nRanks); 104 | } 105 | }; 106 | 107 | template 108 | struct RunWorkElement { 109 | __device__ __forceinline__ void run(ncclWorkElem *args) { 110 | runInterpreter(args, ncclShmem.comm.nRanks); 111 | } 112 | }; -------------------------------------------------------------------------------- /src/collectives/device/sendrecv.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "sendrecv.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_P(SendRecv); 12 | -------------------------------------------------------------------------------- /src/collectives/device/sendrecv.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "npkit/npkit.h" 11 | 12 | template 13 | struct RunWork { 14 | __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 15 | NPKIT_GPU_SYNC_TIME_SEND(blockIdx.x, tid); 16 | 17 | if (args->peer == ncclShmem.comm.rank) { 18 | struct ncclWorkElemP2p* recvArgs = args-1; 19 | if (args->buff != recvArgs->buff) { 20 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_SEND_ENTRY, args->count*sizeof(T)); 21 | 22 | ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count); 23 | 24 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_SEND_EXIT, args->count*sizeof(T)); 25 | } 26 | } else { 27 | using Proto = ProtoSimple<1, 1>; 28 | ssize_t const count = args->count; 29 | int const chunkSize = args->chunkSize/sizeof(T); 30 | int const peer = args->peer; 31 | Primitives, 1, Proto, 1> prims 32 | (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group); 33 | 34 | NPKIT_GPU_SET_CTX_ID(prims); 35 | 36 | ssize_t offset = 0; 37 | do { 38 | int nelem = min(chunkSize, count-offset); 39 | prims.directSend(offset, offset, nelem); 40 | offset += nelem; 41 | } while(offset < count); 42 | } 43 | } 44 | 45 | __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 46 | NPKIT_GPU_SYNC_TIME_RECV(blockIdx.x, tid); 47 | 48 | if (args->peer != ncclShmem.comm.rank) { 49 | using Proto = ProtoSimple<1, 1>; 50 | ssize_t const count = args->count; 51 | int const chunkSize = args->chunkSize/sizeof(T); 52 | int const peer = args->peer; 53 | Primitives, 1, Proto, 1> prims 54 | (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group); 55 | 56 | NPKIT_GPU_SET_CTX_ID(prims); 
57 | 58 | ssize_t offset = 0; 59 | do { 60 | int nelem = min(chunkSize, count-offset); 61 | 62 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_RECV_ENTRY, nelem*sizeof(T)); 63 | 64 | prims.directRecv(offset, nelem); 65 | 66 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_RECV_EXIT, nelem*sizeof(T)); 67 | 68 | offset += nelem; 69 | } while(offset < count); 70 | } 71 | } 72 | 73 | __device__ __forceinline__ void run(ncclWork *work) { 74 | struct ncclWorkElemP2p* args = work->p2pElems; 75 | int ngroups = args->ngroups; 76 | int tid = threadIdx.x; 77 | int wid = tid / WARP_SIZE; 78 | // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 79 | // warps for send, 2 warps for recv). 80 | // warpStarts were rounded thanks to int division, but for group number we need to round the other way around 81 | // So we mirror wid then mirror again the group. 82 | #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) 83 | int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; 84 | args += group; 85 | if (args->header.type == ncclWorkTypeUnused) return; 86 | 87 | tid -= args->warpStart * WARP_SIZE; 88 | int nthreads = args->nWarps * WARP_SIZE; 89 | group |= 1<<16; // Used to select connIndex 1 90 | if (tid >= nthreads || args->peer == -1) return; 91 | if ((group%2) == 0) { 92 | runRecv(tid, nthreads, group, args); 93 | } else { 94 | runSend(tid, nthreads, group, args); 95 | } 96 | } 97 | }; 98 | -------------------------------------------------------------------------------- /src/collectives/device/stride_copy.cu: -------------------------------------------------------------------------------- 1 | #include "devcomm.h" 2 | 3 | static int strideMemcpyGridsize = 0, strideMemcpyBlocksize = 0; 4 | 5 | // memory stride copy kernel 6 | template 7 | __global__ void strideMemcpyKernel(T *__restrict__ out, const T *__restrict__ in, const size_t size, const int height, const int width) { 8 | const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 9 | for (size_t i = tid; i < size * height * 
width; i += gridDim.x * blockDim.x) { 10 | const size_t index = i / size, offset = i % size; 11 | const size_t j = (width * (index % height) + (index / height)) * size + offset; 12 | out[j] = in[i]; 13 | } 14 | } 15 | 16 | cudaError_t strideMemcpyAsync(void *dst, const void *src, const size_t size, const int height, const int width, cudaStream_t stream) { 17 | if (strideMemcpyGridsize == 0 || strideMemcpyBlocksize == 0) 18 | cudaOccupancyMaxPotentialBlockSize(&strideMemcpyGridsize, &strideMemcpyBlocksize, strideMemcpyKernel); 19 | 20 | if (size < sizeof(uint4)) 21 | strideMemcpyKernel<<>>((char*)dst, (char*)src, size, height, width); 22 | else 23 | strideMemcpyKernel<<>>((uint4*)dst, (uint4*)src, size/sizeof(uint4), height, width); 24 | return cudaSuccess; 25 | } 26 | -------------------------------------------------------------------------------- /src/collectives/reduce.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, 11 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, 13 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncReduce, "Reduce", 16 | sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ 17 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/reduce_scatter.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, 11 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 12 | ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, 13 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", 16 | sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ 17 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/sendrecv.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "argcheck.h" // Need some checks here since we access comm 10 | 11 | NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 12 | ncclComm_t comm, cudaStream_t stream); 13 | ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 14 | ncclComm_t comm, cudaStream_t stream) { 15 | NVTX3_FUNC_RANGE_IN(nccl_domain); 16 | struct ncclInfo info = { ncclFuncSend, "Send", 17 | NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 18 | 1, 1 }; 19 | info.algorithm = -1; 20 | ncclResult_t ret; 21 | NCCLCHECK(ncclGroupStart()); 22 | ret = ncclEnqueueCheck(&info); 23 | NCCLCHECK(ncclGroupEnd()); 24 | return ret; 25 | } 26 | 27 | NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 28 | ncclComm_t comm, cudaStream_t stream); 29 | ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 30 | ncclComm_t comm, cudaStream_t stream) { 31 | NVTX3_FUNC_RANGE_IN(nccl_domain); 32 | struct ncclInfo info = { ncclFuncRecv, "Recv", 33 | NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 34 | 1, 1 }; 35 | info.algorithm = -1; 36 | ncclResult_t ret; 37 | NCCLCHECK(ncclGroupStart()); 38 | ret = ncclEnqueueCheck(&info); 39 | NCCLCHECK(ncclGroupEnd()); 40 | return ret; 41 | } 42 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ 8 | 9 | enum cudaError_t { cudaErrorStubLibrary = 34 }; 10 | 11 | extern "C" { 12 | 13 | cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); 14 | cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } 15 | 16 | cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); 17 | cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } 18 | 19 | cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); 20 | cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } 21 | 22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); 23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } 24 | 25 | cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); 26 | cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | #define MAXWIDTH 20 10 | #define PREFIXLEN 15 11 | #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) 12 | void dumpLine(int* values, int nranks, const char* prefix) { 13 | int prefixlen = strlen(prefix); 14 | char line[STRLENGTH+1]; 15 | line[STRLENGTH] = '\0'; 16 | memset(line, ' ', STRLENGTH); 17 | strncpy(line, prefix, PREFIXLEN); 18 | for (int i=0; i root ? rank-1 : rank) 10 | 11 | /* Btree which alternates leaves and nodes. 12 | * Assumes root is 0, which conveniently builds a tree on powers of two, 13 | * (because we have pow2-1 ranks) which lets us manipulate bits. 14 | * Find first non-zero bit, then : 15 | * Find the parent : 16 | * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) 17 | * xx11[0] -> xx10[0] (3,7,11 below) 18 | * Find the children : 19 | * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) 20 | * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) 21 | * 22 | * Illustration : 23 | * 0---------------8 24 | * ______/ \______ 25 | * 4 12 26 | * / \ / \ 27 | * 2 6 10 \ 28 | * / \ / \ / \ \ 29 | * 1 3 5 7 9 11 13 30 | */ 31 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { 32 | int up, down0, down1; 33 | int bit; 34 | for (bit=1; bit 0 so it has to be our child 1, not 0. 42 | *d1 = nranks > 1 ? bit >> 1 : -1; 43 | return ncclSuccess; 44 | } 45 | 46 | up = (rank ^ bit) | (bit << 1); 47 | // if smaller than the parent, we are his first child, otherwise we're his second 48 | if (up >= nranks) up = (rank ^ bit); 49 | *parentChildType = (rank < up) ? 0 : 1; 50 | *u = up; 51 | 52 | int lowbit = bit >> 1; 53 | // down0 is always within bounds 54 | down0 = lowbit == 0 ? -1 : rank-lowbit; 55 | 56 | down1 = lowbit == 0 ? 
-1 : rank+lowbit; 57 | // Make sure down1 is within bounds 58 | while (down1 >= nranks) { 59 | down1 = lowbit == 0 ? -1 : rank+lowbit; 60 | lowbit >>= 1; 61 | } 62 | *d0 = down0; *d1 = down1; 63 | 64 | return ncclSuccess; 65 | } 66 | 67 | /* Build a double binary tree. Take the previous tree for the first tree. 68 | * For the second tree, we use a mirror tree (if nranks is even) 69 | * 70 | * 0---------------8 3----------------11 71 | * ______/ \ / \______ 72 | * 4 \ / 7 73 | * / \ \ / / \ 74 | * 2 6 10 1 5 9 75 | * / \ / \ / \ / \ / \ / \ 76 | * 1 3 5 7 9 11 0 2 4 6 8 10 77 | * 78 | * or shift it by one rank (if nranks is odd). 79 | * 80 | * 0---------------8 1---------------9 81 | * ______/ \______ ______/ \______ 82 | * 4 12 5 0 83 | * / \ / / \ / 84 | * 2 6 10 3 7 11 85 | * / \ / \ / \ / \ / \ / \ 86 | * 1 3 5 7 9 11 2 4 6 8 10 12 87 | */ 88 | ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) { 89 | // First tree ... use a btree 90 | ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0); 91 | // Second tree ... mirror or shift 92 | if (nranks % 2 == 1) { 93 | // shift 94 | int shiftrank = (rank-1+nranks) % nranks; 95 | int u, d0, d1; 96 | ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1); 97 | *s1 = u == -1 ? -1 : (u+1) % nranks; 98 | *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; 99 | *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; 100 | } else { 101 | // mirror 102 | int u, d0, d1; 103 | ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1); 104 | *s1 = u == -1 ? -1 : nranks-1-u; 105 | *d1_0 = d0 == -1 ? -1 : nranks-1-d0; 106 | *d1_1 = d1 == -1 ? 
-1 : nranks-1-d1; 107 | } 108 | return ncclSuccess; 109 | } 110 | -------------------------------------------------------------------------------- /src/include/align.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALIGN_H_ 8 | #define NCCL_ALIGN_H_ 9 | 10 | #define DIVUP(x, y) \ 11 | (((x)+(y)-1)/(y)) 12 | 13 | #define ROUNDUP(x, y) \ 14 | (DIVUP((x), (y))*(y)) 15 | 16 | #define ALIGN_SIZE(size, align) \ 17 | size = ((size + (align) - 1) / (align)) * (align); 18 | 19 | #if !__CUDA_ARCH__ 20 | #ifndef __host__ 21 | #define __host__ 22 | #endif 23 | #ifndef __device__ 24 | #define __device__ 25 | #endif 26 | #endif 27 | 28 | template 29 | __host__ __device__ constexpr Z divUp(X x, Y y) { 30 | return (x+y-1)/y; 31 | } 32 | 33 | template 34 | __host__ __device__ constexpr Z roundUp(X x, Y y) { 35 | return (x+y-1) - (x+y-1)%y; 36 | } 37 | 38 | // assumes second argument is a power of 2 39 | template 40 | __host__ __device__ constexpr Z alignUp(X x, int a) { 41 | return (x+a-1) & Z(-a); 42 | } 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/include/alloc.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALLOC_H_ 8 | #define NCCL_ALLOC_H_ 9 | 10 | #include "nccl.h" 11 | #include "checks.h" 12 | #include "align.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | template 19 | static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { 20 | CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped)); 21 | memset(*ptr, 0, nelem*sizeof(T)); 22 | INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); 23 | return ncclSuccess; 24 | } 25 | #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) 26 | 27 | static inline ncclResult_t ncclCudaHostFree(void* ptr) { 28 | CUDACHECK(cudaFreeHost(ptr)); 29 | return ncclSuccess; 30 | } 31 | 32 | template 33 | static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { 34 | void* p = malloc(nelem*sizeof(T)); 35 | if (p == NULL) { 36 | WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); 37 | return ncclSystemError; 38 | } 39 | //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); 40 | memset(p, 0, nelem*sizeof(T)); 41 | *ptr = (T*)p; 42 | return ncclSuccess; 43 | } 44 | #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) 45 | 46 | template 47 | static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { 48 | if (nelem < oldNelem) return ncclInternalError; 49 | if (nelem == oldNelem) return ncclSuccess; 50 | 51 | T* oldp = *ptr; 52 | T* p = (T*)malloc(nelem*sizeof(T)); 53 | if (p == NULL) { 54 | WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); 55 | return ncclSystemError; 56 | } 57 | memcpy(p, oldp, oldNelem*sizeof(T)); 58 | free(oldp); 59 | memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); 60 | *ptr = (T*)p; 61 | INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); 62 | return ncclSuccess; 63 | } 64 | 65 | template 66 | static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { 67 | // Need async stream for P2P pre-connect + CUDA Graph 68 | cudaStream_t stream; 69 | CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 70 | CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); 71 | CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); 72 | CUDACHECK(cudaStreamSynchronize(stream)); 73 | CUDACHECK(cudaStreamDestroy(stream)); 74 | INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); 75 | return ncclSuccess; 76 | } 77 | #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) 78 | 79 | template 80 | static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { 81 | CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); 82 | return ncclSuccess; 83 | } 84 | 85 | // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be 86 | // allocated on separate pages as those pages will be marked DONTFORK 87 | // and if they are shared, that could cause a crash in a child process 88 | static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { 89 | size_t page_size = sysconf(_SC_PAGESIZE); 90 | void* p; 91 | int size_aligned = ROUNDUP(size, page_size); 92 | int ret = posix_memalign(&p, page_size, size_aligned); 93 | if (ret != 0) return ncclSystemError; 94 | memset(p, 0, size); 95 | *ptr = p; 96 | INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); 97 | return ncclSuccess; 98 | } 99 | #define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /src/include/argcheck.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ARGCHECK_H_ 8 | #define NCCL_ARGCHECK_H_ 9 | 10 | #include "core.h" 11 | #include "info.h" 12 | 13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); 14 | ncclResult_t ArgsCheck(struct ncclInfo* info); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/include/bootstrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_BOOTSTRAP_H_ 8 | #define NCCL_BOOTSTRAP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | ncclResult_t bootstrapNetInit(); 14 | ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); 15 | ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); 16 | ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm); 17 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); 18 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); 19 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); 20 | ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); 21 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); 22 | ncclResult_t bootstrapClose(void* commState); 23 | ncclResult_t bootstrapAbort(void* commState); 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/channel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHANNEL_H_ 8 | #define NCCL_CHANNEL_H_ 9 | #include "comm.h" 10 | 11 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 12 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); 13 | static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { 14 | int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; 15 | int peerNode = comm->rankToNode[peer]; 16 | int peerIndex = comm->rankToLocalRank[peer]; 17 | int nsteps = comm->maxLocalRanks; 18 | int rankIndex = comm->rankToLocalRank[comm->rank]; 19 | int step, delta; 20 | if (coll == ncclFuncSend) { 21 | step = (nsteps + peerIndex - rankIndex)%nsteps; 22 | delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; 23 | } else if (coll == ncclFuncRecv) { 24 | step = (nsteps + rankIndex - peerIndex)%nsteps; 25 | delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; 26 | } else { 27 | return ncclInternalError; 28 | } 29 | *channelBase = comm->nNodes > 1 ? 
delta+(step/p2pGroupSize) : step; 30 | return ncclSuccess; 31 | } 32 | 33 | static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { 34 | *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; 35 | return ncclSuccess; 36 | } 37 | 38 | static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { 39 | int base; 40 | NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); 41 | NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); 42 | return ncclSuccess; 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/include/checks.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHECKS_H_ 8 | #define NCCL_CHECKS_H_ 9 | 10 | #include "debug.h" 11 | 12 | // Check CUDA calls 13 | #define CUDACHECK(cmd) do { \ 14 | cudaError_t err = cmd; \ 15 | if( err != cudaSuccess ) { \ 16 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ 17 | return ncclUnhandledCudaError; \ 18 | } \ 19 | } while(false) 20 | 21 | #define CUDACHECKGOTO(cmd, res, label) do { \ 22 | cudaError_t err = cmd; \ 23 | if( err != cudaSuccess ) { \ 24 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ 25 | res = ncclUnhandledCudaError; \ 26 | goto label; \ 27 | } \ 28 | } while(false) 29 | 30 | // Report failure but clear error and continue 31 | #define CUDACHECKIGNORE(cmd) do { \ 32 | cudaError_t err = cmd; \ 33 | if( err != cudaSuccess ) { \ 34 | INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ 35 | (void) cudaGetLastError(); \ 36 
| } \ 37 | } while(false) 38 | 39 | #include 40 | // Check system calls 41 | #define SYSCHECK(call, name) do { \ 42 | int retval; \ 43 | SYSCHECKVAL(call, name, retval); \ 44 | } while (false) 45 | 46 | #define SYSCHECKVAL(call, name, retval) do { \ 47 | SYSCHECKSYNC(call, name, retval); \ 48 | if (retval == -1) { \ 49 | WARN("Call to " name " failed : %s", strerror(errno)); \ 50 | return ncclSystemError; \ 51 | } \ 52 | } while (false) 53 | 54 | #define SYSCHECKSYNC(call, name, retval) do { \ 55 | retval = call; \ 56 | if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ 57 | INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ 58 | } else { \ 59 | break; \ 60 | } \ 61 | } while(true) 62 | 63 | #define SYSCHECKGOTO(statement, res, label) do { \ 64 | if ((statement) == -1) { \ 65 | /* Print the back trace*/ \ 66 | res = ncclSystemError; \ 67 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 68 | goto label; \ 69 | } \ 70 | } while (0); 71 | 72 | #define NEQCHECK(statement, value) do { \ 73 | if ((statement) != value) { \ 74 | /* Print the back trace*/ \ 75 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ 76 | return ncclSystemError; \ 77 | } \ 78 | } while (0); 79 | 80 | #define NEQCHECKGOTO(statement, value, res, label) do { \ 81 | if ((statement) != value) { \ 82 | /* Print the back trace*/ \ 83 | res = ncclSystemError; \ 84 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 85 | goto label; \ 86 | } \ 87 | } while (0); 88 | 89 | #define EQCHECK(statement, value) do { \ 90 | if ((statement) == value) { \ 91 | /* Print the back trace*/ \ 92 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ 93 | return ncclSystemError; \ 94 | } \ 95 | } while (0); 96 | 97 | #define EQCHECKGOTO(statement, value, res, label) do { \ 98 | if ((statement) == value) { \ 99 | /* Print the back trace*/ \ 100 | res = ncclSystemError; \ 101 | INFO(NCCL_ALL,"%s:%d -> %d", 
__FILE__, __LINE__, res); \ 102 | goto label; \ 103 | } \ 104 | } while (0); 105 | 106 | // Propagate errors up 107 | #define NCCLCHECK(call) do { \ 108 | ncclResult_t res = call; \ 109 | if (res != ncclSuccess) { \ 110 | /* Print the back trace*/ \ 111 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 112 | return res; \ 113 | } \ 114 | } while (0); 115 | 116 | #define NCCLCHECKGOTO(call, res, label) do { \ 117 | res = call; \ 118 | if (res != ncclSuccess) { \ 119 | /* Print the back trace*/ \ 120 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 121 | goto label; \ 122 | } \ 123 | } while (0); 124 | 125 | #define NCCLWAIT(call, cond, abortFlagPtr) do { \ 126 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ 127 | ncclResult_t res = call; \ 128 | if (res != ncclSuccess) { \ 129 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 130 | return ncclInternalError; \ 131 | } \ 132 | if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ 133 | } while (!(cond)); 134 | 135 | #define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ 136 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ 137 | res = call; \ 138 | if (res != ncclSuccess) { \ 139 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 140 | goto label; \ 141 | } \ 142 | if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ 143 | } while (!(cond)); 144 | 145 | #define NCCLCHECKTHREAD(a) do { \ 146 | if ((args->ret = (a)) != ncclSuccess) { \ 147 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ 148 | return args; \ 149 | } \ 150 | } while(0) 151 | 152 | #define CUDACHECKTHREAD(a) do { \ 153 | if ((a) != cudaSuccess) { \ 154 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ 155 | args->ret = ncclUnhandledCudaError; \ 156 | return args; \ 157 | } \ 158 | } while(0) 159 | 160 | #endif 161 | 
-------------------------------------------------------------------------------- /src/include/coll_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COLL_NET_H_ 8 | #define COLL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | 13 | extern ncclCollNet_t* ncclCollNet; 14 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 15 | 16 | // Translation to external API 17 | static const char* collNetName() { return ncclCollNet->name; } 18 | static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; } 19 | static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; } 20 | static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } 21 | static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } 22 | static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } 23 | static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } 24 | static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; } 25 | static ncclResult_t 
collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { 26 | NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } 27 | static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } 28 | static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; } 29 | static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } 30 | static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } 31 | 32 | static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; } 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/include/collectives.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_COLLECTIVES_H_ 8 | #define NCCL_COLLECTIVES_H_ 9 | #include 10 | 11 | enum ncclDevRedOp_t { 12 | ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin, 13 | ncclDevPreMulSum, ncclDevSumPostDiv, 14 | ncclNumDevRedOps 15 | }; 16 | struct ncclDevRedOpFull { 17 | ncclDevRedOp_t op; 18 | bool scalarArgIsPtr; 19 | uint64_t scalarArg; 20 | }; 21 | 22 | #define FUNC_INDEX_P2P 0 23 | #define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) 24 | 25 | #define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \ 26 | ncclFunction_##func##_##algo##_##proto##_##devredop##_##type 27 | 28 | #define NCCL_ONERANK_REDUCE_NAME(devredop, type) \ 29 | ncclFunction_OneRankReduce_##devredop##_##type 30 | 31 | #define NCCL_KERN_NAME(func, algo, proto, devredop, type) \ 32 | ncclKernel_##func##_##algo##_##proto##_##devredop##_##type 33 | 34 | #define NCCL_IMPL_NAME(func, algo, proto) \ 35 | nccl##func##algo##proto 36 | 37 | /* Declare all collective operations */ 38 | #define DECL5(func, algo, proto, devredop, type) \ 39 | extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ 40 | extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \ 41 | 42 | #define CONCAT(a,b) a##b 43 | #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) 44 | #define MACRO_IF_0(t, f) f 45 | #define MACRO_IF_1(t, f) t 46 | 47 | #define DECL4(func, algo, devredop, type, undef) \ 48 | MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \ 49 | MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \ 50 | MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type)) 51 | 52 | #define DECL3(func, devredop, 
type, undef) \ 53 | DECL4(func, RING, devredop, type, undef) \ 54 | DECL4(func, TREE, devredop, type, undef) \ 55 | DECL4(func, MSCCL, devredop, type, undef) \ 56 | DECL4(func, COLLNET, devredop, type, undef) 57 | 58 | #if defined(__CUDA_BF16_TYPES_EXIST__) 59 | #define DECL2(func, devredop, undefForFloat) \ 60 | DECL3(func, devredop, int8_t, /*undef=*/0) \ 61 | DECL3(func, devredop, uint8_t, /*undef=*/0) \ 62 | DECL3(func, devredop, int32_t, /*undef=*/0) \ 63 | DECL3(func, devredop, uint32_t, /*undef=*/0) \ 64 | DECL3(func, devredop, int64_t, /*undef=*/0) \ 65 | DECL3(func, devredop, uint64_t, /*undef=*/0) \ 66 | DECL3(func, devredop, half, /*undef=*/undefForFloat) \ 67 | DECL3(func, devredop, float, /*undef=*/undefForFloat) \ 68 | DECL3(func, devredop, double, /*undef=*/undefForFloat) \ 69 | DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat) 70 | #else 71 | #define DECL2(func, devredop, undefForFloat) \ 72 | DECL3(func, devredop, int8_t, /*undef=*/0) \ 73 | DECL3(func, devredop, uint8_t, /*undef=*/0) \ 74 | DECL3(func, devredop, int32_t, /*undef=*/0) \ 75 | DECL3(func, devredop, uint32_t, /*undef=*/0) \ 76 | DECL3(func, devredop, int64_t, /*undef=*/0) \ 77 | DECL3(func, devredop, uint64_t, /*undef=*/0) \ 78 | DECL3(func, devredop, half, /*undef=*/undefForFloat) \ 79 | DECL3(func, devredop, float, /*undef=*/undefForFloat) \ 80 | DECL3(func, devredop, double, /*undef=*/undefForFloat) 81 | #endif 82 | 83 | #define DECL(func) \ 84 | DECL2(func, Sum, /*undefForFloat=*/0) \ 85 | DECL2(func, Prod, /*undefForFloat=*/0) \ 86 | DECL2(func, Min, /*undefForFloat=*/0) \ 87 | DECL2(func, Max, /*undefForFloat=*/0) \ 88 | DECL2(func, PreMulSum, /*undefForFloat=*/0) \ 89 | DECL2(func, SumPostDiv, /*undefForFloat=*/1) 90 | 91 | DECL2(Broadcast, Sum, /*undefForFloat=*/0) 92 | DECL(Reduce) 93 | DECL2(AllGather, Sum, /*undefForFloat=*/0) 94 | DECL(ReduceScatter) 95 | DECL(AllReduce) 96 | DECL2(AllToAll, Sum, /*undefForFloat=*/0) 97 | DECL(CustomCollective) 98 | 
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) 99 | 100 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(); 101 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(); 102 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(); 103 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(); 104 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(); 105 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(); 106 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(); 107 | #if defined(__CUDA_BF16_TYPES_EXIST__) 108 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)(); 109 | #endif 110 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(); 111 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); 112 | 113 | // CHUNKSIZE must be a multiple of SLICESIZE 114 | #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) 115 | #define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) 116 | #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) 117 | #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) 118 | #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) 119 | #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) 120 | #define BROADCAST_SLICESTEPS 1 121 | #define BROADCAST_CHUNKSTEPS 1 122 | #define REDUCE_SLICESTEPS 1 123 | #define REDUCE_CHUNKSTEPS 1 124 | #define SENDRECV_SLICEFACTOR 4 125 | #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CORE_H_ 8 | #define NCCL_CORE_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // For std::min/std::max 15 | #include "nccl.h" 16 | 17 | #ifdef PROFAPI 18 | #define NCCL_API(ret, func, args...) \ 19 | __attribute__ ((visibility("default"))) \ 20 | __attribute__ ((alias(#func))) \ 21 | ret p##func (args); \ 22 | extern "C" \ 23 | __attribute__ ((visibility("default"))) \ 24 | __attribute__ ((weak)) \ 25 | ret func(args) 26 | #else 27 | #define NCCL_API(ret, func, args...) \ 28 | extern "C" \ 29 | __attribute__ ((visibility("default"))) \ 30 | ret func(args) 31 | #endif // end PROFAPI 32 | 33 | static __inline__ int ncclTypeSize(ncclDataType_t type) { 34 | switch (type) { 35 | case ncclInt8: 36 | case ncclUint8: 37 | return 1; 38 | case ncclFloat16: 39 | #if defined(__CUDA_BF16_TYPES_EXIST__) 40 | case ncclBfloat16: 41 | #endif 42 | return 2; 43 | case ncclInt32: 44 | case ncclUint32: 45 | case ncclFloat32: 46 | return 4; 47 | case ncclInt64: 48 | case ncclUint64: 49 | case ncclFloat64: 50 | return 8; 51 | default: 52 | return -1; 53 | } 54 | } 55 | 56 | #include "debug.h" 57 | #include "checks.h" 58 | #include "alloc.h" 59 | #include "utils.h" 60 | #include "param.h" 61 | #include "nvtx.h" 62 | 63 | #endif // end include guard 64 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t 11 | 12 | static int hexToInt(char c) { 13 | int v = c - '0'; 14 | if (v < 0) return -1; 15 | if (v > 9) v = 10 + c - 'a'; 16 | if ((v < 0) || (v > 15)) return -1; 17 | return v; 18 | } 19 | 20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) 21 | 22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { 23 | uint32_t cpumasks[CPU_SET_N_U32]; 24 | int m = CPU_SET_N_U32-1; 25 | cpumasks[m] = 0; 26 | for (int o=0; o=0; o--) { 49 | if (c == 0 && m8[o] == 0) continue; 50 | sprintf(str+c, "%02x", m8[o]); 51 | c+=2; 52 | if (o && o%4 == 0) { 53 | sprintf(str+c, ","); 54 | c++; 55 | } 56 | } 57 | str[c] = '\0'; 58 | return ncclSuccess; 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | 10 | #include "nccl_net.h" 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | // Conform to pthread and NVTX standard 20 | #define NCCL_THREAD_NAMELEN 16 21 | 22 | extern int ncclDebugLevel; 23 | extern uint64_t ncclDebugMask; 24 | extern pthread_mutex_t ncclDebugOutputLock; 25 | extern FILE *ncclDebugFile; 26 | extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); 27 | 28 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 29 | 30 | // Let code temporarily downgrade WARN into INFO 31 | extern thread_local int ncclDebugNoWarn; 32 | 33 | #define WARN(...) 
ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 34 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 35 | 36 | #ifdef ENABLE_TRACE 37 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 38 | extern std::chrono::high_resolution_clock::time_point ncclEpoch; 39 | #else 40 | #define TRACE(...) 41 | #endif 42 | 43 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | 14 | #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) 15 | #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ 16 | 17 | size_t ncclKernMaxLocalSize(); 18 | ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut); 19 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 20 | ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); 21 | ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); 22 | ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm); 23 | ncclResult_t ncclLaunchBarrier(struct ncclComm* comm); 24 | ncclResult_t ncclLaunchKernel(ncclComm_t comm); 25 | ncclResult_t ncclRecordEvents(struct ncclComm* comm); 26 | ncclResult_t ncclLaunchReset(ncclComm_t comm); 27 | ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info); 28 | ncclResult_t ncclSetupAsyncKernels(struct 
ncclComm* comm); 29 | template 30 | void CUDART_CB ncclEnqueueHostSetup(void* arg); 31 | ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); 32 | ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); 33 | 34 | struct ncclBuffRegInfo { 35 | void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; 36 | void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; 37 | void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; 38 | void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; 39 | int nBuffs; 40 | }; 41 | 42 | // Enqueue information (for kernel and proxy) for each operation 43 | struct ncclQueueElem { 44 | struct ncclWork work; 45 | struct ncclProxyOp proxyOp; 46 | struct ncclBuffRegInfo buffRegInfo; 47 | }; 48 | 49 | typedef ncclRecyclableList ncclQueueElemList; 50 | 51 | // Structure passed to CUDA graph 52 | struct ncclQueueInfo { 53 | ncclComm_t comm; 54 | int maxChannels; // Dynamic version of gridDim 55 | ncclResult_t ret; // Return value of host setup call 56 | int nRegBuffs; 57 | ncclQueueElemList* elemList; 58 | }; 59 | 60 | static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) { 61 | NCCLCHECK(ncclCalloc(eqInfo, 1)); 62 | (*eqInfo)->comm = comm; 63 | (*eqInfo)->elemList = new ncclQueueElemList(); 64 | (*eqInfo)->comm->nQueueInfoCreated++; 65 | return ncclSuccess; 66 | } 67 | 68 | // Reset element queue 69 | static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { 70 | if (eqInfo == NULL) return ncclInternalError; 71 | eqInfo->maxChannels = 0; 72 | eqInfo->ret = ncclSuccess; 73 | eqInfo->nRegBuffs = 0; 74 | eqInfo->elemList->recycle(); 75 | return ncclSuccess; 76 | } 77 | 78 | // Destroy enqueue info space 79 | // used by both CUDA graph and non CUDA graph 80 | static void ncclDestroyQueueInfo(void* ptr) { 81 | if (ptr == NULL) return; 82 | struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr; 83 | struct ncclComm* comm = eqInfo->comm; 84 | // Close IPC mem handles for registered buffers 85 | struct ncclQueueElem* eqElem = 
eqInfo->elemList->begin(); 86 | #if 0 87 | // Ideally, the deregistration should happen here 88 | // but currently the destroy function of CUDA objects does not allow CUDA API calls 89 | while (eqElem != NULL) { 90 | for (int i=0; ibuffRegInfo.nBuffs; i++) { 91 | if (i == eqInfo->comm->localRank) continue; 92 | CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); 93 | CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); 94 | } 95 | eqElem = eqInfo->elemList->getNext(); 96 | } 97 | #else 98 | // Instead, we push these pointers to a pool owned by ncclComm 99 | // and asks a helper thread to close mem handles 100 | struct ncclGraphHelperResources* res = comm->graphHelperResources; 101 | int ipcTailOld = 0; 102 | if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip; 103 | 104 | pthread_mutex_lock(&res->threadLock); 105 | ipcTailOld = res->ipcTail; 106 | while (eqElem != NULL) { 107 | for (int i=0; ibuffRegInfo.nBuffs; i++) { 108 | if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) { 109 | res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i]; 110 | res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; 111 | } 112 | if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) { 113 | res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i]; 114 | res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; 115 | } 116 | } 117 | eqElem = eqInfo->elemList->getNext(); 118 | } 119 | if (res->ipcTail != ipcTailOld) { 120 | res->threadState = ThreadStart; 121 | TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld); 122 | pthread_cond_signal(&res->threadCond); 123 | } 124 | pthread_mutex_unlock(&res->threadLock); 125 | #endif 126 | 127 | skip: 128 | delete eqInfo->elemList; 129 | free(eqInfo); 130 | comm->nQueueInfoDestroyed++; 131 | return; 132 | } 133 | #endif // End include guard 134 | 
-------------------------------------------------------------------------------- /src/include/graph.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_GRAPH_H_ 8 | #define NCCL_GRAPH_H_ 9 | 10 | #include "nccl.h" 11 | #include "devcomm.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); 19 | 20 | struct ncclTopoSystem; 21 | // Build the topology 22 | ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); 23 | ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); 24 | ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); 25 | 26 | ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); 27 | void ncclTopoFree(struct ncclTopoSystem* system); 28 | ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); 29 | ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); 30 | ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); 31 | 32 | // Query topology 33 | ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); 34 | ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); 35 | ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); 36 | int ncclPxnDisable(); 37 | ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); 38 | ncclResult_t 
ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); 39 | 40 | // Find CPU affinity 41 | ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); 42 | 43 | #define NCCL_TOPO_CPU_ARCH_X86 1 44 | #define NCCL_TOPO_CPU_ARCH_POWER 2 45 | #define NCCL_TOPO_CPU_ARCH_ARM 3 46 | #define NCCL_TOPO_CPU_VENDOR_INTEL 1 47 | #define NCCL_TOPO_CPU_VENDOR_AMD 2 48 | #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 49 | #define NCCL_TOPO_CPU_TYPE_BDW 1 50 | #define NCCL_TOPO_CPU_TYPE_SKL 2 51 | #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 52 | ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); 53 | ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); 54 | ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); 55 | 56 | #define NCCL_TOPO_MAX_NODES 256 57 | 58 | // Init search. Needs to be done before calling ncclTopoCompute 59 | ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); 60 | 61 | #define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) 62 | #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) 63 | #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU 64 | #define NCCL_TOPO_PATTERN_RING 4 // Ring 65 | struct ncclTopoGraph { 66 | // Input / output 67 | int id; // ring : 0, tree : 1, collnet : 2 68 | int pattern; 69 | int crossNic; 70 | int collNet; 71 | int minChannels; 72 | int maxChannels; 73 | // Output 74 | int nChannels; 75 | float speedIntra; 76 | float speedInter; 77 | float latencyInter; 78 | int typeIntra; 79 | int typeInter; 80 | int sameChannels; 81 | int nHops; 82 | int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; 83 | int inter[MAXCHANNELS*2]; 84 | }; 85 | ncclResult_t ncclTopoCompute(struct ncclTopoSystem* 
system, struct ncclTopoGraph* graph); 86 | 87 | ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); 88 | ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); 89 | 90 | struct ncclTopoRanks { 91 | int ringRecv[MAXCHANNELS]; 92 | int ringSend[MAXCHANNELS]; 93 | int ringPrev[MAXCHANNELS]; 94 | int ringNext[MAXCHANNELS]; 95 | int treeToParent[MAXCHANNELS]; 96 | int treeToChild0[MAXCHANNELS]; 97 | int treeToChild1[MAXCHANNELS]; 98 | }; 99 | 100 | ncclResult_t ncclTopoPreset(struct ncclComm* comm, 101 | struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, 102 | struct ncclTopoRanks* topoRanks); 103 | 104 | ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, 105 | struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph); 106 | 107 | ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); 108 | #include "info.h" 109 | ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time); 110 | ncclResult_t ncclTopoGetMSCCLAlgo(struct ncclInfo* info); 111 | 112 | int ncclMaxNchannels(); 113 | 114 | #endif 115 | -------------------------------------------------------------------------------- /src/include/group.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_GROUP_H_ 8 | #define NCCL_GROUP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | bool ncclAsyncMode(); 14 | ncclResult_t ncclAsyncErrCheck(ncclResult_t ret); 15 | 16 | typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); 17 | 18 | ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); 19 | 20 | typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count, 21 | ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); 22 | 23 | ncclResult_t ncclAsyncColl(ncclComm_t comm); 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INFO_H_ 8 | #define NCCL_INFO_H_ 9 | 10 | #include "nccl.h" 11 | #include "devcomm.h" 12 | #include "collectives.h" 13 | 14 | typedef enum : uint8_t { 15 | ncclPatternRing, 16 | ncclPatternRingTwice, 17 | ncclPatternPipelineFrom, 18 | ncclPatternPipelineTo, 19 | ncclPatternTreeUp, 20 | ncclPatternTreeDown, 21 | ncclPatternTreeUpDown, 22 | ncclPatternMSCCL, 23 | ncclPatternCollTreeUpDown, 24 | ncclPatternSend, 25 | ncclPatternRecv 26 | } ncclPattern_t; 27 | 28 | // Used to pass NCCL call information between functions 29 | struct ncclInfo { 30 | ncclFunc_t coll; 31 | const char* opName; 32 | // NCCL Coll Args 33 | const void* sendbuff; 34 | void* recvbuff; 35 | size_t count; 36 | ncclDataType_t datatype; 37 | ncclRedOp_t op; 38 | int root; // peer for p2p operations 39 | ncclComm_t comm; 40 | cudaStream_t stream; 41 | // Algorithm details 42 | int chunkSteps; 43 | int sliceSteps; 44 | // Computed later 45 | ncclDevRedOpFull opFull; 46 | int algorithm; 47 | int protocol; 48 | ncclPattern_t pattern; 49 | int nChannels; 50 | int nThreads; 51 | size_t nBytes; 52 | int nstepsPerLoop; 53 | int nchunksPerLoop; 54 | int chunkSize; 55 | int channelId; 56 | 57 | struct mscclWorkInfo mscclInfo; 58 | }; 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "checks.h" 13 | 14 | extern ncclNet_t* ncclNet; 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetInit(); 18 | int ncclNetVersion(); 19 | 20 | // Translation to external API 21 | static const char* ncclNetName() { return ncclNet->name; } 22 | static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } 23 | static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; } 24 | static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } 25 | static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } 26 | static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } 27 | static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } 28 | static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } 29 | static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } 30 | static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, 
request)); return ncclSuccess; } 31 | static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } 32 | static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } 33 | static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } 34 | static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } 35 | static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } 36 | 37 | // Test whether the current GPU support GPU Direct RDMA. 38 | ncclResult_t ncclGpuGdrSupport(int* gdrSupport); 39 | 40 | extern ncclNet_t ncclNetIb; 41 | extern ncclNet_t ncclNetSocket; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_event.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_EVENT_H_ 2 | #define NPKIT_EVENT_H_ 3 | 4 | #define NPKIT_EVENT_INVALID 0x0 5 | 6 | #define NPKIT_EVENT_SEND_ENTRY 0x1 7 | #define NPKIT_EVENT_SEND_EXIT 0x2 8 | #define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY 0x3 9 | #define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT 0x4 10 | #define NPKIT_EVENT_DIRECT_SEND_ENTRY 0x5 11 | #define NPKIT_EVENT_DIRECT_SEND_EXIT 0x6 12 | #define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY 0x7 13 | #define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT 0x8 14 | 15 | #define NPKIT_EVENT_RECV_ENTRY 0x9 16 | #define NPKIT_EVENT_RECV_EXIT 0xA 17 | #define NPKIT_EVENT_DIRECT_RECV_ENTRY 0xB 18 | #define NPKIT_EVENT_DIRECT_RECV_EXIT 0xC 19 | 20 | #define NPKIT_EVENT_REDUCE_ENTRY 0xD 21 | #define NPKIT_EVENT_REDUCE_EXIT 0xE 22 | 23 | #define NPKIT_EVENT_LOCAL_COPY_ENTRY 0xF 24 | 
#define NPKIT_EVENT_LOCAL_COPY_EXIT 0x10 25 | 26 | #define NPKIT_EVENT_COPY_SEND_ENTRY 0x11 27 | #define NPKIT_EVENT_COPY_SEND_EXIT 0x12 28 | #define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY 0x13 29 | #define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT 0x14 30 | 31 | #define NPKIT_EVENT_RECV_COPY_SEND_ENTRY 0x15 32 | #define NPKIT_EVENT_RECV_COPY_SEND_EXIT 0x16 33 | #define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY 0x17 34 | #define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT 0x18 35 | #define NPKIT_EVENT_RECV_COPY_DIRECT_SEND_ENTRY 0x19 36 | #define NPKIT_EVENT_RECV_COPY_DIRECT_SEND_EXIT 0x1A 37 | 38 | #define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 0x1B 39 | #define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT 0x1C 40 | 41 | #define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY 0x1D 42 | #define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT 0x1E 43 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_SEND_ENTRY 0x1F 44 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_SEND_EXIT 0x20 45 | 46 | #define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY 0x21 47 | #define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT 0x22 48 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x23 49 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x24 50 | 51 | #define NPKIT_EVENT_NET_SEND_ENTRY 0x25 52 | #define NPKIT_EVENT_NET_SEND_EXIT 0x26 53 | #define NPKIT_EVENT_NET_RECV_ENTRY 0x27 54 | #define NPKIT_EVENT_NET_RECV_EXIT 0x28 55 | 56 | #define NPKIT_EVENT_DEP_CHECK_ENTRY 0x29 57 | #define NPKIT_EVENT_DEP_CHECK_EXIT 0x2A 58 | 59 | #define NPKIT_EVENT_TIME_SYNC_GPU 0x2B 60 | #define NPKIT_EVENT_TIME_SYNC_CPU 0x2C 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_struct.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_STRUCT_H_ 2 | #define NPKIT_STRUCT_H_ 3 | 4 | #include 5 | 6 | #pragma pack(push, 1) 7 | 8 | union NpKitEvent { 9 | uint64_t bits[2]; 10 | struct { 11 | uint64_t type : 8; 12 | uint64_t size : 32; 13 | uint64_t rsvd : 24; 14 
| uint64_t timestamp; 15 | } fields; 16 | }; 17 | 18 | struct NpKitEventCollectContext { 19 | NpKitEvent* event_buffer; 20 | uint64_t event_buffer_head; 21 | }; 22 | 23 | #pragma pack(pop) 24 | 25 | #if defined(ENABLE_NPKIT) 26 | 27 | #define NPKIT_GPU_COMM_DECL_FIELDS \ 28 | NpKitEventCollectContext* npKitEventCollectContexts; \ 29 | uint64_t* npKitCpuTimestamp; 30 | 31 | #else 32 | 33 | #define NPKIT_GPU_COMM_DECL_FIELDS 34 | 35 | #endif 36 | 37 | #if defined(ENABLE_NPKIT) 38 | 39 | #define NPKIT_CPU_PROXY_DECL_FIELDS \ 40 | int npKitSizesFifo[NCCL_STEPS]; 41 | 42 | #else 43 | 44 | #define NPKIT_CPU_PROXY_DECL_FIELDS 45 | 46 | #endif 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/include/nvtx.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NVTX_H_ 8 | #define NCCL_NVTX_H_ 9 | 10 | #include "nvtx3.hpp" 11 | 12 | struct nccl_domain{static constexpr char const* name{"NCCL"};}; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCuda.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | 13 | #ifndef NVTOOLSEXT_CUDA_V3 14 | #define NVTOOLSEXT_CUDA_V3 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif /* __cplusplus */ 19 | 20 | /* ========================================================================= */ 21 | /** \name Functions for CUDA Resource Naming 22 | */ 23 | /** \addtogroup RESOURCE_NAMING 24 | * \section RESOURCE_NAMING_CUDA CUDA Resource Naming 25 | * 26 | * This section covers the API functions that allow to annotate CUDA resources 27 | * with user-provided names. 28 | * 29 | * @{ 30 | */ 31 | 32 | /* ------------------------------------------------------------------------- */ 33 | /* \cond SHOW_HIDDEN 34 | * \brief Used to build a non-colliding value for resource types separated class 35 | * \version \NVTX_VERSION_2 36 | */ 37 | #define NVTX_RESOURCE_CLASS_CUDA 4 38 | /** \endcond */ 39 | 40 | /* ------------------------------------------------------------------------- */ 41 | /** \brief Resource types for CUDA 42 | */ 43 | typedef enum nvtxResourceCUDAType_t 44 | { 45 | NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */ 46 | NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */ 47 | NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */ 48 | NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */ 49 | } nvtxResourceCUDAType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The handle of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 
59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA context. 68 | * 69 | * Allows the user to associate a CUDA context with a user-provided name. 70 | * 71 | * \param context - The handle of the CUDA context to name. 72 | * \param name - The name of the CUDA context. 73 | * 74 | * \par Example: 75 | * \code 76 | * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice ); 77 | * if ( CUDA_SUCCESS != status ) 78 | * goto Error; 79 | * nvtxNameCuContext(cuContext, "CTX_NAME"); 80 | * \endcode 81 | * 82 | * \version \NVTX_VERSION_1 83 | * @{ */ 84 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name); 85 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name); 86 | /** @} */ 87 | 88 | /* ------------------------------------------------------------------------- */ 89 | /** \brief Annotates a CUDA stream. 90 | * 91 | * Allows the user to associate a CUDA stream with a user-provided name. 92 | * 93 | * \param stream - The handle of the CUDA stream to name. 94 | * \param name - The name of the CUDA stream. 95 | * 96 | * \version \NVTX_VERSION_1 97 | * @{ */ 98 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name); 99 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name); 100 | /** @} */ 101 | 102 | /* ------------------------------------------------------------------------- */ 103 | /** \brief Annotates a CUDA event. 104 | * 105 | * Allows the user to associate a CUDA event with a user-provided name. 106 | * 107 | * \param event - The handle of the CUDA event to name. 108 | * \param name - The name of the CUDA event. 
109 | * 110 | * \version \NVTX_VERSION_1 111 | * @{ */ 112 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name); 113 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name); 114 | /** @} */ 115 | 116 | /** @} */ /* END RESOURCE_NAMING */ 117 | 118 | /* ========================================================================= */ 119 | #ifdef UNICODE 120 | #define nvtxNameCuDevice nvtxNameCuDeviceW 121 | #define nvtxNameCuContext nvtxNameCuContextW 122 | #define nvtxNameCuStream nvtxNameCuStreamW 123 | #define nvtxNameCuEvent nvtxNameCuEventW 124 | #else 125 | #define nvtxNameCuDevice nvtxNameCuDeviceA 126 | #define nvtxNameCuContext nvtxNameCuContextA 127 | #define nvtxNameCuStream nvtxNameCuStreamA 128 | #define nvtxNameCuEvent nvtxNameCuEventA 129 | #endif 130 | 131 | #ifdef __cplusplus 132 | } 133 | #endif /* __cplusplus */ 134 | 135 | #ifndef NVTX_NO_IMPL 136 | #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */ 137 | #include "nvtxDetail/nvtxImplCuda_v3.h" 138 | #undef NVTX_IMPL_GUARD_CUDA 139 | #endif /*NVTX_NO_IMPL*/ 140 | 141 | #endif /* NVTOOLSEXT_CUDA_V3 */ 142 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCudaRt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | #include "driver_types.h" 13 | 14 | #ifndef NVTOOLSEXT_CUDART_V3 15 | #define NVTOOLSEXT_CUDART_V3 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | /* ========================================================================= */ 22 | /** \name Functions for CUDA Resource Naming 23 | */ 24 | /** \addtogroup RESOURCE_NAMING 25 | * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming 26 | * 27 | * This section covers the API functions that allow to annotate CUDA resources 28 | * with user-provided names. 29 | * 30 | * @{ 31 | */ 32 | 33 | /* ------------------------------------------------------------------------- */ 34 | /* \cond SHOW_HIDDEN 35 | * \brief Used to build a non-colliding value for resource types separated class 36 | * \version \NVTX_VERSION_2 37 | */ 38 | #define NVTX_RESOURCE_CLASS_CUDART 5 39 | /** \endcond */ 40 | 41 | /* ------------------------------------------------------------------------- */ 42 | /** \brief Resource types for CUDART 43 | */ 44 | typedef enum nvtxResourceCUDARTType_t 45 | { 46 | NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ 47 | NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ 48 | NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ 49 | } nvtxResourceCUDARTType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The id of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 
59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA stream. 68 | * 69 | * Allows the user to associate a CUDA stream with a user-provided name. 70 | * 71 | * \param stream - The handle of the CUDA stream to name. 72 | * \param name - The name of the CUDA stream. 73 | * 74 | * \version \NVTX_VERSION_1 75 | * @{ */ 76 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); 77 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); 78 | /** @} */ 79 | 80 | /* ------------------------------------------------------------------------- */ 81 | /** \brief Annotates a CUDA event. 82 | * 83 | * Allows the user to associate a CUDA event with a user-provided name. 84 | * 85 | * \param event - The handle of the CUDA event to name. 86 | * \param name - The name of the CUDA event. 
87 | * 88 | * \version \NVTX_VERSION_1 89 | * @{ */ 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); 91 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); 92 | /** @} */ 93 | 94 | /** @} */ /* END RESOURCE_NAMING */ 95 | 96 | /* ========================================================================= */ 97 | #ifdef UNICODE 98 | #define nvtxNameCudaDevice nvtxNameCudaDeviceW 99 | #define nvtxNameCudaStream nvtxNameCudaStreamW 100 | #define nvtxNameCudaEvent nvtxNameCudaEventW 101 | #else 102 | #define nvtxNameCudaDevice nvtxNameCudaDeviceA 103 | #define nvtxNameCudaStream nvtxNameCudaStreamA 104 | #define nvtxNameCudaEvent nvtxNameCudaEventA 105 | #endif 106 | 107 | #ifdef __cplusplus 108 | } 109 | #endif /* __cplusplus */ 110 | 111 | #ifndef NVTX_NO_IMPL 112 | #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ 113 | #include "nvtxDetail/nvtxImplCudaRt_v3.h" 114 | #undef NVTX_IMPL_GUARD_CUDART 115 | #endif /*NVTX_NO_IMPL*/ 116 | 117 | #endif /* NVTOOLSEXT_CUDART_V3 */ 118 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDART 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); 18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); 19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); 20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); 21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); 22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); 23 | 24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) 25 | { 26 | #ifndef NVTX_DISABLE 27 | nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; 28 | if(local!=0) 29 | (*local)(device, name); 30 | #endif /*NVTX_DISABLE*/ 31 | } 32 | 33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) 34 | { 35 | #ifndef NVTX_DISABLE 36 | nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; 37 | if(local!=0) 38 | (*local)(device, name); 39 | #endif /*NVTX_DISABLE*/ 40 | } 41 | 42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) 43 | { 44 | #ifndef NVTX_DISABLE 45 | nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; 46 | if(local!=0) 47 | (*local)(stream, name); 48 | #endif /*NVTX_DISABLE*/ 49 | } 50 | 51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) 52 | { 53 | #ifndef NVTX_DISABLE 54 | nvtxNameCudaStreamW_impl_fntype local = 
(nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; 55 | if(local!=0) 56 | (*local)(stream, name); 57 | #endif /*NVTX_DISABLE*/ 58 | } 59 | 60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) 61 | { 62 | #ifndef NVTX_DISABLE 63 | nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; 64 | if(local!=0) 65 | (*local)(event, name); 66 | #endif /*NVTX_DISABLE*/ 67 | } 68 | 69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) 70 | { 71 | #ifndef NVTX_DISABLE 72 | nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; 73 | if(local!=0) 74 | (*local)(event, name); 75 | #endif /*NVTX_DISABLE*/ 76 | } 77 | 78 | #ifdef __cplusplus 79 | } /* extern "C" */ 80 | #endif /* __cplusplus */ 81 | 82 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDA 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | 18 | typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name); 19 | typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name); 20 | typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name); 21 | typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name); 22 | typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name); 23 | typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name); 24 | typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name); 25 | typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name); 26 | 27 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name) 28 | { 29 | #ifndef NVTX_DISABLE 30 | nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; 31 | if(local!=0) 32 | (*local)(device, name); 33 | #endif /*NVTX_DISABLE*/ 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; 40 | if(local!=0) 41 | (*local)(device, name); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; 49 | if(local!=0) 50 | (*local)(context, name); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API 
nvtxNameCuContextW(CUcontext context, const wchar_t* name) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; 58 | if(local!=0) 59 | (*local)(context, name); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; 67 | if(local!=0) 68 | (*local)(stream, name); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; 76 | if(local!=0) 77 | (*local)(stream, name); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) 82 | { 83 | #ifndef NVTX_DISABLE 84 | nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; 85 | if(local!=0) 86 | (*local)(event, name); 87 | #endif /*NVTX_DISABLE*/ 88 | } 89 | 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) 91 | { 92 | #ifndef NVTX_DISABLE 93 | nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; 94 | if(local!=0) 95 | (*local)(event, name); 96 | #endif /*NVTX_DISABLE*/ 97 | } 98 | 99 | #ifdef __cplusplus 100 | } /* extern "C" */ 101 | #endif /* __cplusplus */ 102 | 103 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_SYNC 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | 18 | typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); 19 | typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); 20 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); 21 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); 22 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); 23 | typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); 24 | 25 | NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs) 26 | { 27 | #ifndef NVTX_DISABLE 28 | nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; 29 | if(local!=0) 30 | return (*local)(domain, attribs); 31 | else 32 | #endif /*NVTX_DISABLE*/ 33 | return (nvtxSyncUser_t)0; 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxDomainSyncUserDestroy_impl_fntype local = 
(nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; 40 | if(local!=0) 41 | (*local)(handle); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; 49 | if(local!=0) 50 | (*local)(handle); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; 58 | if(local!=0) 59 | (*local)(handle); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; 67 | if(local!=0) 68 | (*local)(handle); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; 76 | if(local!=0) 77 | (*local)(handle); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | #ifdef __cplusplus 82 | } /* extern "C" */ 83 | #endif /* __cplusplus */ 84 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef __NVTX_LINKONCE_H__ 10 | #define __NVTX_LINKONCE_H__ 11 | 12 | /* This header defines macros to permit making definitions of global variables 13 | * and functions in C/C++ header files which may be included multiple times in 14 | * a translation unit or linkage unit. It allows authoring header-only libraries 15 | * which can be used by multiple other header-only libraries (either as the same 16 | * copy or multiple copies), and does not require any build changes, such as 17 | * adding another .c file, linking a static library, or deploying a dynamic 18 | * library. Globals defined with these macros have the property that they have 19 | * the same address, pointing to a single instance, for the entire linkage unit. 20 | * It is expected but not guaranteed that each linkage unit will have a separate 21 | * instance. 22 | * 23 | * In some situations it is desirable to declare a variable without initializing 24 | * it, refer to it in code or other variables' initializers, and then initialize 25 | * it later. Similarly, functions can be prototyped, have their address taken, 26 | * and then have their body defined later. In such cases, use the FWDDECL macros 27 | * when forward-declaring LINKONCE global variables without initializers and 28 | * function prototypes, and then use the DEFINE macros when later defining them. 29 | * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, 30 | * following this pattern makes code maximally portable. 
31 | */ 32 | 33 | #if defined(__MINGW32__) /* MinGW */ 34 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 35 | #if defined(__cplusplus) 36 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 37 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK 38 | #else 39 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 40 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 41 | #endif 42 | #elif defined(_MSC_VER) /* MSVC */ 43 | #if defined(__cplusplus) 44 | #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) 45 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 46 | #else 47 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 48 | #define NVTX_LINKONCE_DEFINE_FUNCTION __inline 49 | #endif 50 | #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ 51 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 52 | #if defined(__cplusplus) 53 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 54 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK 55 | #else 56 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 57 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 58 | #endif 59 | #elif defined(__CYGWIN__) /* Assume GCC or compatible */ 60 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 61 | #if defined(__cplusplus) 62 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 63 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 64 | #else 65 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 66 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 67 | #endif 68 | #else /* All others: Assume GCC, clang, or compatible */ 69 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 70 | #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) 71 | #if defined(__cplusplus) 72 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 73 | #define 
NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline 74 | #else 75 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 76 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 77 | #endif 78 | #endif 79 | 80 | #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern 81 | #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION 82 | 83 | #endif /* __NVTX_LINKONCE_H__ */ 84 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | struct ncclP2Pinfo { 13 | void* buff; 14 | ssize_t nbytes; 15 | }; 16 | 17 | typedef ncclRecyclableList ncclP2Plist; 18 | 19 | static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) { 20 | if (p2p == NULL) p2p = new ncclP2Plist(); 21 | struct ncclP2Pinfo* next; 22 | NCCLCHECK(p2p->getNewElem(&next)); 23 | next->buff = buff; 24 | next->nbytes = nBytes; 25 | return ncclSuccess; 26 | } 27 | #endif 28 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PARAM_H_ 8 | #define NCCL_PARAM_H_ 9 | 10 | #include 11 | 12 | const char* userHomeDir(); 13 | void setEnvFile(const char* fileName); 14 | void initEnv(); 15 | 16 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); 17 | 18 | #define NCCL_PARAM(name, env, deftVal) \ 19 | int64_t ncclParam##name() { \ 20 | constexpr int64_t uninitialized = INT64_MIN; \ 21 | static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ 22 | static int64_t cache = uninitialized; \ 23 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ 24 | ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ 25 | } \ 26 | return cache; \ 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | #include "proxy.h" 11 | 12 | enum ncclProxyProfileState { 13 | ncclProxyProfileBegin = 0, 14 | 15 | ncclProxyProfileSendGPUWait = 1, 16 | ncclProxyProfileSendWait = 2, 17 | 18 | ncclProxyProfileRecvWait = 1, 19 | ncclProxyProfileRecvFlushWait = 2, 20 | ncclProxyProfileRecvGPUWait = 3, 21 | 22 | ncclProxyProfileEnd = 4, 23 | 24 | ncclProxyProfileSleep = 8, 25 | ncclProxyProfileWakeup = 9, 26 | 27 | ncclProxyProfileIdle = 16, 28 | ncclProxyProfileActive = 17, 29 | 30 | ncclProxyProfileAppend = 24, 31 | ncclProxyProfileAppendEnd = 25 32 | }; 33 | 34 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); 35 | void ncclProfilingDump(); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHM_H_ 8 | #define NCCL_SHM_H_ 9 | 10 | #include "nccl.h" 11 | 12 | ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create); 13 | ncclResult_t ncclShmUnlink(const char* shmname); 14 | ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize); 15 | #endif 16 | -------------------------------------------------------------------------------- /src/include/socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_H_ 8 | #define NCCL_SOCKET_H_ 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define MAX_IFS 16 19 | #define MAX_IF_NAME_SIZE 16 20 | #define SLEEP_INT 1000 // connection retry sleep interval in usec 21 | #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) 22 | #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) 23 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) 24 | 25 | /* Common socket address storage structure for IPv4/IPv6 */ 26 | union ncclSocketAddress { 27 | struct sockaddr sa; 28 | struct sockaddr_in sin; 29 | struct sockaddr_in6 sin6; 30 | }; 31 | 32 | enum ncclSocketState { 33 | ncclSocketConnecting = 0, 34 | ncclSocketConnected = 1, 35 | ncclSocketError = 2, 36 | ncclSocketStateNum = 3 37 | } ; 38 | 39 | struct ncclSocket { 40 | int fd; 41 | union ncclSocketAddress addr; 42 | volatile uint32_t* abortFlag; 43 | int asyncFlag; 44 | enum 
ncclSocketState state; 45 | }; 46 | 47 | const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); 48 | ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); 49 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); 50 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); 51 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call 52 | ncclResult_t ncclSocketListen(struct ncclSocket* sock); 53 | // Connect to sock->addr. sock->fd is set after a successful call. 54 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock); 55 | // Return socket connection state. 56 | ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state); 57 | // Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 58 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket); 59 | 60 | #define NCCL_SOCKET_SEND 0 61 | #define NCCL_SOCKET_RECV 1 62 | 63 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 64 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 65 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); 66 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); 67 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed); 68 | /* initialize a socket. 
*/ 69 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); 70 | #endif 71 | -------------------------------------------------------------------------------- /src/include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TIMER_H_ 8 | #define NCCL_TIMER_H_ 9 | #if ENABLE_TIMER 10 | #include 11 | #include 12 | #include 13 | static double freq = -1; 14 | static void calibrate() { 15 | struct timeval tv; 16 | gettimeofday(&tv, NULL); 17 | uint64_t timeCycles = __rdtsc(); 18 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 19 | uint64_t total = 0ULL; 20 | for (int i=0; i<10000; i++) total += __rdtsc(); 21 | gettimeofday(&tv, NULL); 22 | timeCycles = __rdtsc() - timeCycles; 23 | time += tv.tv_sec*1E6 + tv.tv_usec; 24 | freq = timeCycles/time; 25 | } 26 | static inline double gettime() { 27 | if (freq == -1) calibrate(); 28 | return __rdtsc()/freq; 29 | } 30 | static uint64_t counts[8]; 31 | static double times[8]; 32 | static double startTimes[8]; 33 | #define TIME_START(index) do { \ 34 | counts[index]++; \ 35 | startTimes[index] = gettime(); \ 36 | } while (0); 37 | 38 | #define TIME_STOP(index) do { \ 39 | times[index] += gettime() - startTimes[index]; \ 40 | } while (0); 41 | 42 | #define TIME_CANCEL(index) do { \ 43 | counts[index]--; \ 44 | } while (0); 45 | 46 | #define TIME_PRINT(name) do { \ 47 | printf("%s stats", name); \ 48 | for (int i=0; i<8; i++) { \ 49 | if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ 50 | counts[i] = 0; \ 51 | } \ 52 | printf("\n"); \ 53 | } while (0); 54 | #else 55 | 
#define TIME_START(index) while(0); 56 | #define TIME_STOP(index) while(0); 57 | #define TIME_CANCEL(index) while(0); 58 | #define TIME_PRINT(name) 59 | #endif 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/transport.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TRANSPORT_H_ 8 | #define NCCL_TRANSPORT_H_ 9 | 10 | #include "devcomm.h" 11 | #include "graph.h" 12 | #include "nvmlwrap.h" 13 | #include "core.h" 14 | 15 | #define NTRANSPORTS 4 16 | #define TRANSPORT_P2P 0 17 | #define TRANSPORT_SHM 1 18 | #define TRANSPORT_NET 2 19 | #define TRANSPORT_COLLNET 3 20 | 21 | #include "proxy.h" 22 | 23 | extern struct ncclTransport ncclTransports[]; 24 | 25 | // Forward declarations 26 | struct ncclRing; 27 | struct ncclConnector; 28 | struct ncclComm; 29 | 30 | struct ncclPeerInfo { 31 | int rank; 32 | int cudaDev; 33 | int netDev; 34 | int gdrSupport; 35 | uint64_t hostHash; 36 | uint64_t pidHash; 37 | dev_t shmDev; 38 | int64_t busId; 39 | struct ncclComm* comm; 40 | int cudaCompCap; 41 | }; 42 | 43 | #define CONNECT_SIZE 128 44 | struct ncclConnect { 45 | char data[CONNECT_SIZE]; 46 | }; 47 | 48 | struct ncclTransportComm { 49 | ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); 50 | ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); 51 | ncclResult_t (*free)(struct ncclConnector*); 52 | ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* 
comm, int nChannels); 53 | ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); 54 | ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); 55 | ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm); 56 | ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*); 57 | }; 58 | 59 | struct ncclTransport { 60 | const char name[4]; 61 | ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); 62 | struct ncclTransportComm send; 63 | struct ncclTransportComm recv; 64 | }; 65 | 66 | ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); 67 | ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); 68 | 69 | enum { collNetRecv=0, collNetSend=1 }; 70 | int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); 71 | ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); 72 | ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); 73 | #endif 74 | -------------------------------------------------------------------------------- /src/include/trees.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TREES_H_ 8 | #define NCCL_TREES_H_ 9 | 10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); 11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/include/utils.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_UTILS_H_ 8 | #define NCCL_UTILS_H_ 9 | 10 | #include "nccl.h" 11 | #include "checks.h" 12 | #include 13 | 14 | int ncclCudaCompCap(); 15 | 16 | // PCI Bus ID <-> int64 conversion functions 17 | ncclResult_t int64ToBusId(int64_t id, char* busId); 18 | ncclResult_t busIdToInt64(const char* busId, int64_t* id); 19 | 20 | ncclResult_t getBusId(int cudaDev, int64_t *busId); 21 | 22 | ncclResult_t getHostName(char* hostname, int maxlen, const char delim); 23 | uint64_t getHash(const char* string, int n); 24 | uint64_t getHostHash(); 25 | uint64_t getPidHash(); 26 | 27 | struct netIf { 28 | char prefix[64]; 29 | int port; 30 | }; 31 | 32 | int parseStringList(const char* string, struct netIf* ifList, int maxList); 33 | bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); 34 | 35 | static long log2i(long n) { 36 | long l = 0; 37 | while (n>>=1) l++; 38 | return l; 39 | } 40 | 41 | // Recyclable list that avoids frequent malloc/free 42 | template 43 | struct ncclListElem { 44 | T 
data; 45 | struct ncclListElem* next; 46 | }; 47 | 48 | template 49 | class ncclRecyclableList { 50 | private: 51 | struct ncclListElem* head; 52 | struct ncclListElem* tail; 53 | struct ncclListElem* cursor; 54 | int n; 55 | 56 | public: 57 | ncclRecyclableList() { 58 | tail = cursor = head = NULL; 59 | n = 0; 60 | } 61 | 62 | int count() const { return n; } 63 | 64 | // Get a new element from the list and return pointer 65 | ncclResult_t getNewElem(T** dataOut) { 66 | if (tail != NULL) { 67 | *dataOut = &tail->data; 68 | memset(*dataOut, 0, sizeof(T)); 69 | } else { 70 | NCCLCHECK(ncclCalloc(&tail, 1)); 71 | *dataOut = &tail->data; 72 | cursor = head = tail; 73 | } 74 | if (tail->next == NULL) { 75 | NCCLCHECK(ncclCalloc(&tail->next, 1)); 76 | } 77 | tail = tail->next; 78 | n += 1; 79 | return ncclSuccess; 80 | } 81 | 82 | T* begin() { 83 | if (head == NULL || head == tail) return NULL; 84 | cursor = head->next; 85 | return &head->data; 86 | } 87 | 88 | // Get next element from the list during an iteration 89 | T* getNext() { 90 | // tail always points to the next element to be enqueued 91 | // hence does not contain valid data 92 | if (cursor == NULL || cursor == tail) return NULL; 93 | T* rv = &cursor->data; 94 | cursor = cursor->next; 95 | return rv; 96 | } 97 | 98 | T* peakNext() { 99 | if (cursor == NULL || cursor == tail) return NULL; 100 | return &cursor->data; 101 | } 102 | 103 | // Recycle the list without freeing the space 104 | void recycle() { 105 | tail = cursor = head; 106 | n = 0; 107 | } 108 | 109 | ~ncclRecyclableList() { 110 | while (head != NULL) { 111 | struct ncclListElem* temp = head; 112 | head = head->next; 113 | free(temp); 114 | } 115 | } 116 | }; 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /src/misc/argcheck.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "argcheck.h" 8 | #include "comm.h" 9 | 10 | static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { 11 | cudaPointerAttributes attr; 12 | cudaError_t err = cudaPointerGetAttributes(&attr, pointer); 13 | if (err != cudaSuccess || attr.devicePointer == NULL) { 14 | WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); 15 | return ncclInvalidArgument; 16 | } 17 | #if CUDART_VERSION >= 10000 18 | if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 19 | #else 20 | if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 21 | #endif 22 | WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); 23 | return ncclInvalidArgument; 24 | } 25 | return ncclSuccess; 26 | } 27 | 28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { 29 | if (ptr == NULL) { 30 | WARN("%s : %s argument is NULL", opname, ptrname); 31 | return ncclInvalidArgument; 32 | } 33 | return ncclSuccess; 34 | } 35 | 36 | ncclResult_t ArgsCheck(struct ncclInfo* info) { 37 | // First, the easy ones 38 | if (info->root < 0 || info->root >= info->comm->nRanks) { 39 | WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); 40 | return ncclInvalidArgument; 41 | } 42 | if (info->datatype < 0 || info->datatype >= ncclNumTypes) { 43 | WARN("%s : invalid type %d", info->opName, info->datatype); 44 | return ncclInvalidArgument; 45 | } 46 | // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
47 | info->nBytes = info->count * ncclTypeSize(info->datatype); 48 | if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAll) { 49 | info->count = info->nBytes; 50 | info->datatype = ncclInt8; 51 | } 52 | if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter || info->coll == ncclFuncAllToAll) info->nBytes *= info->comm->nRanks; // count is per rank 53 | 54 | if (info->op < 0 || ncclMaxRedOp < info->op) { 55 | WARN("%s : invalid reduction operation %d", info->opName, info->op); 56 | return ncclInvalidArgument; 57 | } 58 | int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); 59 | if (ncclNumOps <= info->op && 60 | (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { 61 | WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); 62 | return ncclInvalidArgument; 63 | } 64 | 65 | if (info->comm->checkPointers) { 66 | if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { 67 | if (info->count >0) 68 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); 69 | } else { 70 | // Check CUDA device pointers 71 | if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { 72 | NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); 73 | } 74 | if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { 75 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); 76 | } 77 | } 78 | } 79 | return ncclSuccess; 80 | } 81 | -------------------------------------------------------------------------------- /src/misc/param.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "param.h" 8 | #include "debug.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const char* userHomeDir() { 21 | struct passwd *pwUser = getpwuid(getuid()); 22 | return pwUser == NULL ? NULL : pwUser->pw_dir; 23 | } 24 | 25 | void setEnvFile(const char* fileName) { 26 | FILE * file = fopen(fileName, "r"); 27 | if (file == NULL) return; 28 | 29 | char *line = NULL; 30 | char envVar[1024]; 31 | char envValue[1024]; 32 | size_t n = 0; 33 | ssize_t read; 34 | while ((read = getline(&line, &n, file)) != -1) { 35 | if (line[read-1] == '\n') line[read-1] = '\0'; 36 | int s=0; // Env Var Size 37 | while (line[s] != '\0' && line[s] != '=') s++; 38 | if (line[s] == '\0') continue; 39 | strncpy(envVar, line, std::min(1023,s)); 40 | envVar[s] = '\0'; 41 | s++; 42 | strncpy(envValue, line+s, 1023); 43 | envValue[1023]='\0'; 44 | setenv(envVar, envValue, 0); 45 | //printf("%s : %s->%s\n", fileName, envVar, envValue); 46 | } 47 | if (line) free(line); 48 | fclose(file); 49 | } 50 | 51 | void initEnv() { 52 | char confFilePath[1024]; 53 | const char * userDir = userHomeDir(); 54 | if (userDir) { 55 | sprintf(confFilePath, "%s/.nccl.conf", userDir); 56 | setEnvFile(confFilePath); 57 | } 58 | sprintf(confFilePath, "/etc/nccl.conf"); 59 | setEnvFile(confFilePath); 60 | } 61 | 62 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { 63 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 64 | pthread_mutex_lock(&mutex); 65 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { 66 | char* str = getenv(env); 67 | int64_t value = deftVal; 68 | if (str && strlen(str) > 0) { 69 | errno = 0; 70 | value = strtoll(str, nullptr, 0); 71 | if (errno) { 72 | value = deftVal; 73 | 
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); 74 | } else { 75 | INFO(NCCL_ALL,"%s set by environment to %lld.", env, (long long)value); 76 | } 77 | } 78 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 79 | } 80 | pthread_mutex_unlock(&mutex); 81 | } 82 | -------------------------------------------------------------------------------- /src/misc/profiler.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "profiler.h" 8 | 9 | //#define PROFILE_PROXY 1 10 | #ifdef PROFILE_PROXY 11 | #include "timer.h" 12 | #include "alloc.h" 13 | 14 | static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; 15 | static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; 16 | static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; 17 | struct ncclProxyProfileEvent { 18 | double timestamp[6]; 19 | uint64_t opCount; 20 | int peer; 21 | int step; 22 | uint16_t channel; 23 | uint8_t type; // send / recv 24 | uint8_t opIndex; 25 | }; 26 | 27 | struct ncclProxyProfileEvent* profilingEvents = NULL; 28 | int profilingIndex = 0; 29 | double profilingStart = 0; 30 | #define MAX_EVENTS 200000 31 | 32 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { 33 | if (profilingEvents == NULL) { 34 | NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); 35 | profilingStart = gettime(); 36 | } 37 | struct ncclProxyProfileEvent* event = NULL; 38 | if (state%8 == 0) { 39 | if (profilingIndex == MAX_EVENTS) return ncclSuccess; 40 | args->subs[sub].profilingEvents[step%NCCL_STEPS] = 
event = profilingEvents+profilingIndex++; 41 | if (state == ncclProxyProfileBegin) { 42 | // Proxy operation information 43 | event->opCount = args->opCount; 44 | event->channel = args->subs[sub].channelId; 45 | event->peer = args->subs[sub].peer; 46 | event->type = args->pattern; 47 | event->step = step; 48 | event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; 49 | } else event->peer = -state; 50 | } else { 51 | event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; 52 | if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; 53 | if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; 54 | } 55 | // Timestamp 56 | event->timestamp[state%8] = gettime()-profilingStart; 57 | return ncclSuccess; 58 | } 59 | 60 | void ncclProfilingDump() { 61 | static int dumpDone = 0; 62 | if (dumpDone) return; 63 | dumpDone = 1; 64 | const char* str = getenv("NCCL_PROXY_PROFILE"); 65 | if (!str) { free(profilingEvents); return; } 66 | FILE* f = fopen(str, "w"); 67 | fprintf(f, "[\n"); 68 | 69 | for (int i=0; ipeer >= 0; 72 | const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") : 73 | profilingEventStr[-(e->peer/8)]; 74 | 75 | 76 | if (sendrecv) { 77 | int state = ncclProxyProfileBegin; 78 | const char** stateStr = e->type == ncclPatternSend ? 
profilingStateSendStr : profilingStateRecvStr; 79 | fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", 80 | typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); 81 | 82 | while (statetimestamp[state]) { 84 | const char* name = stateStr[state]; 85 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", 86 | name, i, e->channel, e->timestamp[state]); 87 | state++; 88 | while (e->timestamp[state] == 0) state++; 89 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", 90 | name, i, e->channel, e->timestamp[state]); 91 | } 92 | } 93 | 94 | fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", 95 | typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); 96 | } else { 97 | if (e->peer == -ncclProxyProfileAppend) { 98 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n", 99 | typeStr, i, e->timestamp[0], e->opCount); 100 | } else { 101 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", 102 | typeStr, i, e->timestamp[0]); 103 | } 104 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", 105 | typeStr, i, e->timestamp[1]); 106 | } 107 | } 108 | fprintf(f, "{} ]\n"); 109 | fclose(f); 110 | free(profilingEvents); 111 | } 112 | #else 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } 114 | void ncclProfilingDump() {} 115 | #endif 116 | 
-------------------------------------------------------------------------------- /src/misc/shmutils.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "shm.h" 8 | #include "checks.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | // Change functions behavior to match other SYS functions 19 | static int shm_allocate(int fd, const int shmSize) { 20 | int err = posix_fallocate(fd, 0, shmSize); 21 | if (err) { errno = err; return -1; } 22 | return 0; 23 | } 24 | static int shm_map(int fd, const int shmSize, void** ptr) { 25 | *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 26 | return (*ptr == MAP_FAILED) ? 
-1 : 0; 27 | } 28 | 29 | static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) { 30 | if (create) { 31 | if (shmPath[0] == '\0') { 32 | sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); 33 | *fd = mkstemp(shmPath); 34 | } else { 35 | SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); 36 | } 37 | if (ftruncate(*fd, shmSize) != 0) { 38 | WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize); 39 | return ncclSystemError; 40 | } 41 | } else { 42 | SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); 43 | } 44 | *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); 45 | if (*ptr == NULL) { 46 | WARN("Could not map %s\n", shmPath); 47 | return ncclSystemError; 48 | } 49 | close(*fd); 50 | *fd = -1; 51 | if (create) memset(*ptr, 0, shmSize); 52 | return ncclSuccess; 53 | } 54 | 55 | ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) { 56 | int fd = -1; 57 | void* ptr = MAP_FAILED; 58 | ncclResult_t res = ncclSuccess; 59 | 60 | NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError); 61 | if (devShmPtr) { 62 | CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError); 63 | CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError); 64 | } 65 | 66 | *shmPtr = ptr; 67 | return ncclSuccess; 68 | sysError: 69 | WARN("Error while %s shared memory segment %s (size %d)", create ? 
"creating" : "attaching to", shmPath, shmSize); 70 | cudaError: 71 | if (fd != -1) close(fd); 72 | if (create) shm_unlink(shmPath); 73 | if (ptr != MAP_FAILED) munmap(ptr, shmSize); 74 | *shmPtr = NULL; 75 | return res; 76 | } 77 | 78 | ncclResult_t ncclShmUnlink(const char* shmPath) { 79 | if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink"); 80 | return ncclSuccess; 81 | } 82 | 83 | ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { 84 | if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); 85 | if (munmap(shmPtr, shmSize) != 0) { 86 | WARN("munmap of shared memory failed"); 87 | return ncclSystemError; 88 | } 89 | return ncclSuccess; 90 | } 91 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | --------------------------------------------------------------------------------