├── .github └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── SECURITY.md ├── dockerfiles └── Dockerfile ├── ext-net ├── dummy │ ├── Makefile │ └── plugin.c └── google-fastsocket │ └── Makefile ├── makefiles ├── common.mk ├── formatting.mk └── version.mk ├── patches ├── nccl.cpp.patch ├── torch1.12.nccl.cpp.patch └── torch1.13.nccl.cpp.patch ├── pkg ├── Makefile ├── debian │ ├── .gitignore │ ├── Makefile │ ├── changelog.in │ ├── compat │ ├── control.in │ ├── copyright │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── libnccl2.install.in │ ├── rules │ └── source │ │ └── format ├── redhat │ ├── Makefile │ └── nccl.spec.in ├── srctxz │ ├── Makefile │ └── create_srctxz.sh.in └── txz │ ├── Makefile │ └── create_txz.sh.in ├── src ├── Makefile ├── bootstrap.cc ├── channel.cc ├── collectives │ ├── all_gather.cc │ ├── all_reduce.cc │ ├── all_to_all.cc │ ├── broadcast.cc │ ├── custom_collective.cc │ ├── device │ │ ├── Makefile │ │ ├── all_gather.cu │ │ ├── all_gather.h │ │ ├── all_reduce.cu │ │ ├── all_reduce.h │ │ ├── all_to_all.cu │ │ ├── all_to_all.h │ │ ├── broadcast.cu │ │ ├── broadcast.h │ │ ├── common.h │ │ ├── common_kernel.h │ │ ├── custom_collective.cu │ │ ├── custom_collective.h │ │ ├── functions.cu │ │ ├── gen_rules.sh │ │ ├── msccl_interpreter.h │ │ ├── onerank_reduce.cu │ │ ├── op128.h │ │ ├── primitives.h │ │ ├── prims_ll.h │ │ ├── prims_ll128.h │ │ ├── prims_simple.h │ │ ├── reduce.cu │ │ ├── reduce.h │ │ ├── reduce_kernel.h │ │ ├── reduce_scatter.cu │ │ ├── reduce_scatter.h │ │ ├── sendrecv.cu │ │ ├── sendrecv.h │ │ └── stride_copy.cu │ ├── reduce.cc │ ├── reduce_scatter.cc │ └── sendrecv.cc ├── debug.cc ├── enhcompat.cc ├── enqueue.cc ├── graph │ ├── connect.cc │ ├── paths.cc │ ├── rings.cc │ ├── rings.h │ ├── search.cc │ ├── topo.cc │ ├── topo.h │ ├── trees.cc │ ├── tuning.cc │ ├── xml.cc │ └── xml.h ├── group.cc ├── include │ ├── align.h │ ├── alloc.h │ ├── argcheck.h │ ├── bootstrap.h │ ├── channel.h │ 
├── checks.h │ ├── coll_net.h │ ├── collectives.h │ ├── comm.h │ ├── core.h │ ├── cpuset.h │ ├── debug.h │ ├── devcomm.h │ ├── enqueue.h │ ├── gdrwrap.h │ ├── graph.h │ ├── group.h │ ├── ibvwrap.h │ ├── info.h │ ├── msccl.h │ ├── nccl_net.h │ ├── net.h │ ├── npkit │ │ ├── npkit.h │ │ ├── npkit_event.h │ │ └── npkit_struct.h │ ├── nvmlwrap.h │ ├── nvtx.h │ ├── nvtx3.hpp │ ├── nvtx3 │ │ ├── nvToolsExt.h │ │ ├── nvToolsExtCuda.h │ │ ├── nvToolsExtCudaRt.h │ │ ├── nvToolsExtOpenCL.h │ │ ├── nvToolsExtSync.h │ │ └── nvtxDetail │ │ │ ├── nvtxImpl.h │ │ │ ├── nvtxImplCore.h │ │ │ ├── nvtxImplCudaRt_v3.h │ │ │ ├── nvtxImplCuda_v3.h │ │ │ ├── nvtxImplOpenCL_v3.h │ │ │ ├── nvtxImplSync_v3.h │ │ │ ├── nvtxInit.h │ │ │ ├── nvtxInitDecls.h │ │ │ ├── nvtxInitDefs.h │ │ │ ├── nvtxLinkOnce.h │ │ │ └── nvtxTypes.h │ ├── p2p.h │ ├── param.h │ ├── profiler.h │ ├── proxy.h │ ├── shm.h │ ├── socket.h │ ├── timer.h │ ├── transport.h │ ├── trees.h │ └── utils.h ├── init.cc ├── misc │ ├── argcheck.cc │ ├── gdrwrap.cc │ ├── ibvwrap.cc │ ├── npkit.cc │ ├── nvmlwrap.cc │ ├── param.cc │ ├── profiler.cc │ ├── shmutils.cc │ ├── socket.cc │ └── utils.cc ├── nccl.h.in ├── nccl.pc.in ├── net.cc ├── proxy.cc ├── transport.cc └── transport │ ├── coll_net.cc │ ├── net.cc │ ├── net_ib.cc │ ├── net_socket.cc │ ├── p2p.cc │ └── shm.cc └── tools └── npkit_trace_generator.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '30 12 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-18.04 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'cpp' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | # - name: Autobuild 57 | # uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | - run: | 67 | sudo wget -O /etc/apt/preferences.d/cuda-repository-pin-600 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin 68 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub 69 | sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" 70 | sudo apt install cuda -y 71 | export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} 72 | make src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" 73 | 74 | - name: Perform CodeQL Analysis 75 | uses: github/codeql-action/analyze@v1 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 2 | /build 3 | *.gcov 4 | /coverage/ 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Contains contributions from NVIDIA. 3 | 4 | Copyright (c) 2020-2022, Microsoft Corporation. All rights reserved. 5 | 6 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions 10 | are met: 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 
13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 17 | Laboratory, the U.S. Department of Energy, nor the names of their 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 29 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | The U.S. Department of Energy funded the development of this software 34 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 35 | 36 | 37 | This code also includes files from the NVIDIA Tools Extension SDK project. 38 | 39 | See: 40 | 41 | https://github.com/NVIDIA/NVTX 42 | 43 | for more information and license details. 44 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 
14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel 2 | 3 | ############################################################################## 4 | # Temporary Installation Directory 5 | ############################################################################## 6 | ENV STAGE_DIR=/tmp 7 | RUN mkdir -p ${STAGE_DIR} 8 | 9 | 10 | ############################################################################## 11 | # Installation/Basic Utilities 12 | ############################################################################## 13 | RUN apt-get update && \ 14 | apt-get install -y --allow-change-held-packages --no-install-recommends \ 15 | software-properties-common \ 16 | build-essential autotools-dev cmake g++ gcc \ 17 | openssh-client openssh-server \ 18 | nfs-common pdsh curl sudo net-tools \ 19 | vim iputils-ping wget perl unzip 20 | 21 | ############################################################################## 22 | # Installation Latest Git 23 | ############################################################################## 24 | RUN add-apt-repository ppa:git-core/ppa -y && \ 25 | apt-get update && \ 26 | apt-get install -y git && \ 27 | git --version 28 | 29 | ############################################################################## 30 | # Pip 31 | ############################################################################## 32 | # pip version <= 20.1.1 is needed for the ruamel.yaml installation conflict 33 | # between conda and pip. ruamel.yaml is needed by azureml. 34 | # https://github.com/Azure/MachineLearningNotebooks/issues/1110 for more info. 
35 | ENV PIP_VERSION=20.1.1 36 | RUN conda install -y pip=${PIP_VERSION} && \ 37 | # Print python an pip version 38 | python -V && pip -V 39 | 40 | ############################################################################## 41 | # MPI 42 | ############################################################################## 43 | RUN cd ${STAGE_DIR} && mkdir openmpi/ && cd openmpi && wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.1.tar.gz && \ 44 | tar zxf openmpi-4.0.1.tar.gz && \ 45 | cd openmpi-4.0.1 && \ 46 | ./configure --enable-orterun-prefix-by-default && \ 47 | make -j $(nproc) all && \ 48 | make install && \ 49 | ldconfig && \ 50 | rm -rf ${STAGE_DIR}/openmpi/ 51 | 52 | ############################################################################## 53 | # SCCL 54 | ############################################################################## 55 | 56 | # update NCCL in pytorch, install SCCL interpreter 57 | RUN pip uninstall torch -y 58 | 59 | RUN pip install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses 60 | 61 | RUN conda install -c pytorch magma-cuda111 -y 62 | 63 | ENV CMAKE_PREFIX_PATH=/opt/conda 64 | 65 | # Change NCCL to SCCL Runtime 66 | RUN cd ${STAGE_DIR} && \ 67 | git clone https://github.com/pytorch/pytorch.git && \ 68 | cd pytorch && \ 69 | git checkout tags/v1.9.0 -b v1.9.0_sccl && \ 70 | perl -p -i -e 's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules && \ 71 | git submodule sync third_party/nccl && \ 72 | git submodule update --init --recursive && \ 73 | git submodule update --init --recursive --remote third_party/nccl && \ 74 | cd third_party/nccl/nccl/ && \ 75 | git checkout master && \ 76 | cd ../../../ && \ 77 | git apply third_party/nccl/nccl/patches/nccl.cpp.patch && \ 78 | python setup.py install && \ 79 | cd ${STAGE_DIR} && \ 80 | rm -rf ${STAGE_DIR}/pytorch 81 | 82 | # Install SCCL 83 | RUN cd 
${STAGE_DIR}/ && \ 84 | git clone https://github.com/microsoft/sccl.git && \ 85 | cd sccl/ && python setup.py install && \ 86 | cd ${STAGE_DIR} && \ 87 | rm -rf ${STAGE_DIR}/sccl/ 88 | 89 | ############################################################################## 90 | # inspector-topo 91 | ############################################################################## 92 | 93 | RUN apt-get install libibverbs-dev libnuma-dev -y 94 | RUN cd ${STAGE_DIR}/ && git clone https://github.com/microsoft/inspector-topo.git && \ 95 | cd inspector-topo/ && make && make install 96 | -------------------------------------------------------------------------------- /ext-net/dummy/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-net/dummy/plugin.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | #include 9 | 10 | #define __hidden __attribute__ ((visibility("hidden"))) 11 | 12 | __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } 13 | __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } 14 | __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } 15 | __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } 16 | __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } 17 | __hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } 18 | __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } 19 | __hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } 20 | __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} 21 | __hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; } 22 | __hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; } 23 | __hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { return ncclInternalError; } 24 | __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } 25 | __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } 26 | __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } 27 | __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } 28 | 29 | ncclNet_t NCCL_PLUGIN_SYMBOL = { 30 | "Dummy", 
31 | pluginInit, 32 | pluginDevices, 33 | pluginPciPath, 34 | pluginPtrSupport, 35 | pluginListen, 36 | pluginConnect, 37 | pluginAccept, 38 | pluginRegMr, 39 | pluginDeregMr, 40 | pluginIsend, 41 | pluginIrecv, 42 | pluginFlush, 43 | pluginTest, 44 | pluginCloseSend, 45 | pluginCloseRecv, 46 | pluginCloseListen 47 | }; 48 | 49 | __hidden ncclResult_t pluginCollNetInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } 50 | __hidden ncclResult_t pluginCollNetDevices(int* ndev) { *ndev = 0; return ncclSuccess; } 51 | __hidden ncclResult_t pluginCollNetPciPath(int dev, char** path) { return ncclInternalError; } 52 | __hidden ncclResult_t pluginCollNetPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } 53 | __hidden ncclResult_t pluginCollNetListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } 54 | __hidden ncclResult_t pluginCollNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { return ncclInternalError; } 55 | __hidden ncclResult_t pluginCollNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { return ncclInternalError; } 56 | __hidden ncclResult_t pluginCollNetRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } 57 | __hidden ncclResult_t pluginCollNetDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} 58 | __hidden ncclResult_t pluginCollNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { return ncclInternalError; } 59 | __hidden ncclResult_t pluginCollNetFlush(void* collComm, void* data, int size, void* mhandle) { return ncclInternalError; } 60 | __hidden ncclResult_t pluginCollNetTest(void* request, int* done, int* size) { return ncclInternalError; } 61 | __hidden ncclResult_t pluginCollNetCloseColl(void* collComm) { return ncclInternalError; } 62 | __hidden ncclResult_t 
pluginCollNetCloseListen(void* listenComm) { return ncclInternalError; } 63 | 64 | ncclCollNet_t NCCL_COLLNET_PLUGIN_SYMBOL = { 65 | "Dummy", 66 | pluginCollNetInit, 67 | pluginCollNetDevices, 68 | pluginCollNetPciPath, 69 | pluginCollNetPtrSupport, 70 | pluginCollNetListen, 71 | pluginCollNetConnect, 72 | pluginCollNetReduceSupport, 73 | pluginCollNetRegMr, 74 | pluginCollNetDeregMr, 75 | pluginCollNetIallreduce, 76 | pluginCollNetFlush, 77 | pluginCollNetTest, 78 | pluginCollNetCloseColl, 79 | pluginCollNetCloseListen 80 | }; 81 | -------------------------------------------------------------------------------- /ext-net/google-fastsocket/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME?=/usr/local/cuda 2 | INC:=-I$(CUDA_HOME)/include 3 | PLUGIN_SO:=libnccl-net.so 4 | 5 | default: $(PLUGIN_SO) 6 | 7 | $(PLUGIN_SO): nccl-fastsocket/net_fastsocket.cc nccl-fastsocket/compat.cc nccl-fastsocket/utilities.cc 8 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 9 | 10 | nccl-fastsocket/%.cc: 11 | git clone https://github.com/google/nccl-fastsocket.git 12 | 13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) 14 | 15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) 16 | @printf "Grabbing %-35s > %s\n" $< $@ 17 | mkdir -p $(BUILDDIR)/lib 18 | install -m 644 $< $@ 19 | 20 | clean: 21 | rm -f $(PLUGIN_SO) 22 | rm -Rf nccl-fastsocket 23 | -------------------------------------------------------------------------------- /makefiles/common.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | CUDA_HOME ?= /usr/local/cuda 8 | PREFIX ?= /usr/local 9 | VERBOSE ?= 0 10 | KEEP ?= 0 11 | DEBUG ?= 0 12 | TRACE ?= 0 13 | PROFAPI ?= 0 14 | NVTX ?= 1 15 | NPKIT ?= 0 16 | 17 | NVCC = $(CUDA_HOME)/bin/nvcc 18 | 19 | CUDA_LIB ?= $(CUDA_HOME)/lib64 20 | CUDA_INC ?= $(CUDA_HOME)/include 21 | CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) 22 | #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) 23 | CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) 24 | CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) 25 | #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) 26 | 27 | # You should define NVCC_GENCODE in your environment to the minimal set 28 | # of archs to reduce compile time. 29 | CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \ 30 | -gencode=arch=compute_50,code=sm_50 \ 31 | -gencode=arch=compute_60,code=sm_60 \ 32 | -gencode=arch=compute_61,code=sm_61 33 | CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 34 | CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 35 | 36 | CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 37 | CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 38 | CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 39 | 40 | # Include Ampere support if we're using CUDA11 or above 41 | ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) 42 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) 43 | # Include Volta support if we're using CUDA9 or above 44 | else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) 45 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) 46 | else 47 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) 48 | endif 49 | #$(info NVCC_GENCODE is ${NVCC_GENCODE}) 50 | 51 | CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC 
-fvisibility=hidden \ 52 | -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ 53 | -I $(CUDA_INC) \ 54 | $(CXXFLAGS) 55 | # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) 56 | # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 57 | # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 58 | NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all 59 | # Use addprefix so that we can specify more than one path 60 | NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt 61 | 62 | ########## GCOV ########## 63 | GCOV ?= 0 # disable by default. 64 | GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 65 | CXXFLAGS += ${GCOV_FLAGS} 66 | NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} 67 | LDFLAGS += ${GCOV_FLAGS} 68 | NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} 69 | # $(warning GCOV_FLAGS=${GCOV_FLAGS}) 70 | ########## GCOV ########## 71 | 72 | ifeq ($(DEBUG), 0) 73 | NVCUFLAGS += -O3 74 | CXXFLAGS += -O3 -g 75 | else 76 | NVCUFLAGS += -O0 -G -g 77 | CXXFLAGS += -O0 -g -ggdb3 78 | endif 79 | 80 | ifneq ($(VERBOSE), 0) 81 | NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter 82 | CXXFLAGS += -Wall -Wextra 83 | else 84 | .SILENT: 85 | endif 86 | 87 | ifneq ($(TRACE), 0) 88 | CXXFLAGS += -DENABLE_TRACE 89 | endif 90 | 91 | ifeq ($(NVTX), 0) 92 | CXXFLAGS += -DNVTX_DISABLE 93 | endif 94 | 95 | ifeq ($(NPKIT), 1) 96 | CXXFLAGS += -DENABLE_NPKIT 97 | NVCUFLAGS += -DENABLE_NPKIT 98 | endif 99 | 100 | ifneq ($(KEEP), 0) 101 | NVCUFLAGS += -keep 102 | endif 103 | 104 | ifneq ($(PROFAPI), 0) 105 | CXXFLAGS += -DPROFAPI 106 | endif 107 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA 
CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 10 | 11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none 12 | ASTYLEDIR := $(BUILDDIR)/contrib 13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz 14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle 15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ 16 | ASTYLEVER := 3.1 17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" 18 | 19 | $(ASTYLEDIR) : 20 | @mkdir -p $(ASTYLEDIR) 21 | 22 | $(ASTYLETAR) : $(ASTYLEDIR) 23 | @wget -q -O $(ASTYLETAR) $(ASTYLEURL) 24 | 25 | $(ASTYLEBLD) : $(ASTYLETAR) 26 | @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) 27 | 28 | $(ASTYLEBIN) : $(ASTYLEBLD) 29 | ${MAKE} -C $(ASTYLEBLD) 30 | 31 | .PHONY : format 32 | format : $(ASTYLEBIN) 33 | @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) 34 | -------------------------------------------------------------------------------- /makefiles/version.mk: -------------------------------------------------------------------------------- 1 | ##### version 2 | NCCL_MAJOR := 2 3 | NCCL_MINOR := 12 4 | NCCL_PATCH := 12 5 | NCCL_SUFFIX := 6 | PKG_REVISION := 1 7 | MSCCL_VERSION := 0.1 8 | -------------------------------------------------------------------------------- /patches/nccl.cpp.patch: -------------------------------------------------------------------------------- 1 | diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp 2 | index 0248e81496..275154c5ce 100644 3 | --- a/torch/csrc/cuda/nccl.cpp 4 | +++ b/torch/csrc/cuda/nccl.cpp 5 | @@ 
-640,12 +640,22 @@ void all2all_single_equal_split(at::Tensor& input, 6 | #if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && (NCCL_MAJOR * 10 + NCCL_MINOR) >= 27 7 | using namespace torch::cuda::nccl::detail; 8 | 9 | - int numranks; 10 | + //int numranks; 11 | auto type = to_nccl_data_type(input); 12 | size_t count = input.numel() / size; 13 | - size_t rankdiff = input.nbytes() / size; 14 | const auto* sendbuff = reinterpret_cast(input.data_ptr()); 15 | auto* recvbuff = reinterpret_cast(output.data_ptr()); 16 | + NCCL_CHECK(ncclAllToAll( 17 | + sendbuff, 18 | + recvbuff, 19 | + count, 20 | + type, 21 | + to_nccl_comm(_comm), 22 | + stream)); 23 | + 24 | + /* 25 | + //size_t rankdiff = input.nbytes() / size; 26 | + 27 | auto comm = to_nccl_comm(_comm); 28 | NCCL_CHECK(ncclCommCount(comm, &numranks)); 29 | NCCL_CHECK(ncclGroupStart()); 30 | @@ -658,6 +668,7 @@ void all2all_single_equal_split(at::Tensor& input, 31 | } 32 | } 33 | NCCL_CHECK(ncclGroupEnd()); 34 | + */ 35 | #else 36 | AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); 37 | #endif 38 | -------------------------------------------------------------------------------- /patches/torch1.12.nccl.cpp.patch: -------------------------------------------------------------------------------- 1 | diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp 2 | index 5817449c1a..edc4f7781a 100644 3 | --- a/torch/csrc/cuda/nccl.cpp 4 | +++ b/torch/csrc/cuda/nccl.cpp 5 | @@ -650,7 +650,7 @@ void all2all_single_equal_split(at::Tensor& input, 6 | const auto* sendbuff = reinterpret_cast(input.data_ptr()); 7 | auto* recvbuff = reinterpret_cast(output.data_ptr()); 8 | auto comm = to_nccl_comm(_comm); 9 | -#if defined(USE_ROCM) && ROCM_VERSION >= 50000 10 | +#if 1 11 | NCCL_CHECK(ncclAllToAll(sendbuff , recvbuff , count, type, comm, stream)); 12 | #else 13 | NCCL_CHECK(ncclCommCount(comm, &numranks)); 14 | -------------------------------------------------------------------------------- 
/patches/torch1.13.nccl.cpp.patch: -------------------------------------------------------------------------------- 1 | diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp 2 | index 83729084ae..934bf24ea4 100644 3 | --- a/torch/csrc/cuda/nccl.cpp 4 | +++ b/torch/csrc/cuda/nccl.cpp 5 | @@ -655,7 +655,7 @@ void all2all_single_equal_split( 6 | const auto* sendbuff = reinterpret_cast(input.data_ptr()); 7 | auto* recvbuff = reinterpret_cast(output.data_ptr()); 8 | auto comm = to_nccl_comm(_comm); 9 | -#if defined(USE_ROCM) && ROCM_VERSION >= 50000 10 | +#if 1 11 | NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); 12 | #else 13 | NCCL_CHECK(ncclCommCount(comm, &numranks)); 14 | -------------------------------------------------------------------------------- /pkg/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : build 9 | build : debian.build txz.build 10 | 11 | BUILDDIR ?= $(abspath ../build) 12 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 13 | TARGETS := debian txz 14 | all: ${TARGETS:%=%.build} 15 | prep: ${TARGETS:%=%.prep} 16 | build: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.prep: 20 | ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} 21 | 22 | %.build: 23 | ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} 24 | 25 | %.clean: 26 | ${MAKE} -C $* clean 27 | -------------------------------------------------------------------------------- /pkg/debian/.gitignore: -------------------------------------------------------------------------------- 1 | /*.debhelper.log 2 | /*.debhelper 3 | /*.substvars 4 | /tmp/ 5 | /files 6 | /libnccl1/ 7 | /libnccl-dev/ 8 | -------------------------------------------------------------------------------- /pkg/debian/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | DEBPREPDIR := $(BUILDDIR)/debian 11 | PKGDIR := $(BUILDDIR)/pkg/deb/ 12 | 13 | DEBGEN_IN := $(wildcard *.in) 14 | DEBGEN := $(DEBGEN_IN:.in=) 15 | DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) 16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) 20 | PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) 21 | 22 | prep : $(DEBTARGETS) 23 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 24 | 25 | build : prep 26 | $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) 27 | @printf "Building Debian package\n" 28 | (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) 29 | mkdir -p $(PKGDIR) 30 | mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ 31 | 32 | clean: 33 | rm -Rf $(DEBPREPDIR) $(PKGDIR) 34 | 35 | $(DEBPREPDIR)/% : %.in 36 | @printf "Generating %-35s > %s\n" $< $@ 37 | mkdir -p $(DEBPREPDIR) 38 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 39 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 40 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 41 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 42 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 43 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 44 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 45 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 46 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 47 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 48 | $< > $@ 49 | 50 | $(DEBPREPDIR)/% : % 51 | @printf "Grabbing %-35s > %s\n" $< $@ 52 | mkdir -p $(DEBPREPDIR) 53 | cp -f $< $@ 54 | -------------------------------------------------------------------------------- /pkg/debian/changelog.in: 
-------------------------------------------------------------------------------- 1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium 2 | 3 | * Automatic Debian package from build 4 | 5 | -- cudatools ${pkg:Timestamp} 6 | -------------------------------------------------------------------------------- /pkg/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /pkg/debian/control.in: -------------------------------------------------------------------------------- 1 | Source: nccl 2 | Section: libs 3 | Maintainer: cudatools 4 | Priority: optional 5 | Build-depends: debhelper(>=9) 6 | Standards-Version: 3.9.5 7 | 8 | Package: libnccl${nccl:Major} 9 | Section: libs 10 | Architecture: ${pkg:Arch} 11 | Depends: ${misc:Depends}, ${shlibs:Depends} 12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | Package: libnccl-dev 21 | Section: libdevel 22 | Architecture: ${pkg:Arch} 23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) 24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files 25 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 26 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 27 | broadcast, and reduce-scatter. 
28 | It has been optimized to achieve high bandwidth on any platform using PCIe, 29 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 30 | sockets. 31 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 13 | Laboratory, the U.S. Department of Energy, nor the names of their 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | The U.S. Department of Energy funded the development of this software 30 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 31 | 32 | 33 | This code also includes files from the NVIDIA Tools Extension SDK project. 34 | 35 | See: 36 | 37 | https://github.com/NVIDIA/NVTX 38 | 39 | for more information and license details. 40 | -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | include/nccl.h /usr/include 2 | include/nccl_net.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | 
-------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /pkg/redhat/nccl.spec.in: -------------------------------------------------------------------------------- 1 | Name: libnccl 2 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} 3 | Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} 4 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 5 | 6 | Group: Development/Libraries 7 | License: BSD 8 | URL: http://developer.nvidia.com/nccl 9 | Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz 10 | Requires(pre,preun): /sbin/ldconfig 11 | 12 | %description 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, 
all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | %package devel 21 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 22 | Group: Development/Libraries 23 | %description devel 24 | NCCL development files 25 | 26 | %package static 27 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 28 | Group: Development/Libraries 29 | %description static 30 | NCCL static library 31 | 32 | %define debug_package %{nil} 33 | 34 | %prep 35 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q 36 | 37 | %build 38 | 39 | %install 40 | rm -rf $RPM_BUILD_ROOT 41 | install -m 755 -d $RPM_BUILD_ROOT 42 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} 43 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} 44 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} 45 | 46 | # devel 47 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} 48 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} 49 | install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} 50 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so 51 | 52 | # static 53 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} 54 | 55 | %post -p /sbin/ldconfig 56 | %postun -p /sbin/ldconfig 57 | 58 | %post devel -p /sbin/ldconfig 59 | %postun devel -p /sbin/ldconfig 60 | 61 | %clean 62 | rm -rf $RPM_BUILD_ROOT 63 | 64 | %files devel 65 | %doc LICENSE.txt 66 | %defattr(-,root,root,-) 67 | %{_includedir}/nccl.h 68 | %{_includedir}/nccl_net.h 69 | %{_libdir}/libnccl.so 70 | 71 | %files static 72 | %doc LICENSE.txt 73 | %defattr(-,root,root,-) 74 | %{_libdir}/libnccl_static.a 75 | 76 | %files 77 | 
%doc LICENSE.txt 78 | %defattr(-,root,root,-) 79 | %{_libdir}/libnccl.so.${nccl:Major} 80 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} 81 | 82 | %changelog 83 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. 
11 | NCCLDIR=`basename $PWD` 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommited files ..." 17 | if [ "`git status -s | wc -l`" != "0" ]; then 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read 21 | fi 22 | 23 | cd .. 24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR 36 | -------------------------------------------------------------------------------- /pkg/txz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/txz 11 | PKGDIR := $(BUILDDIR)/pkg/txz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_ARCH := $(shell uname -m) 18 | 19 | prep: $(TXZTARGETS) 20 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 21 | 22 | build: prep 23 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../makefiles/common.mk 8 | include ../makefiles/version.mk 9 | 10 | ##### src files 11 | INCEXPORTS := nccl.h nccl_net.h 12 | LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \ 13 | misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/npkit.cc \ 14 | transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ 15 | collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc collectives/all_to_all.cc collectives/custom_collective.cc \ 16 | graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc 17 | 18 | ##### lib files 19 | LIBNAME := libnccl.so 20 | STATICLIBNAME := libnccl_static.a 21 | ##### pkgconfig files 22 | PKGCONFIGFILE := nccl.pc 23 | ##### dirs 24 | BUILDDIR ?= $(abspath ../build) 25 | INCDIR := $(BUILDDIR)/include 26 | LIBDIR := $(BUILDDIR)/lib 27 | OBJDIR := $(BUILDDIR)/obj 28 | PKGDIR := $(BUILDDIR)/lib/pkgconfig 29 | ##### target files 30 | CUDARTLIB ?= cudart_static 31 | INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) 32 | LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) 33 | LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) 34 | STATICLIBTARGET := $(STATICLIBNAME) 35 | PKGTARGET := $(PKGCONFIGFILE) 36 | LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) 37 | DEPFILES := $(LIBOBJ:%.o=%.d) 38 | LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl 39 | 40 | DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a 41 | 42 | ##### rules 43 | build : lib staticlib 44 | 45 | lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) 46 | 47 | staticlib : $(LIBDIR)/$(STATICLIBTARGET) 48 | 49 | 
$(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS) 50 | $(MAKE) -C collectives/device 51 | 52 | # Empty target to force rebuild 53 | ALWAYS_REBUILD: 54 | 55 | -include $(DEPFILES) 56 | $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) 57 | 58 | $(INCDIR)/nccl.h : nccl.h.in 59 | # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z)) 60 | @$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) 61 | mkdir -p $(INCDIR) 62 | @printf "Generating %-35s > %s\n" $< $@ 63 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 64 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 65 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 66 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 67 | -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ 68 | $< > $@ 69 | 70 | $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) 71 | @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ 72 | mkdir -p $(LIBDIR) 73 | $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS) 74 | ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) 75 | ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) 76 | 77 | null := 78 | space := $(null) # 79 | comma := , 80 | 81 | $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) 82 | @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ 83 | mkdir -p $(LIBDIR) 84 | printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M 85 | 86 | $(PKGDIR)/nccl.pc : nccl.pc.in 87 | mkdir -p $(PKGDIR) 88 | @printf "Generating %-35s > %s\n" $< $@ 89 | sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ 90 | -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 91 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 92 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 93 | $< > $@ 94 | 95 | $(INCDIR)/%.h : %.h 96 | @printf "Grabbing %-35s > %s\n" $< $@ 97 | mkdir -p $(INCDIR) 98 | install -m 644 $< $@ 99 | 100 | $(INCDIR)/nccl_%.h : include/nccl_%.h 101 | @printf "Grabbing %-35s > %s\n" $< $@ 102 | mkdir -p $(INCDIR) 103 | install -m 644 
$< $@ 104 | 105 | $(PKGDIR)/%.pc : %.pc 106 | @printf "Grabbing %-35s > %s\n" $< $@ 107 | mkdir -p $(PKGDIR) 108 | install -m 644 $< $@ 109 | 110 | $(OBJDIR)/%.o : %.cc $(INCTARGETS) 111 | @printf "Compiling %-35s > %s\n" $< $@ 112 | mkdir -p `dirname $@` 113 | $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ 114 | @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) 115 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) 116 | @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ 117 | sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) 118 | @rm -f $(@:%.o=%.d.tmp) 119 | 120 | clean : 121 | $(MAKE) -C collectives/device clean 122 | rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} 123 | 124 | install : build 125 | mkdir -p $(PREFIX)/lib 126 | mkdir -p $(PREFIX)/lib/pkgconfig 127 | mkdir -p $(PREFIX)/include 128 | cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ 129 | cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ 130 | cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ 131 | 132 | FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') 133 | # Note that formatting.mk defines a new target so in order to not overwrite the default target, 134 | # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well 135 | # as the BUILDDIR variable. 136 | include ../makefiles/formatting.mk 137 | -------------------------------------------------------------------------------- /src/channel.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "channel.h" 8 | #include "param.h" 9 | #include "gdrwrap.h" 10 | 11 | // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory 12 | NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); 13 | 14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid) { 15 | struct ncclChannel* channel = comm->channels+channelid; 16 | if (channel->id != -1) return ncclSuccess; 17 | channel->id = channelid; 18 | 19 | // Ring index to user rank table. 20 | NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); 21 | NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); 22 | 23 | // Communication structures with peers. 24 | NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) 25 | NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); 26 | for (size_t i=0; inRanks+1; ++i) { 27 | for (int b=0; bpeers[i].send[b].comm = comm; 29 | channel->peers[i].recv[b].comm = comm; 30 | } 31 | } 32 | 33 | // Per-channel operation list. 
34 | NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); 35 | if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { 36 | // GDRCOPY support 37 | // We allocate a workFifo in GDR mapped CUDA memory 38 | // But we still allocate the Host workFifo so that we 39 | // can copy the work elements to CUDA memory on kernel launch 40 | NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); 41 | } else { 42 | // The device workFifo is the Host one 43 | channel->workFifoDev = channel->workFifo; 44 | } 45 | 46 | return ncclSuccess; 47 | } 48 | 49 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { 50 | if (channel->id == -1) return ncclSuccess; 51 | // Operation list 52 | NCCLCHECK(ncclCudaHostFree(channel->workFifo)); 53 | if (channel->gdrMemDesc) { 54 | // GDRCOPY support 55 | NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc)); 56 | } 57 | 58 | // Free Ring index to rank tables 59 | free(channel->ring.userRanks); 60 | CUDACHECK(cudaFree(channel->ring.devUserRanks)); 61 | 62 | // Free transport proxy resources 63 | // Note: free all send resources first due to CollNet arrangement 64 | for (int r=0; rpeers+r; 66 | for (int b=0; bsend[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); 68 | } 69 | } 70 | for (int r=0; rpeers+r; 72 | for (int b=0; brecv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); 74 | } 75 | } 76 | 77 | // Free the peer structures. 78 | CUDACHECK(cudaFree(channel->devPeers)); 79 | free(channel->peers); 80 | 81 | return ncclSuccess; 82 | } 83 | -------------------------------------------------------------------------------- /src/collectives/all_gather.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, 11 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, 13 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncAllGather, "AllGather", 16 | sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ 17 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/all_reduce.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | 9 | NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, 10 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 11 | ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, 12 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 13 | NVTX3_FUNC_RANGE_IN(nccl_domain); 14 | struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", 15 | sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ 16 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; 17 | info.algorithm = -1; 18 | return ncclEnqueueCheck(&info); 19 | } 20 | -------------------------------------------------------------------------------- /src/collectives/broadcast.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 11 | ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 13 | ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", 16 | sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ 17 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | /* Deprecated original "in place" function, similar to MPI */ 22 | NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, 23 | ncclComm_t comm, cudaStream_t stream); 24 | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, 25 | ncclComm_t comm, cudaStream_t stream) { 26 | return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/collectives/custom_collective.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclCustomCollective, const void* sendbuff, void* recvbuff, size_t count, 11 | ncclDataType_t datatype, ncclRedOp_t op, int mscclAlgorithmIndex, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclCustomCollective(const void* sendbuff, void* recvbuff, size_t count, 13 | ncclDataType_t datatype, ncclRedOp_t op, int mscclAlgorithmIndex, ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncCustomCollective, "CustomCollective", 16 | sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ 17 | MSCCL_CHUNKSTEPS, MSCCL_SLICESTEPS }; 18 | info.mscclInfo.mscclAlgoIndex = mscclAlgorithmIndex; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../../makefiles/common.mk 8 | include ../../../makefiles/version.mk 9 | 10 | BUILDDIR ?= $(abspath ../../../build) 11 | OBJDIR := $(BUILDDIR)/obj/collectives/device 12 | 13 | LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu all_to_all.cu custom_collective.cu 14 | 15 | LIBSRCFILES += functions.cu 16 | 17 | DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) 18 | DEPENDFILES:= $(DEPFILES:%.d=%.dep) 19 | STATICLIB := $(OBJDIR)/colldevice.a 20 | DEVOBJ := $(OBJDIR)/devlink.o 21 | RULESFILE := $(OBJDIR)/Makefile.rules 22 | 23 | NVCUFLAGS += -I. -I.. 
-I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" 24 | 25 | 26 | all: $(STATICLIB) 27 | 28 | # Dummy rule so that the extra dependency (%.dep) files are preserved by make 29 | all_deps: $(DEPENDFILES) 30 | 31 | # Auto-generating the rules per op/reduction/datatype/algorithm 32 | $(RULESFILE) : 33 | @printf "Generating %-35s > %s\n" rules $@ 34 | @mkdir -p $(OBJDIR) 35 | @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ 36 | 37 | -include $(RULESFILE) 38 | 39 | LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o 40 | 41 | CUSTOMLIBOBJ := $(OBJDIR)/stride_copy_lib.o 42 | CUSTOMDEVOBJ := $(OBJDIR)/stride_copy_dev.o 43 | 44 | -include $(DEPFILES) 45 | 46 | $(STATICLIB): $(LIBOBJ) $(DEVOBJ) $(CUSTOMLIBOBJ) $(CUSTOMDEVOBJ) 47 | @printf "Archiving %-35s > %s\n" objects $@ 48 | ar cr $@ $^ 49 | 50 | # We do not want make to build *.d when running make clean. 51 | # So we only provide targets for .dep which will produce .dep and .d, 52 | # with only .d being included, and .dep keeping track of what needs to 53 | # be regenerated. 54 | $(OBJDIR)/%.dep : %.cu 55 | @mkdir -p $(OBJDIR) 56 | @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp 57 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ 58 | @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ 59 | sed -e 's/^ *//' -e 's/$$/:/' >> $@ 60 | @rm -f $@.tmp 61 | @cp $@ $(@:.dep=.d) 62 | 63 | # Compiled kernels and collectives with relocatable device code ... 
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

$(OBJDIR)/%_lib.o : %.cu
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

$(OBJDIR)/%_dev.o : $(OBJDIR)/%_lib.o
	@printf "Compiling %-35s > %s\n" $< $@
	mkdir -p `dirname $@`
	$(NVCC) $(NVCUFLAGS) -dlink $< -o $@

# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@

# Fix: also remove $(CUSTOMLIBOBJ) and $(CUSTOMDEVOBJ) (the stride_copy
# objects).  They are built by this Makefile and archived into $(STATICLIB),
# but were previously left behind by "make clean".
clean:
	rm -f $(LIBOBJ) $(DEVOBJ) $(CUSTOMLIBOBJ) $(CUSTOMDEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
--------------------------------------------------------------------------------
/src/collectives/device/all_gather.cu:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "all_gather.h"
#include "common.h"
#include "collectives.h"

IMPL_COLL_C(AllGather);
--------------------------------------------------------------------------------
/src/collectives/device/all_gather.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | const int *ringRanks = ring->devUserRanks; 21 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); 22 | // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 23 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); 24 | const int nranks = ncclShmem.comm.nRanks; 25 | const ssize_t loopSize = nChannels*int(chunkSize); 26 | const ssize_t size = args->count; 27 | 28 | T *inputBuf = (T*)args->sendbuff; 29 | T *outputBuf = (T*)args->recvbuff; 30 | Primitives, 1, Proto, 0> prims 31 | (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); 32 | 33 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 34 | ssize_t realChunkSize; 35 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 36 | realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels)); 37 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 38 | } 39 | else if (Proto::Id == NCCL_PROTO_LL) 40 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 41 | else if (Proto::Id == NCCL_PROTO_LL128) 42 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); 43 | realChunkSize = int(realChunkSize); 44 | 45 | ssize_t chunkOffset = gridOffset + int(bid*realChunkSize); 46 | 47 | /////////////// begin AllGather steps /////////////// 48 | ssize_t offset; 49 | int nelem = min(realChunkSize, size-chunkOffset); 50 | int rankDest; 51 | 52 | // step 0: push data to next GPU 53 | rankDest = ringRanks[0]; 54 | offset = chunkOffset + rankDest * size; 55 | 56 | if (inputBuf + chunkOffset == outputBuf + offset) { // In place 57 | prims.directSend(chunkOffset, offset, nelem); 58 | } else { 59 | prims.directCopySend(chunkOffset, offset, offset, nelem); 60 | } 61 | 62 | // k-2 steps: copy to next GPU 63 | for (int j=1; j 81 | struct RunWorkElement { 82 | __device__ __forceinline__ void run(ncclWorkElem *args) { 83 | using Proto = ProtoSimple; 84 | runRing(args); 85 | } 86 | }; 87 | 88 | template 89 | struct RunWorkElement { 90 | __device__ __forceinline__ void run(ncclWorkElem *args) { 91 | runRing(args); 92 | } 93 | }; 94 | 95 | template 96 | struct RunWorkElement { 97 | __device__ __forceinline__ void run(ncclWorkElem *args) { 98 | runRing(args); 99 | } 100 | }; 101 | 102 | template 103 | struct RunWorkElement { 104 | __device__ __forceinline__ void run(ncclWorkElem *args) { 105 | using Proto = ProtoSimple; 106 | runInterpreter(args, ncclShmem.comm.nRanks); 107 | } 108 | }; 109 | 110 | template 111 | struct RunWorkElement { 112 | __device__ __forceinline__ void run(ncclWorkElem *args) { 113 | runInterpreter(args, ncclShmem.comm.nRanks); 114 | } 115 | }; 116 | 117 | template 118 | struct RunWorkElement { 119 | __device__ __forceinline__ void run(ncclWorkElem *args) { 120 | runInterpreter(args, ncclShmem.comm.nRanks); 121 | } 122 | }; -------------------------------------------------------------------------------- /src/collectives/device/all_reduce.cu: 
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "all_reduce.h"
#include "common.h"
#include "collectives.h"

// Instantiate the AllReduce device kernels; the concrete op/datatype pair is
// presumably selected per translation unit via -DNCCL_OP/-DNCCL_TYPE (see
// gen_rules.sh) -- confirm against the IMPL_COLL_R definition in common.h.
IMPL_COLL_R(AllReduce);
--------------------------------------------------------------------------------
/src/collectives/device/all_to_all.cu:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "all_to_all.h"
#include "common.h"
#include "collectives.h"

// AllToAll performs no reduction, hence the IMPL_COLL_C (copy) instantiation
// used here rather than IMPL_COLL_R.
IMPL_COLL_C(AllToAll);
--------------------------------------------------------------------------------
/src/collectives/device/all_to_all.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "primitives.h" 9 | #include "collectives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | template 13 | struct RunWorkElement { 14 | __device__ __forceinline__ void run(ncclWorkElem *args) { 15 | using Proto = ProtoSimple; 16 | runInterpreter(args, ncclShmem.comm.nRanks); 17 | } 18 | }; 19 | 20 | template 21 | struct RunWorkElement { 22 | __device__ __forceinline__ void run(ncclWorkElem *args) { 23 | runInterpreter(args, ncclShmem.comm.nRanks); 24 | } 25 | }; 26 | 27 | template 28 | struct RunWorkElement { 29 | __device__ __forceinline__ void run(ncclWorkElem *args) { 30 | runInterpreter(args, ncclShmem.comm.nRanks); 31 | } 32 | }; 33 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "broadcast.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_C(Broadcast); 12 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); 21 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); 22 | const ssize_t loopSize = nChannels*chunkSize; 23 | const ssize_t size = args->count; 24 | const int rank = ring->devUserRanks[0]; 25 | const int nextRank = ring->devUserRanks[1]; 26 | const int root = args->root; 27 | 28 | T *inputBuf = (T*)args->sendbuff; 29 | T *outputBuf = (T*)args->recvbuff; 30 | Primitives, 0, Proto, 0> 31 | prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); 32 | 33 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 34 | ssize_t realChunkSize; 35 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 36 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 37 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 38 | } 39 | else if (Proto::Id == NCCL_PROTO_LL) 40 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 41 | else if (Proto::Id == NCCL_PROTO_LL128) 42 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); 43 | realChunkSize = int(realChunkSize); 44 | 45 | ssize_t offset = gridOffset + int(bid*realChunkSize); 46 | int nelem = min(realChunkSize, size-offset); 47 | 48 | if (rank == root) { 49 | if (inputBuf == outputBuf) { 50 | prims.send(offset, nelem); 51 | } else { 52 | prims.copySend(offset, offset, nelem); 53 | } 54 | } else if (nextRank == root) { 55 | prims.recv(offset, nelem); 56 | } else { 57 | prims.recvCopySend(offset, nelem); 58 | } 59 | } 60 | } 61 | } 62 | 63 | template 64 | struct RunWorkElement { 65 | __device__ __forceinline__ void run(ncclWorkElem *args) { 66 | using Proto = ProtoSimple; 67 | runRing(args); 68 | } 69 | }; 70 | 71 | template 72 | struct RunWorkElement { 73 | __device__ __forceinline__ void run(ncclWorkElem *args) { 74 | runRing(args); 75 | } 76 | }; 77 | 78 | template 79 | struct RunWorkElement { 80 | __device__ __forceinline__ void run(ncclWorkElem *args) { 81 | runRing(args); 82 | } 83 | }; 84 | 85 | template 86 | struct RunWorkElement { 87 | __device__ __forceinline__ void run(ncclWorkElem *args) { 88 | using Proto = ProtoSimple; 89 | runInterpreter(args, 1); 90 | } 91 | }; 92 | 93 | template 94 | struct RunWorkElement { 95 | __device__ __forceinline__ void run(ncclWorkElem *args) { 96 | runInterpreter(args, 1); 97 | } 98 | }; 99 | 100 | template 101 | struct RunWorkElement { 102 | __device__ __forceinline__ void run(ncclWorkElem *args) { 103 | runInterpreter(args, 1); 104 | } 105 | }; -------------------------------------------------------------------------------- /src/collectives/device/custom_collective.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "custom_collective.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(CustomCollective); 12 | -------------------------------------------------------------------------------- /src/collectives/device/custom_collective.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "collectives.h" 8 | #include "primitives.h" 9 | #include "msccl_interpreter.h" 10 | 11 | template 12 | struct RunWorkElement { 13 | __device__ __forceinline__ void run(ncclWorkElem *args) { 14 | using Proto = ProtoSimple; 15 | runInterpreter(args, 1); 16 | } 17 | }; 18 | 19 | template 20 | struct RunWorkElement { 21 | __device__ __forceinline__ void run(ncclWorkElem *args) { 22 | runInterpreter(args, 1); 23 | } 24 | }; 25 | 26 | template 27 | struct RunWorkElement { 28 | __device__ __forceinline__ void run(ncclWorkElem *args) { 29 | runInterpreter(args, 1); 30 | } 31 | }; -------------------------------------------------------------------------------- /src/collectives/device/functions.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "common.h" 10 | 11 | __shared__ ncclShmemData ncclShmem; 12 | 13 | #define NCCL_FUNC5(func, algo, devredop, type, nullify) \ 14 | MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ 15 | MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ 16 | MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) 17 | 18 | #define NCCL_FUNC4(func, devredop, type, nullify) \ 19 | NCCL_FUNC5(func, TREE, devredop, type, nullify), \ 20 | NCCL_FUNC5(func, RING, devredop, type, nullify), \ 21 | NCCL_FUNC5(func, MSCCL, devredop, type, nullify), \ 22 | NCCL_FUNC5(func, COLLNET, devredop, type, nullify) 23 | 24 | #if defined(__CUDA_BF16_TYPES_EXIST__) 25 | // Must be consistent with ncclDataType_t 26 | #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ 27 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 28 | NCCL_FUNC4(func, devredop, uint8_t, 0), \ 29 | NCCL_FUNC4(func, devredop, int32_t, 0), \ 30 | NCCL_FUNC4(func, devredop, uint32_t, 0), \ 31 | NCCL_FUNC4(func, devredop, int64_t, 0), \ 32 | NCCL_FUNC4(func, devredop, uint64_t, 0), \ 33 | NCCL_FUNC4(func, devredop, half, nullForFloat), \ 34 | NCCL_FUNC4(func, devredop, float, nullForFloat), \ 35 | NCCL_FUNC4(func, devredop, double, nullForFloat), \ 36 | NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat) 37 | #define NCCL_FUNCS3B(func, devredop) \ 38 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 39 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 40 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 41 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 42 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 43 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 44 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 45 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 46 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 47 | 
NCCL_FUNC4(func, devredop, int8_t, 0) 48 | #else 49 | // Must be consistent with ncclDataType_t 50 | #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ 51 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 52 | NCCL_FUNC4(func, devredop, uint8_t, 0), \ 53 | NCCL_FUNC4(func, devredop, int32_t, 0), \ 54 | NCCL_FUNC4(func, devredop, uint32_t, 0), \ 55 | NCCL_FUNC4(func, devredop, int64_t, 0), \ 56 | NCCL_FUNC4(func, devredop, uint64_t, 0), \ 57 | NCCL_FUNC4(func, devredop, half, nullForFloat), \ 58 | NCCL_FUNC4(func, devredop, float, nullForFloat), \ 59 | NCCL_FUNC4(func, devredop, double, nullForFloat) 60 | #define NCCL_FUNCS3B(func, devredop) \ 61 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 62 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 63 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 64 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 65 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 66 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 67 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 68 | NCCL_FUNC4(func, devredop, int8_t, 0), \ 69 | NCCL_FUNC4(func, devredop, int8_t, 0) 70 | #endif 71 | 72 | // Must be consistent with ncclRedOp_t 73 | #define NCCL_FUNCS2A(func) \ 74 | NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \ 75 | NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \ 76 | NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \ 77 | NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \ 78 | NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \ 79 | NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1) 80 | 81 | #define NCCL_FUNCS2B(func) \ 82 | NCCL_FUNCS3B(func, Sum), \ 83 | NCCL_FUNCS3B(func, Sum), \ 84 | NCCL_FUNCS3B(func, Sum), \ 85 | NCCL_FUNCS3B(func, Sum), \ 86 | NCCL_FUNCS3B(func, Sum), \ 87 | NCCL_FUNCS3B(func, Sum) 88 | 89 | // Must be consistent with the ncclFuncSet enum 90 | __device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { 91 | // Don't try to initialize the host shadow copy of this device-side global 92 | // 
variable. There is no host pointer to a device-side function, which 93 | // confuses clang. This will be fixed in the next clang release. 94 | #if __CUDA_ARCH__ 95 | NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), 96 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t), 97 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t), 98 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t), 99 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t), 100 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t), 101 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t), 102 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, half), 103 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, float), 104 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, double), 105 | #if defined(__CUDA_BF16_TYPES_EXIST__) 106 | NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16), 107 | #endif 108 | NCCL_FUNCS2B(Broadcast), 109 | NCCL_FUNCS2A(Reduce), 110 | NCCL_FUNCS2B(AllGather), 111 | NCCL_FUNCS2A(ReduceScatter), 112 | NCCL_FUNCS2A(AllReduce), 113 | NCCL_FUNCS2B(AllToAll), 114 | NCCL_FUNCS2A(CustomCollective) 115 | #endif 116 | }; 117 | 118 | // Workaround for https://reviews.llvm.org/D55580 119 | __device__ void ncclWorkaroundClangD55580() {} 120 | -------------------------------------------------------------------------------- /src/collectives/device/gen_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | dir=$1 9 | 10 | datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64" 11 | if [ "$CUDA_MAJOR" -ge 11 ] 12 | then 13 | datatypes+=" bf16" 14 | fi 15 | 16 | targets="GENOBJS := \\\\\n" 17 | 18 | for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter all_to_all custom_collective; do 19 | opn=0 20 | for op in sum prod min max premulsum sumpostdiv; do 21 | dtn=0 22 | # Order must match that of the ncclDataType_t enum 23 | for dt in ${datatypes}; do 24 | echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" 25 | echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" 26 | echo " mkdir -p ${dir}" 27 | echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" 28 | echo "" 29 | targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" 30 | dtn=$(($dtn + 1)) 31 | done 32 | opn=$(($opn + 1)) 33 | done 34 | done 35 | echo -e "$targets" 36 | -------------------------------------------------------------------------------- /src/collectives/device/onerank_reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "reduce_kernel.h" 10 | #include "common.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void oneRankReduce() { 15 | ncclWork *w = &ncclShmem.work; 16 | int tid = threadIdx.x; 17 | int tn = blockDim.x; 18 | #pragma unroll 1 19 | for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) { 20 | ncclWorkElem *we = &w->elems[e]; 21 | intptr_t eltN = we->count; 22 | int bid = we->bid; 23 | int bn = we->nChannels; 24 | T const *src = (T const*)we->sendbuff; 25 | T *dst = (T*)we->recvbuff; 26 | 27 | // each block/channel gets a roughly equal segment of 16 byte packs 28 | constexpr int EltPerPack = 16/sizeof(T); 29 | intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; 30 | intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); 31 | intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); 32 | i0 *= EltPerPack; 33 | i0 = i0 < eltN ? i0 : eltN; 34 | i1 *= EltPerPack; 35 | i1 = i1 < eltN ? 
i1 : eltN; 36 | src += i0; 37 | dst += i0; 38 | ReduceOrCopyMulti 39 | (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0); 40 | } 41 | } 42 | } 43 | 44 | #define INSTANTIATE(devredop, type) \ 45 | __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ 46 | oneRankReduce>(); \ 47 | } 48 | 49 | INSTANTIATE(PreMulSum, int8_t) 50 | INSTANTIATE(PreMulSum, uint8_t) 51 | INSTANTIATE(PreMulSum, int32_t) 52 | INSTANTIATE(PreMulSum, uint32_t) 53 | INSTANTIATE(PreMulSum, int64_t) 54 | INSTANTIATE(PreMulSum, uint64_t) 55 | INSTANTIATE(PreMulSum, half) 56 | #if defined(__CUDA_BF16_TYPES_EXIST__) 57 | INSTANTIATE(PreMulSum, __nv_bfloat16) 58 | #endif 59 | INSTANTIATE(PreMulSum, float) 60 | INSTANTIATE(PreMulSum, double) 61 | -------------------------------------------------------------------------------- /src/collectives/device/op128.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef OP128_H_ 8 | #define OP128_H_ 9 | 10 | inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { 11 | asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" 12 | : "=l"(v0), "=l"(v1) : "l"(ptr)); 13 | } 14 | 15 | inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { 16 | asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" 17 | :: "l"(v0), "l"(v1), "l"(ptr)); 18 | } 19 | 20 | inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { 21 | uint64_t* shmemAsmPtr; 22 | asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); 23 | return shmemAsmPtr; 24 | } 25 | 26 | inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { 27 | asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" 28 | : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); 29 | } 30 | 31 | inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { 32 | asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" 33 | :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); 34 | } 35 | 36 | template 37 | inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) { 38 | union { 39 | uint32_t tmp4[4]; 40 | uint64_t tmp8[2]; 41 | }; 42 | if(sizeof(T) < 4) { 43 | uint32_t *ptr4 = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); 44 | #pragma unroll 45 | for(int e=0; e < 4; e++) { 46 | // Produce 4 bytes of sub-register type by reading 2 4-byte 47 | // aligned values and shifting. 
48 | uint32_t lo, hi; 49 | asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0)); 50 | asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1)); 51 | tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast(ptr))%4)); 52 | } 53 | } 54 | else if(sizeof(T) == 4) { 55 | #pragma unroll 56 | for(int e=0; e < 4; e++) 57 | asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e)); 58 | } 59 | else /*sizeof(T)==8*/ { 60 | #pragma unroll 61 | for(int e=0; e < 2; e++) 62 | asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e)); 63 | } 64 | v0 = tmp8[0]; 65 | v1 = tmp8[1]; 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(Reduce); 12 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1)); 21 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); 22 | const int nranks = ncclShmem.comm.nRanks; 23 | const ssize_t loopSize = nChannels*chunkSize; 24 | const ssize_t size = args->count; 25 | const int rank = ncclShmem.comm.rank; 26 | const int prevRank = ring->devUserRanks[nranks-1]; 27 | const int root = args->root; 28 | 29 | Primitives, 0, Proto, 0> 30 | prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); 31 | 32 | auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { 33 | int realChunkSize; 34 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 35 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 36 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 37 | } 38 | else if (Proto::Id == NCCL_PROTO_LL) 39 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 40 | else if (Proto::Id == NCCL_PROTO_LL128) 41 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); 42 | return realChunkSize; 43 | }; 44 | 45 | if (prevRank == root) { 46 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 47 | int realChunkSize = calcChunkSize(gridOffset); 48 | ssize_t offset = gridOffset + bid*realChunkSize; 49 | int nelem = min(realChunkSize, size-offset); 50 | prims.send(offset, nelem); 51 | } 52 | } 53 | else if (rank == root) { 54 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 55 | int realChunkSize = calcChunkSize(gridOffset); 56 | ssize_t offset = gridOffset + bid*realChunkSize; 57 | int nelem = min(realChunkSize, size-offset); 58 | prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); 59 | } 60 | } 61 | else { 62 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 63 | int realChunkSize = calcChunkSize(gridOffset); 64 | ssize_t offset = gridOffset + bid*realChunkSize; 65 | int nelem = min(realChunkSize, size-offset); 66 | prims.recvReduceSend(offset, nelem); 67 | } 68 | } 69 | } 70 | } 71 | 72 | template 73 | struct RunWorkElement { 74 | __device__ __forceinline__ void run(ncclWorkElem *args) { 75 | using Proto = ProtoSimple; 76 | runRing(args); 77 | } 78 | }; 79 | 80 | template 81 | struct RunWorkElement { 82 | __device__ __forceinline__ void run(ncclWorkElem *args) { 83 | runRing(args); 84 | } 85 | }; 86 | 87 | template 88 | struct RunWorkElement { 89 | __device__ __forceinline__ void run(ncclWorkElem *args) { 90 | runRing(args); 91 | } 92 | }; 93 | 94 | template 95 | struct RunWorkElement { 96 | __device__ __forceinline__ void run(ncclWorkElem *args) { 97 | using Proto = ProtoSimple; 98 | runInterpreter(args, 1); 99 | } 100 | }; 101 | 102 | template 103 | struct RunWorkElement { 104 | __device__ __forceinline__ void run(ncclWorkElem *args) { 105 | runInterpreter(args, 
1); 106 | } 107 | }; 108 | 109 | template 110 | struct RunWorkElement { 111 | __device__ __forceinline__ void run(ncclWorkElem *args) { 112 | runInterpreter(args, 1); 113 | } 114 | }; -------------------------------------------------------------------------------- /src/collectives/device/reduce_scatter.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce_scatter.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(ReduceScatter); 12 | -------------------------------------------------------------------------------- /src/collectives/device/reduce_scatter.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "msccl_interpreter.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 15 | const int tid = threadIdx.x; 16 | const int nthreads = args->header.nWarps*WARP_SIZE; 17 | const int bid = args->bid; 18 | const int nChannels = args->nChannels; 19 | ncclRing *ring = &ncclShmem.channel.ring; 20 | int const *ringRanks = ring->devUserRanks; 21 | const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1)); 22 | // We should not need the final /2 but it makes performance much, much smoother. 
Might be a bug somewhere. 23 | const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); 24 | const int nranks = ncclShmem.comm.nRanks; 25 | const ssize_t loopSize = nChannels*chunkSize; 26 | const ssize_t size = args->count; 27 | 28 | Primitives, 0, Proto, 0> 29 | prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); 30 | 31 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 32 | ssize_t realChunkSize; 33 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 34 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 35 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 36 | } 37 | else if (Proto::Id == NCCL_PROTO_LL) 38 | realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; 39 | else if (Proto::Id == NCCL_PROTO_LL128) 40 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); 41 | realChunkSize = int(realChunkSize); 42 | 43 | ssize_t chunkOffset = gridOffset + bid*int(realChunkSize); 44 | 45 | /////////////// begin ReduceScatter steps /////////////// 46 | ssize_t offset; 47 | int nelem = min(realChunkSize, size-chunkOffset); 48 | int rankDest; 49 | 50 | // step 0: push data to next GPU 51 | rankDest = ringRanks[nranks-1]; 52 | offset = chunkOffset + rankDest * size; 53 | prims.send(offset, nelem); 54 | 55 | // k-2 steps: reduce and copy to next GPU 56 | for (int j=2; j 71 | struct RunWorkElement { 72 | __device__ __forceinline__ void run(ncclWorkElem *args) { 73 | using Proto = ProtoSimple; 74 | runRing(args); 75 | } 76 | }; 77 | 78 | template 79 | struct RunWorkElement { 80 | __device__ __forceinline__ void run(ncclWorkElem *args) { 81 | runRing(args); 82 | } 83 | }; 84 | 85 | template 86 | struct RunWorkElement { 87 | __device__ __forceinline__ void run(ncclWorkElem *args) { 88 | runRing(args); 89 | } 90 | }; 91 | 92 | template 93 | struct RunWorkElement 
{ 94 | __device__ __forceinline__ void run(ncclWorkElem *args) { 95 | using Proto = ProtoSimple; 96 | runInterpreter(args, ncclShmem.comm.nRanks); 97 | } 98 | }; 99 | 100 | template 101 | struct RunWorkElement { 102 | __device__ __forceinline__ void run(ncclWorkElem *args) { 103 | runInterpreter(args, ncclShmem.comm.nRanks); 104 | } 105 | }; 106 | 107 | template 108 | struct RunWorkElement { 109 | __device__ __forceinline__ void run(ncclWorkElem *args) { 110 | runInterpreter(args, ncclShmem.comm.nRanks); 111 | } 112 | }; -------------------------------------------------------------------------------- /src/collectives/device/sendrecv.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "sendrecv.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_P(SendRecv); 12 | -------------------------------------------------------------------------------- /src/collectives/device/sendrecv.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | #include "npkit/npkit.h" 11 | 12 | template 13 | struct RunWork { 14 | __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 15 | NPKIT_GPU_SYNC_TIME_SEND(blockIdx.x, tid); 16 | 17 | if (args->peer == ncclShmem.comm.rank) { 18 | struct ncclWorkElemP2p* recvArgs = args-1; 19 | if (args->buff != recvArgs->buff) { 20 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_SEND_ENTRY, args->count*sizeof(T)); 21 | 22 | ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count); 23 | 24 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_SEND_EXIT, args->count*sizeof(T)); 25 | } 26 | } else { 27 | using Proto = ProtoSimple<1, 1>; 28 | ssize_t const count = args->count; 29 | int const chunkSize = args->chunkSize/sizeof(T); 30 | int const peer = args->peer; 31 | Primitives, 1, Proto, 1> prims 32 | (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group); 33 | 34 | NPKIT_GPU_SET_CTX_ID(prims); 35 | 36 | ssize_t offset = 0; 37 | do { 38 | int nelem = min(chunkSize, count-offset); 39 | prims.directSend(offset, offset, nelem); 40 | offset += nelem; 41 | } while(offset < count); 42 | } 43 | } 44 | 45 | __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 46 | NPKIT_GPU_SYNC_TIME_RECV(blockIdx.x, tid); 47 | 48 | if (args->peer != ncclShmem.comm.rank) { 49 | using Proto = ProtoSimple<1, 1>; 50 | ssize_t const count = args->count; 51 | int const chunkSize = args->chunkSize/sizeof(T); 52 | int const peer = args->peer; 53 | Primitives, 1, Proto, 1> prims 54 | (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group); 55 | 56 | NPKIT_GPU_SET_CTX_ID(prims); 
57 | 58 | ssize_t offset = 0; 59 | do { 60 | int nelem = min(chunkSize, count-offset); 61 | 62 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_RECV_ENTRY, nelem*sizeof(T)); 63 | 64 | prims.directRecv(offset, nelem); 65 | 66 | NPKIT_GPU_ENTER_EVENT(NPKIT_EVENT_RECV_EXIT, nelem*sizeof(T)); 67 | 68 | offset += nelem; 69 | } while(offset < count); 70 | } 71 | } 72 | 73 | __device__ __forceinline__ void run(ncclWork *work) { 74 | struct ncclWorkElemP2p* args = work->p2pElems; 75 | int ngroups = args->ngroups; 76 | int tid = threadIdx.x; 77 | int wid = tid / WARP_SIZE; 78 | // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 79 | // warps for send, 2 warps for recv). 80 | // warpStarts were rounded thanks to int division, but for group number we need to round the other way around 81 | // So we mirror wid then mirror again the group. 82 | #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) 83 | int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; 84 | args += group; 85 | if (args->header.type == ncclWorkTypeUnused) return; 86 | 87 | tid -= args->warpStart * WARP_SIZE; 88 | int nthreads = args->nWarps * WARP_SIZE; 89 | group |= 1<<16; // Used to select connIndex 1 90 | if (tid >= nthreads || args->peer == -1) return; 91 | if ((group%2) == 0) { 92 | runRecv(tid, nthreads, group, args); 93 | } else { 94 | runSend(tid, nthreads, group, args); 95 | } 96 | } 97 | }; 98 | -------------------------------------------------------------------------------- /src/collectives/device/stride_copy.cu: -------------------------------------------------------------------------------- 1 | #include "devcomm.h" 2 | 3 | static int strideMemcpyGridsize = 0, strideMemcpyBlocksize = 0; 4 | 5 | // memory stride copy kernel 6 | template 7 | __global__ void strideMemcpyKernel(T *__restrict__ out, const T *__restrict__ in, const size_t size, const int height, const int width) { 8 | const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 9 | for (size_t i = tid; i < size * height * 
width; i += gridDim.x * blockDim.x) { 10 | const size_t index = i / size, offset = i % size; 11 | const size_t j = (width * (index % height) + (index / height)) * size + offset; 12 | out[j] = in[i]; 13 | } 14 | } 15 | 16 | cudaError_t strideMemcpyAsync(void *dst, const void *src, const size_t size, const int height, const int width, cudaStream_t stream) { 17 | if (strideMemcpyGridsize == 0 || strideMemcpyBlocksize == 0) 18 | cudaOccupancyMaxPotentialBlockSize(&strideMemcpyGridsize, &strideMemcpyBlocksize, strideMemcpyKernel); 19 | 20 | if (size < sizeof(uint4)) 21 | strideMemcpyKernel<<>>((char*)dst, (char*)src, size, height, width); 22 | else 23 | strideMemcpyKernel<<>>((uint4*)dst, (uint4*)src, size/sizeof(uint4), height, width); 24 | return cudaSuccess; 25 | } 26 | -------------------------------------------------------------------------------- /src/collectives/reduce.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, 11 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, 13 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncReduce, "Reduce", 16 | sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ 17 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/reduce_scatter.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, 11 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 12 | ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, 13 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 14 | NVTX3_FUNC_RANGE_IN(nccl_domain); 15 | struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", 16 | sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ 17 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; 18 | info.algorithm = -1; 19 | return ncclEnqueueCheck(&info); 20 | } 21 | -------------------------------------------------------------------------------- /src/collectives/sendrecv.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "argcheck.h" // Need some checks here since we access comm 10 | 11 | NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 12 | ncclComm_t comm, cudaStream_t stream); 13 | ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 14 | ncclComm_t comm, cudaStream_t stream) { 15 | NVTX3_FUNC_RANGE_IN(nccl_domain); 16 | struct ncclInfo info = { ncclFuncSend, "Send", 17 | NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 18 | 1, 1 }; 19 | info.algorithm = -1; 20 | ncclResult_t ret; 21 | NCCLCHECK(ncclGroupStart()); 22 | ret = ncclEnqueueCheck(&info); 23 | NCCLCHECK(ncclGroupEnd()); 24 | return ret; 25 | } 26 | 27 | NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 28 | ncclComm_t comm, cudaStream_t stream); 29 | ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 30 | ncclComm_t comm, cudaStream_t stream) { 31 | NVTX3_FUNC_RANGE_IN(nccl_domain); 32 | struct ncclInfo info = { ncclFuncRecv, "Recv", 33 | NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 34 | 1, 1 }; 35 | info.algorithm = -1; 36 | ncclResult_t ret; 37 | NCCLCHECK(ncclGroupStart()); 38 | ret = ncclEnqueueCheck(&info); 39 | NCCLCHECK(ncclGroupEnd()); 40 | return ret; 41 | } 42 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ 8 | 9 | enum cudaError_t { cudaErrorStubLibrary = 34 }; 10 | 11 | extern "C" { 12 | 13 | cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); 14 | cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } 15 | 16 | cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); 17 | cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } 18 | 19 | cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); 20 | cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } 21 | 22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); 23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } 24 | 25 | cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); 26 | cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | #define MAXWIDTH 20 10 | #define PREFIXLEN 15 11 | #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) 12 | void dumpLine(int* values, int nranks, const char* prefix) { 13 | int prefixlen = strlen(prefix); 14 | char line[STRLENGTH+1]; 15 | line[STRLENGTH] = '\0'; 16 | memset(line, ' ', STRLENGTH); 17 | strncpy(line, prefix, PREFIXLEN); 18 | for (int i=0; i root ? rank-1 : rank) 10 | 11 | /* Btree which alternates leaves and nodes. 12 | * Assumes root is 0, which conveniently builds a tree on powers of two, 13 | * (because we have pow2-1 ranks) which lets us manipulate bits. 14 | * Find first non-zero bit, then : 15 | * Find the parent : 16 | * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) 17 | * xx11[0] -> xx10[0] (3,7,11 below) 18 | * Find the children : 19 | * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) 20 | * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) 21 | * 22 | * Illustration : 23 | * 0---------------8 24 | * ______/ \______ 25 | * 4 12 26 | * / \ / \ 27 | * 2 6 10 \ 28 | * / \ / \ / \ \ 29 | * 1 3 5 7 9 11 13 30 | */ 31 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { 32 | int up, down0, down1; 33 | int bit; 34 | for (bit=1; bit 0 so it has to be our child 1, not 0. 42 | *d1 = nranks > 1 ? bit >> 1 : -1; 43 | return ncclSuccess; 44 | } 45 | 46 | up = (rank ^ bit) | (bit << 1); 47 | // if smaller than the parent, we are his first child, otherwise we're his second 48 | if (up >= nranks) up = (rank ^ bit); 49 | *parentChildType = (rank < up) ? 0 : 1; 50 | *u = up; 51 | 52 | int lowbit = bit >> 1; 53 | // down0 is always within bounds 54 | down0 = lowbit == 0 ? -1 : rank-lowbit; 55 | 56 | down1 = lowbit == 0 ? 
-1 : rank+lowbit; 57 | // Make sure down1 is within bounds 58 | while (down1 >= nranks) { 59 | down1 = lowbit == 0 ? -1 : rank+lowbit; 60 | lowbit >>= 1; 61 | } 62 | *d0 = down0; *d1 = down1; 63 | 64 | return ncclSuccess; 65 | } 66 | 67 | /* Build a double binary tree. Take the previous tree for the first tree. 68 | * For the second tree, we use a mirror tree (if nranks is even) 69 | * 70 | * 0---------------8 3----------------11 71 | * ______/ \ / \______ 72 | * 4 \ / 7 73 | * / \ \ / / \ 74 | * 2 6 10 1 5 9 75 | * / \ / \ / \ / \ / \ / \ 76 | * 1 3 5 7 9 11 0 2 4 6 8 10 77 | * 78 | * or shift it by one rank (if nranks is odd). 79 | * 80 | * 0---------------8 1---------------9 81 | * ______/ \______ ______/ \______ 82 | * 4 12 5 0 83 | * / \ / / \ / 84 | * 2 6 10 3 7 11 85 | * / \ / \ / \ / \ / \ / \ 86 | * 1 3 5 7 9 11 2 4 6 8 10 12 87 | */ 88 | ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) { 89 | // First tree ... use a btree 90 | ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0); 91 | // Second tree ... mirror or shift 92 | if (nranks % 2 == 1) { 93 | // shift 94 | int shiftrank = (rank-1+nranks) % nranks; 95 | int u, d0, d1; 96 | ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1); 97 | *s1 = u == -1 ? -1 : (u+1) % nranks; 98 | *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; 99 | *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; 100 | } else { 101 | // mirror 102 | int u, d0, d1; 103 | ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1); 104 | *s1 = u == -1 ? -1 : nranks-1-u; 105 | *d1_0 = d0 == -1 ? -1 : nranks-1-d0; 106 | *d1_1 = d1 == -1 ? 
-1 : nranks-1-d1; 107 | } 108 | return ncclSuccess; 109 | } 110 | -------------------------------------------------------------------------------- /src/include/align.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALIGN_H_ 8 | #define NCCL_ALIGN_H_ 9 | 10 | #define DIVUP(x, y) \ 11 | (((x)+(y)-1)/(y)) 12 | 13 | #define ROUNDUP(x, y) \ 14 | (DIVUP((x), (y))*(y)) 15 | 16 | #define ALIGN_SIZE(size, align) \ 17 | size = ((size + (align) - 1) / (align)) * (align); 18 | 19 | #if !__CUDA_ARCH__ 20 | #ifndef __host__ 21 | #define __host__ 22 | #endif 23 | #ifndef __device__ 24 | #define __device__ 25 | #endif 26 | #endif 27 | 28 | template 29 | __host__ __device__ constexpr Z divUp(X x, Y y) { 30 | return (x+y-1)/y; 31 | } 32 | 33 | template 34 | __host__ __device__ constexpr Z roundUp(X x, Y y) { 35 | return (x+y-1) - (x+y-1)%y; 36 | } 37 | 38 | // assumes second argument is a power of 2 39 | template 40 | __host__ __device__ constexpr Z alignUp(X x, int a) { 41 | return (x+a-1) & Z(-a); 42 | } 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/include/alloc.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALLOC_H_ 8 | #define NCCL_ALLOC_H_ 9 | 10 | #include "nccl.h" 11 | #include "checks.h" 12 | #include "align.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | template 19 | static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { 20 | CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped)); 21 | memset(*ptr, 0, nelem*sizeof(T)); 22 | INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); 23 | return ncclSuccess; 24 | } 25 | #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) 26 | 27 | static inline ncclResult_t ncclCudaHostFree(void* ptr) { 28 | CUDACHECK(cudaFreeHost(ptr)); 29 | return ncclSuccess; 30 | } 31 | 32 | template 33 | static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { 34 | void* p = malloc(nelem*sizeof(T)); 35 | if (p == NULL) { 36 | WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); 37 | return ncclSystemError; 38 | } 39 | //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); 40 | memset(p, 0, nelem*sizeof(T)); 41 | *ptr = (T*)p; 42 | return ncclSuccess; 43 | } 44 | #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) 45 | 46 | template 47 | static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { 48 | if (nelem < oldNelem) return ncclInternalError; 49 | if (nelem == oldNelem) return ncclSuccess; 50 | 51 | T* oldp = *ptr; 52 | T* p = (T*)malloc(nelem*sizeof(T)); 53 | if (p == NULL) { 54 | WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); 55 | return ncclSystemError; 56 | } 57 | memcpy(p, oldp, oldNelem*sizeof(T)); 58 | free(oldp); 59 | memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); 60 | *ptr = (T*)p; 61 | INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); 62 | return ncclSuccess; 63 | } 64 | 65 | template 66 | static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { 67 | // Need async stream for P2P pre-connect + CUDA Graph 68 | cudaStream_t stream; 69 | CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 70 | CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); 71 | CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); 72 | CUDACHECK(cudaStreamSynchronize(stream)); 73 | CUDACHECK(cudaStreamDestroy(stream)); 74 | INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); 75 | return ncclSuccess; 76 | } 77 | #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) 78 | 79 | template 80 | static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { 81 | CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); 82 | return ncclSuccess; 83 | } 84 | 85 | // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be 86 | // allocated on separate pages as those pages will be marked DONTFORK 87 | // and if they are shared, that could cause a crash in a child process 88 | static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { 89 | size_t page_size = sysconf(_SC_PAGESIZE); 90 | void* p; 91 | int size_aligned = ROUNDUP(size, page_size); 92 | int ret = posix_memalign(&p, page_size, size_aligned); 93 | if (ret != 0) return ncclSystemError; 94 | memset(p, 0, size); 95 | *ptr = p; 96 | INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); 97 | return ncclSuccess; 98 | } 99 | #define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /src/include/argcheck.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ARGCHECK_H_ 8 | #define NCCL_ARGCHECK_H_ 9 | 10 | #include "core.h" 11 | #include "info.h" 12 | 13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); 14 | ncclResult_t ArgsCheck(struct ncclInfo* info); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/include/bootstrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_BOOTSTRAP_H_ 8 | #define NCCL_BOOTSTRAP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | ncclResult_t bootstrapNetInit(); 14 | ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); 15 | ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); 16 | ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm); 17 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); 18 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); 19 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); 20 | ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); 21 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); 22 | ncclResult_t bootstrapClose(void* commState); 23 | ncclResult_t bootstrapAbort(void* commState); 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/channel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHANNEL_H_ 8 | #define NCCL_CHANNEL_H_ 9 | #include "comm.h" 10 | 11 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 12 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); 13 | static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { 14 | int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; 15 | int peerNode = comm->rankToNode[peer]; 16 | int peerIndex = comm->rankToLocalRank[peer]; 17 | int nsteps = comm->maxLocalRanks; 18 | int rankIndex = comm->rankToLocalRank[comm->rank]; 19 | int step, delta; 20 | if (coll == ncclFuncSend) { 21 | step = (nsteps + peerIndex - rankIndex)%nsteps; 22 | delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; 23 | } else if (coll == ncclFuncRecv) { 24 | step = (nsteps + rankIndex - peerIndex)%nsteps; 25 | delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; 26 | } else { 27 | return ncclInternalError; 28 | } 29 | *channelBase = comm->nNodes > 1 ? 
delta+(step/p2pGroupSize) : step; 30 | return ncclSuccess; 31 | } 32 | 33 | static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { 34 | *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; 35 | return ncclSuccess; 36 | } 37 | 38 | static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { 39 | int base; 40 | NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); 41 | NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); 42 | return ncclSuccess; 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/include/checks.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHECKS_H_ 8 | #define NCCL_CHECKS_H_ 9 | 10 | #include "debug.h" 11 | 12 | // Check CUDA calls 13 | #define CUDACHECK(cmd) do { \ 14 | cudaError_t err = cmd; \ 15 | if( err != cudaSuccess ) { \ 16 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ 17 | return ncclUnhandledCudaError; \ 18 | } \ 19 | } while(false) 20 | 21 | #define CUDACHECKGOTO(cmd, res, label) do { \ 22 | cudaError_t err = cmd; \ 23 | if( err != cudaSuccess ) { \ 24 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ 25 | res = ncclUnhandledCudaError; \ 26 | goto label; \ 27 | } \ 28 | } while(false) 29 | 30 | // Report failure but clear error and continue 31 | #define CUDACHECKIGNORE(cmd) do { \ 32 | cudaError_t err = cmd; \ 33 | if( err != cudaSuccess ) { \ 34 | INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ 35 | (void) cudaGetLastError(); \ 36 
| } \ 37 | } while(false) 38 | 39 | #include 40 | // Check system calls 41 | #define SYSCHECK(call, name) do { \ 42 | int retval; \ 43 | SYSCHECKVAL(call, name, retval); \ 44 | } while (false) 45 | 46 | #define SYSCHECKVAL(call, name, retval) do { \ 47 | SYSCHECKSYNC(call, name, retval); \ 48 | if (retval == -1) { \ 49 | WARN("Call to " name " failed : %s", strerror(errno)); \ 50 | return ncclSystemError; \ 51 | } \ 52 | } while (false) 53 | 54 | #define SYSCHECKSYNC(call, name, retval) do { \ 55 | retval = call; \ 56 | if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ 57 | INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ 58 | } else { \ 59 | break; \ 60 | } \ 61 | } while(true) 62 | 63 | #define SYSCHECKGOTO(statement, res, label) do { \ 64 | if ((statement) == -1) { \ 65 | /* Print the back trace*/ \ 66 | res = ncclSystemError; \ 67 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 68 | goto label; \ 69 | } \ 70 | } while (0); 71 | 72 | #define NEQCHECK(statement, value) do { \ 73 | if ((statement) != value) { \ 74 | /* Print the back trace*/ \ 75 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ 76 | return ncclSystemError; \ 77 | } \ 78 | } while (0); 79 | 80 | #define NEQCHECKGOTO(statement, value, res, label) do { \ 81 | if ((statement) != value) { \ 82 | /* Print the back trace*/ \ 83 | res = ncclSystemError; \ 84 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 85 | goto label; \ 86 | } \ 87 | } while (0); 88 | 89 | #define EQCHECK(statement, value) do { \ 90 | if ((statement) == value) { \ 91 | /* Print the back trace*/ \ 92 | INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ 93 | return ncclSystemError; \ 94 | } \ 95 | } while (0); 96 | 97 | #define EQCHECKGOTO(statement, value, res, label) do { \ 98 | if ((statement) == value) { \ 99 | /* Print the back trace*/ \ 100 | res = ncclSystemError; \ 101 | INFO(NCCL_ALL,"%s:%d -> %d", 
__FILE__, __LINE__, res); \ 102 | goto label; \ 103 | } \ 104 | } while (0); 105 | 106 | // Propagate errors up 107 | #define NCCLCHECK(call) do { \ 108 | ncclResult_t res = call; \ 109 | if (res != ncclSuccess) { \ 110 | /* Print the back trace*/ \ 111 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 112 | return res; \ 113 | } \ 114 | } while (0); 115 | 116 | #define NCCLCHECKGOTO(call, res, label) do { \ 117 | res = call; \ 118 | if (res != ncclSuccess) { \ 119 | /* Print the back trace*/ \ 120 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 121 | goto label; \ 122 | } \ 123 | } while (0); 124 | 125 | #define NCCLWAIT(call, cond, abortFlagPtr) do { \ 126 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ 127 | ncclResult_t res = call; \ 128 | if (res != ncclSuccess) { \ 129 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 130 | return ncclInternalError; \ 131 | } \ 132 | if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ 133 | } while (!(cond)); 134 | 135 | #define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ 136 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ 137 | res = call; \ 138 | if (res != ncclSuccess) { \ 139 | if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ 140 | goto label; \ 141 | } \ 142 | if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ 143 | } while (!(cond)); 144 | 145 | #define NCCLCHECKTHREAD(a) do { \ 146 | if ((args->ret = (a)) != ncclSuccess) { \ 147 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ 148 | return args; \ 149 | } \ 150 | } while(0) 151 | 152 | #define CUDACHECKTHREAD(a) do { \ 153 | if ((a) != cudaSuccess) { \ 154 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ 155 | args->ret = ncclUnhandledCudaError; \ 156 | return args; \ 157 | } \ 158 | } while(0) 159 | 160 | #endif 161 | 
-------------------------------------------------------------------------------- /src/include/coll_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COLL_NET_H_ 8 | #define COLL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | 13 | extern ncclCollNet_t* ncclCollNet; 14 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 15 | 16 | // Translation to external API 17 | static const char* collNetName() { return ncclCollNet->name; } 18 | static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; } 19 | static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; } 20 | static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } 21 | static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } 22 | static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } 23 | static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } 24 | static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; } 25 | static ncclResult_t 
collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { 26 | NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } 27 | static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } 28 | static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; } 29 | static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } 30 | static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } 31 | 32 | static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; } 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/include/collectives.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_COLLECTIVES_H_ 8 | #define NCCL_COLLECTIVES_H_ 9 | #include 10 | 11 | enum ncclDevRedOp_t { 12 | ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin, 13 | ncclDevPreMulSum, ncclDevSumPostDiv, 14 | ncclNumDevRedOps 15 | }; 16 | struct ncclDevRedOpFull { 17 | ncclDevRedOp_t op; 18 | bool scalarArgIsPtr; 19 | uint64_t scalarArg; 20 | }; 21 | 22 | #define FUNC_INDEX_P2P 0 23 | #define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) 24 | 25 | #define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \ 26 | ncclFunction_##func##_##algo##_##proto##_##devredop##_##type 27 | 28 | #define NCCL_ONERANK_REDUCE_NAME(devredop, type) \ 29 | ncclFunction_OneRankReduce_##devredop##_##type 30 | 31 | #define NCCL_KERN_NAME(func, algo, proto, devredop, type) \ 32 | ncclKernel_##func##_##algo##_##proto##_##devredop##_##type 33 | 34 | #define NCCL_IMPL_NAME(func, algo, proto) \ 35 | nccl##func##algo##proto 36 | 37 | /* Declare all collective operations */ 38 | #define DECL5(func, algo, proto, devredop, type) \ 39 | extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ 40 | extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \ 41 | 42 | #define CONCAT(a,b) a##b 43 | #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) 44 | #define MACRO_IF_0(t, f) f 45 | #define MACRO_IF_1(t, f) t 46 | 47 | #define DECL4(func, algo, devredop, type, undef) \ 48 | MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \ 49 | MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \ 50 | MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type)) 51 | 52 | #define DECL3(func, devredop, 
type, undef) \ 53 | DECL4(func, RING, devredop, type, undef) \ 54 | DECL4(func, TREE, devredop, type, undef) \ 55 | DECL4(func, MSCCL, devredop, type, undef) \ 56 | DECL4(func, COLLNET, devredop, type, undef) 57 | 58 | #if defined(__CUDA_BF16_TYPES_EXIST__) 59 | #define DECL2(func, devredop, undefForFloat) \ 60 | DECL3(func, devredop, int8_t, /*undef=*/0) \ 61 | DECL3(func, devredop, uint8_t, /*undef=*/0) \ 62 | DECL3(func, devredop, int32_t, /*undef=*/0) \ 63 | DECL3(func, devredop, uint32_t, /*undef=*/0) \ 64 | DECL3(func, devredop, int64_t, /*undef=*/0) \ 65 | DECL3(func, devredop, uint64_t, /*undef=*/0) \ 66 | DECL3(func, devredop, half, /*undef=*/undefForFloat) \ 67 | DECL3(func, devredop, float, /*undef=*/undefForFloat) \ 68 | DECL3(func, devredop, double, /*undef=*/undefForFloat) \ 69 | DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat) 70 | #else 71 | #define DECL2(func, devredop, undefForFloat) \ 72 | DECL3(func, devredop, int8_t, /*undef=*/0) \ 73 | DECL3(func, devredop, uint8_t, /*undef=*/0) \ 74 | DECL3(func, devredop, int32_t, /*undef=*/0) \ 75 | DECL3(func, devredop, uint32_t, /*undef=*/0) \ 76 | DECL3(func, devredop, int64_t, /*undef=*/0) \ 77 | DECL3(func, devredop, uint64_t, /*undef=*/0) \ 78 | DECL3(func, devredop, half, /*undef=*/undefForFloat) \ 79 | DECL3(func, devredop, float, /*undef=*/undefForFloat) \ 80 | DECL3(func, devredop, double, /*undef=*/undefForFloat) 81 | #endif 82 | 83 | #define DECL(func) \ 84 | DECL2(func, Sum, /*undefForFloat=*/0) \ 85 | DECL2(func, Prod, /*undefForFloat=*/0) \ 86 | DECL2(func, Min, /*undefForFloat=*/0) \ 87 | DECL2(func, Max, /*undefForFloat=*/0) \ 88 | DECL2(func, PreMulSum, /*undefForFloat=*/0) \ 89 | DECL2(func, SumPostDiv, /*undefForFloat=*/1) 90 | 91 | DECL2(Broadcast, Sum, /*undefForFloat=*/0) 92 | DECL(Reduce) 93 | DECL2(AllGather, Sum, /*undefForFloat=*/0) 94 | DECL(ReduceScatter) 95 | DECL(AllReduce) 96 | DECL2(AllToAll, Sum, /*undefForFloat=*/0) 97 | DECL(CustomCollective) 98 | 
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) 99 | 100 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(); 101 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(); 102 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(); 103 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(); 104 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(); 105 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(); 106 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(); 107 | #if defined(__CUDA_BF16_TYPES_EXIST__) 108 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)(); 109 | #endif 110 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(); 111 | extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); 112 | 113 | // CHUNKSIZE must be a multiple of SLICESIZE 114 | #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) 115 | #define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) 116 | #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) 117 | #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) 118 | #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) 119 | #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) 120 | #define BROADCAST_SLICESTEPS 1 121 | #define BROADCAST_CHUNKSTEPS 1 122 | #define REDUCE_SLICESTEPS 1 123 | #define REDUCE_CHUNKSTEPS 1 124 | #define SENDRECV_SLICEFACTOR 4 125 | #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CORE_H_ 8 | #define NCCL_CORE_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // For std::min/std::max 15 | #include "nccl.h" 16 | 17 | #ifdef PROFAPI 18 | #define NCCL_API(ret, func, args...) \ 19 | __attribute__ ((visibility("default"))) \ 20 | __attribute__ ((alias(#func))) \ 21 | ret p##func (args); \ 22 | extern "C" \ 23 | __attribute__ ((visibility("default"))) \ 24 | __attribute__ ((weak)) \ 25 | ret func(args) 26 | #else 27 | #define NCCL_API(ret, func, args...) \ 28 | extern "C" \ 29 | __attribute__ ((visibility("default"))) \ 30 | ret func(args) 31 | #endif // end PROFAPI 32 | 33 | static __inline__ int ncclTypeSize(ncclDataType_t type) { 34 | switch (type) { 35 | case ncclInt8: 36 | case ncclUint8: 37 | return 1; 38 | case ncclFloat16: 39 | #if defined(__CUDA_BF16_TYPES_EXIST__) 40 | case ncclBfloat16: 41 | #endif 42 | return 2; 43 | case ncclInt32: 44 | case ncclUint32: 45 | case ncclFloat32: 46 | return 4; 47 | case ncclInt64: 48 | case ncclUint64: 49 | case ncclFloat64: 50 | return 8; 51 | default: 52 | return -1; 53 | } 54 | } 55 | 56 | #include "debug.h" 57 | #include "checks.h" 58 | #include "alloc.h" 59 | #include "utils.h" 60 | #include "param.h" 61 | #include "nvtx.h" 62 | 63 | #endif // end include guard 64 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t 11 | 12 | static int hexToInt(char c) { 13 | int v = c - '0'; 14 | if (v < 0) return -1; 15 | if (v > 9) v = 10 + c - 'a'; 16 | if ((v < 0) || (v > 15)) return -1; 17 | return v; 18 | } 19 | 20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) 21 | 22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { 23 | uint32_t cpumasks[CPU_SET_N_U32]; 24 | int m = CPU_SET_N_U32-1; 25 | cpumasks[m] = 0; 26 | for (int o=0; o=0; o--) { 49 | if (c == 0 && m8[o] == 0) continue; 50 | sprintf(str+c, "%02x", m8[o]); 51 | c+=2; 52 | if (o && o%4 == 0) { 53 | sprintf(str+c, ","); 54 | c++; 55 | } 56 | } 57 | str[c] = '\0'; 58 | return ncclSuccess; 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | 10 | #include "nccl_net.h" 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | // Conform to pthread and NVTX standard 20 | #define NCCL_THREAD_NAMELEN 16 21 | 22 | extern int ncclDebugLevel; 23 | extern uint64_t ncclDebugMask; 24 | extern pthread_mutex_t ncclDebugOutputLock; 25 | extern FILE *ncclDebugFile; 26 | extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); 27 | 28 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 29 | 30 | // Let code temporarily downgrade WARN into INFO 31 | extern thread_local int ncclDebugNoWarn; 32 | 33 | #define WARN(...) 
ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 34 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 35 | 36 | #ifdef ENABLE_TRACE 37 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 38 | extern std::chrono::high_resolution_clock::time_point ncclEpoch; 39 | #else 40 | #define TRACE(...) 41 | #endif 42 | 43 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | 14 | #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) 15 | #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ 16 | 17 | size_t ncclKernMaxLocalSize(); 18 | ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut); 19 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 20 | ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); 21 | ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); 22 | ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm); 23 | ncclResult_t ncclLaunchBarrier(struct ncclComm* comm); 24 | ncclResult_t ncclLaunchKernel(ncclComm_t comm); 25 | ncclResult_t ncclRecordEvents(struct ncclComm* comm); 26 | ncclResult_t ncclLaunchReset(ncclComm_t comm); 27 | ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info); 28 | ncclResult_t ncclSetupAsyncKernels(struct 
ncclComm* comm); 29 | template 30 | void CUDART_CB ncclEnqueueHostSetup(void* arg); 31 | ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); 32 | ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); 33 | 34 | struct ncclBuffRegInfo { 35 | void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; 36 | void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; 37 | void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; 38 | void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; 39 | int nBuffs; 40 | }; 41 | 42 | // Enqueue information (for kernel and proxy) for each operation 43 | struct ncclQueueElem { 44 | struct ncclWork work; 45 | struct ncclProxyOp proxyOp; 46 | struct ncclBuffRegInfo buffRegInfo; 47 | }; 48 | 49 | typedef ncclRecyclableList ncclQueueElemList; 50 | 51 | // Structure passed to CUDA graph 52 | struct ncclQueueInfo { 53 | ncclComm_t comm; 54 | int maxChannels; // Dynamic version of gridDim 55 | ncclResult_t ret; // Return value of host setup call 56 | int nRegBuffs; 57 | ncclQueueElemList* elemList; 58 | }; 59 | 60 | static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) { 61 | NCCLCHECK(ncclCalloc(eqInfo, 1)); 62 | (*eqInfo)->comm = comm; 63 | (*eqInfo)->elemList = new ncclQueueElemList(); 64 | (*eqInfo)->comm->nQueueInfoCreated++; 65 | return ncclSuccess; 66 | } 67 | 68 | // Reset element queue 69 | static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { 70 | if (eqInfo == NULL) return ncclInternalError; 71 | eqInfo->maxChannels = 0; 72 | eqInfo->ret = ncclSuccess; 73 | eqInfo->nRegBuffs = 0; 74 | eqInfo->elemList->recycle(); 75 | return ncclSuccess; 76 | } 77 | 78 | // Destroy enqueue info space 79 | // used by both CUDA graph and non CUDA graph 80 | static void ncclDestroyQueueInfo(void* ptr) { 81 | if (ptr == NULL) return; 82 | struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr; 83 | struct ncclComm* comm = eqInfo->comm; 84 | // Close IPC mem handles for registered buffers 85 | struct ncclQueueElem* eqElem = 
eqInfo->elemList->begin(); 86 | #if 0 87 | // Ideally, the deregistration should happen here 88 | // but currently the destroy function of CUDA objects does not allow CUDA API calls 89 | while (eqElem != NULL) { 90 | for (int i=0; ibuffRegInfo.nBuffs; i++) { 91 | if (i == eqInfo->comm->localRank) continue; 92 | CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); 93 | CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); 94 | } 95 | eqElem = eqInfo->elemList->getNext(); 96 | } 97 | #else 98 | // Instead, we push these pointers to a pool owned by ncclComm 99 | // and asks a helper thread to close mem handles 100 | struct ncclGraphHelperResources* res = comm->graphHelperResources; 101 | int ipcTailOld = 0; 102 | if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip; 103 | 104 | pthread_mutex_lock(&res->threadLock); 105 | ipcTailOld = res->ipcTail; 106 | while (eqElem != NULL) { 107 | for (int i=0; ibuffRegInfo.nBuffs; i++) { 108 | if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) { 109 | res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i]; 110 | res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; 111 | } 112 | if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) { 113 | res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i]; 114 | res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; 115 | } 116 | } 117 | eqElem = eqInfo->elemList->getNext(); 118 | } 119 | if (res->ipcTail != ipcTailOld) { 120 | res->threadState = ThreadStart; 121 | TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld); 122 | pthread_cond_signal(&res->threadCond); 123 | } 124 | pthread_mutex_unlock(&res->threadLock); 125 | #endif 126 | 127 | skip: 128 | delete eqInfo->elemList; 129 | free(eqInfo); 130 | comm->nQueueInfoDestroyed++; 131 | return; 132 | } 133 | #endif // End include guard 134 | 
-------------------------------------------------------------------------------- /src/include/graph.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_GRAPH_H_ 8 | #define NCCL_GRAPH_H_ 9 | 10 | #include "nccl.h" 11 | #include "devcomm.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); 19 | 20 | struct ncclTopoSystem; 21 | // Build the topology 22 | ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); 23 | ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); 24 | ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); 25 | 26 | ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); 27 | void ncclTopoFree(struct ncclTopoSystem* system); 28 | ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); 29 | ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); 30 | ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); 31 | 32 | // Query topology 33 | ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); 34 | ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); 35 | ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); 36 | int ncclPxnDisable(); 37 | ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); 38 | ncclResult_t 
ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); 39 | 40 | // Find CPU affinity 41 | ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); 42 | 43 | #define NCCL_TOPO_CPU_ARCH_X86 1 44 | #define NCCL_TOPO_CPU_ARCH_POWER 2 45 | #define NCCL_TOPO_CPU_ARCH_ARM 3 46 | #define NCCL_TOPO_CPU_VENDOR_INTEL 1 47 | #define NCCL_TOPO_CPU_VENDOR_AMD 2 48 | #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 49 | #define NCCL_TOPO_CPU_TYPE_BDW 1 50 | #define NCCL_TOPO_CPU_TYPE_SKL 2 51 | #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 52 | ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); 53 | ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); 54 | ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); 55 | 56 | #define NCCL_TOPO_MAX_NODES 256 57 | 58 | // Init search. Needs to be done before calling ncclTopoCompute 59 | ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); 60 | 61 | #define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) 62 | #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) 63 | #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU 64 | #define NCCL_TOPO_PATTERN_RING 4 // Ring 65 | struct ncclTopoGraph { 66 | // Input / output 67 | int id; // ring : 0, tree : 1, collnet : 2 68 | int pattern; 69 | int crossNic; 70 | int collNet; 71 | int minChannels; 72 | int maxChannels; 73 | // Output 74 | int nChannels; 75 | float speedIntra; 76 | float speedInter; 77 | float latencyInter; 78 | int typeIntra; 79 | int typeInter; 80 | int sameChannels; 81 | int nHops; 82 | int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; 83 | int inter[MAXCHANNELS*2]; 84 | }; 85 | ncclResult_t ncclTopoCompute(struct ncclTopoSystem* 
system, struct ncclTopoGraph* graph); 86 | 87 | ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); 88 | ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); 89 | 90 | struct ncclTopoRanks { 91 | int ringRecv[MAXCHANNELS]; 92 | int ringSend[MAXCHANNELS]; 93 | int ringPrev[MAXCHANNELS]; 94 | int ringNext[MAXCHANNELS]; 95 | int treeToParent[MAXCHANNELS]; 96 | int treeToChild0[MAXCHANNELS]; 97 | int treeToChild1[MAXCHANNELS]; 98 | }; 99 | 100 | ncclResult_t ncclTopoPreset(struct ncclComm* comm, 101 | struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, 102 | struct ncclTopoRanks* topoRanks); 103 | 104 | ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, 105 | struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph); 106 | 107 | ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); 108 | #include "info.h" 109 | ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time); 110 | ncclResult_t ncclTopoGetMSCCLAlgo(struct ncclInfo* info); 111 | 112 | int ncclMaxNchannels(); 113 | 114 | #endif 115 | -------------------------------------------------------------------------------- /src/include/group.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_GROUP_H_ 8 | #define NCCL_GROUP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | bool ncclAsyncMode(); 14 | ncclResult_t ncclAsyncErrCheck(ncclResult_t ret); 15 | 16 | typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); 17 | 18 | ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); 19 | 20 | typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count, 21 | ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); 22 | 23 | ncclResult_t ncclAsyncColl(ncclComm_t comm); 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INFO_H_ 8 | #define NCCL_INFO_H_ 9 | 10 | #include "nccl.h" 11 | #include "devcomm.h" 12 | #include "collectives.h" 13 | 14 | typedef enum : uint8_t { 15 | ncclPatternRing, 16 | ncclPatternRingTwice, 17 | ncclPatternPipelineFrom, 18 | ncclPatternPipelineTo, 19 | ncclPatternTreeUp, 20 | ncclPatternTreeDown, 21 | ncclPatternTreeUpDown, 22 | ncclPatternMSCCL, 23 | ncclPatternCollTreeUpDown, 24 | ncclPatternSend, 25 | ncclPatternRecv 26 | } ncclPattern_t; 27 | 28 | // Used to pass NCCL call information between functions 29 | struct ncclInfo { 30 | ncclFunc_t coll; 31 | const char* opName; 32 | // NCCL Coll Args 33 | const void* sendbuff; 34 | void* recvbuff; 35 | size_t count; 36 | ncclDataType_t datatype; 37 | ncclRedOp_t op; 38 | int root; // peer for p2p operations 39 | ncclComm_t comm; 40 | cudaStream_t stream; 41 | // Algorithm details 42 | int chunkSteps; 43 | int sliceSteps; 44 | // Computed later 45 | ncclDevRedOpFull opFull; 46 | int algorithm; 47 | int protocol; 48 | ncclPattern_t pattern; 49 | int nChannels; 50 | int nThreads; 51 | size_t nBytes; 52 | int nstepsPerLoop; 53 | int nchunksPerLoop; 54 | int chunkSize; 55 | int channelId; 56 | 57 | struct mscclWorkInfo mscclInfo; 58 | }; 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "checks.h" 13 | 14 | extern ncclNet_t* ncclNet; 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetInit(); 18 | int ncclNetVersion(); 19 | 20 | // Translation to external API 21 | static const char* ncclNetName() { return ncclNet->name; } 22 | static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } 23 | static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; } 24 | static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } 25 | static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } 26 | static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } 27 | static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } 28 | static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } 29 | static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } 30 | static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, 
request)); return ncclSuccess; } 31 | static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } 32 | static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } 33 | static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } 34 | static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } 35 | static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } 36 | 37 | // Test whether the current GPU support GPU Direct RDMA. 38 | ncclResult_t ncclGpuGdrSupport(int* gdrSupport); 39 | 40 | extern ncclNet_t ncclNetIb; 41 | extern ncclNet_t ncclNetSocket; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_event.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_EVENT_H_ 2 | #define NPKIT_EVENT_H_ 3 | 4 | #define NPKIT_EVENT_INVALID 0x0 5 | 6 | #define NPKIT_EVENT_SEND_ENTRY 0x1 7 | #define NPKIT_EVENT_SEND_EXIT 0x2 8 | #define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY 0x3 9 | #define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT 0x4 10 | #define NPKIT_EVENT_DIRECT_SEND_ENTRY 0x5 11 | #define NPKIT_EVENT_DIRECT_SEND_EXIT 0x6 12 | #define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY 0x7 13 | #define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT 0x8 14 | 15 | #define NPKIT_EVENT_RECV_ENTRY 0x9 16 | #define NPKIT_EVENT_RECV_EXIT 0xA 17 | #define NPKIT_EVENT_DIRECT_RECV_ENTRY 0xB 18 | #define NPKIT_EVENT_DIRECT_RECV_EXIT 0xC 19 | 20 | #define NPKIT_EVENT_REDUCE_ENTRY 0xD 21 | #define NPKIT_EVENT_REDUCE_EXIT 0xE 22 | 23 | #define NPKIT_EVENT_LOCAL_COPY_ENTRY 0xF 24 | 
#define NPKIT_EVENT_LOCAL_COPY_EXIT 0x10 25 | 26 | #define NPKIT_EVENT_COPY_SEND_ENTRY 0x11 27 | #define NPKIT_EVENT_COPY_SEND_EXIT 0x12 28 | #define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY 0x13 29 | #define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT 0x14 30 | 31 | #define NPKIT_EVENT_RECV_COPY_SEND_ENTRY 0x15 32 | #define NPKIT_EVENT_RECV_COPY_SEND_EXIT 0x16 33 | #define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY 0x17 34 | #define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT 0x18 35 | #define NPKIT_EVENT_RECV_COPY_DIRECT_SEND_ENTRY 0x19 36 | #define NPKIT_EVENT_RECV_COPY_DIRECT_SEND_EXIT 0x1A 37 | 38 | #define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 0x1B 39 | #define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT 0x1C 40 | 41 | #define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY 0x1D 42 | #define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT 0x1E 43 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_SEND_ENTRY 0x1F 44 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_SEND_EXIT 0x20 45 | 46 | #define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY 0x21 47 | #define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT 0x22 48 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x23 49 | #define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x24 50 | 51 | #define NPKIT_EVENT_NET_SEND_ENTRY 0x25 52 | #define NPKIT_EVENT_NET_SEND_EXIT 0x26 53 | #define NPKIT_EVENT_NET_RECV_ENTRY 0x27 54 | #define NPKIT_EVENT_NET_RECV_EXIT 0x28 55 | 56 | #define NPKIT_EVENT_DEP_CHECK_ENTRY 0x29 57 | #define NPKIT_EVENT_DEP_CHECK_EXIT 0x2A 58 | 59 | #define NPKIT_EVENT_TIME_SYNC_GPU 0x2B 60 | #define NPKIT_EVENT_TIME_SYNC_CPU 0x2C 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_struct.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_STRUCT_H_ 2 | #define NPKIT_STRUCT_H_ 3 | 4 | #include 5 | 6 | #pragma pack(push, 1) 7 | 8 | union NpKitEvent { 9 | uint64_t bits[2]; 10 | struct { 11 | uint64_t type : 8; 12 | uint64_t size : 32; 13 | uint64_t rsvd : 24; 14 
| uint64_t timestamp; 15 | } fields; 16 | }; 17 | 18 | struct NpKitEventCollectContext { 19 | NpKitEvent* event_buffer; 20 | uint64_t event_buffer_head; 21 | }; 22 | 23 | #pragma pack(pop) 24 | 25 | #if defined(ENABLE_NPKIT) 26 | 27 | #define NPKIT_GPU_COMM_DECL_FIELDS \ 28 | NpKitEventCollectContext* npKitEventCollectContexts; \ 29 | uint64_t* npKitCpuTimestamp; 30 | 31 | #else 32 | 33 | #define NPKIT_GPU_COMM_DECL_FIELDS 34 | 35 | #endif 36 | 37 | #if defined(ENABLE_NPKIT) 38 | 39 | #define NPKIT_CPU_PROXY_DECL_FIELDS \ 40 | int npKitSizesFifo[NCCL_STEPS]; 41 | 42 | #else 43 | 44 | #define NPKIT_CPU_PROXY_DECL_FIELDS 45 | 46 | #endif 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/include/nvtx.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NVTX_H_ 8 | #define NCCL_NVTX_H_ 9 | 10 | #include "nvtx3.hpp" 11 | 12 | struct nccl_domain{static constexpr char const* name{"NCCL"};}; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCuda.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | 13 | #ifndef NVTOOLSEXT_CUDA_V3 14 | #define NVTOOLSEXT_CUDA_V3 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif /* __cplusplus */ 19 | 20 | /* ========================================================================= */ 21 | /** \name Functions for CUDA Resource Naming 22 | */ 23 | /** \addtogroup RESOURCE_NAMING 24 | * \section RESOURCE_NAMING_CUDA CUDA Resource Naming 25 | * 26 | * This section covers the API functions that allow to annotate CUDA resources 27 | * with user-provided names. 28 | * 29 | * @{ 30 | */ 31 | 32 | /* ------------------------------------------------------------------------- */ 33 | /* \cond SHOW_HIDDEN 34 | * \brief Used to build a non-colliding value for resource types separated class 35 | * \version \NVTX_VERSION_2 36 | */ 37 | #define NVTX_RESOURCE_CLASS_CUDA 4 38 | /** \endcond */ 39 | 40 | /* ------------------------------------------------------------------------- */ 41 | /** \brief Resource types for CUDA 42 | */ 43 | typedef enum nvtxResourceCUDAType_t 44 | { 45 | NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */ 46 | NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */ 47 | NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */ 48 | NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */ 49 | } nvtxResourceCUDAType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The handle of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 
59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA context. 68 | * 69 | * Allows the user to associate a CUDA context with a user-provided name. 70 | * 71 | * \param context - The handle of the CUDA context to name. 72 | * \param name - The name of the CUDA context. 73 | * 74 | * \par Example: 75 | * \code 76 | * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice ); 77 | * if ( CUDA_SUCCESS != status ) 78 | * goto Error; 79 | * nvtxNameCuContext(cuContext, "CTX_NAME"); 80 | * \endcode 81 | * 82 | * \version \NVTX_VERSION_1 83 | * @{ */ 84 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name); 85 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name); 86 | /** @} */ 87 | 88 | /* ------------------------------------------------------------------------- */ 89 | /** \brief Annotates a CUDA stream. 90 | * 91 | * Allows the user to associate a CUDA stream with a user-provided name. 92 | * 93 | * \param stream - The handle of the CUDA stream to name. 94 | * \param name - The name of the CUDA stream. 95 | * 96 | * \version \NVTX_VERSION_1 97 | * @{ */ 98 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name); 99 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name); 100 | /** @} */ 101 | 102 | /* ------------------------------------------------------------------------- */ 103 | /** \brief Annotates a CUDA event. 104 | * 105 | * Allows the user to associate a CUDA event with a user-provided name. 106 | * 107 | * \param event - The handle of the CUDA event to name. 108 | * \param name - The name of the CUDA event. 
109 | * 110 | * \version \NVTX_VERSION_1 111 | * @{ */ 112 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name); 113 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name); 114 | /** @} */ 115 | 116 | /** @} */ /* END RESOURCE_NAMING */ 117 | 118 | /* ========================================================================= */ 119 | #ifdef UNICODE 120 | #define nvtxNameCuDevice nvtxNameCuDeviceW 121 | #define nvtxNameCuContext nvtxNameCuContextW 122 | #define nvtxNameCuStream nvtxNameCuStreamW 123 | #define nvtxNameCuEvent nvtxNameCuEventW 124 | #else 125 | #define nvtxNameCuDevice nvtxNameCuDeviceA 126 | #define nvtxNameCuContext nvtxNameCuContextA 127 | #define nvtxNameCuStream nvtxNameCuStreamA 128 | #define nvtxNameCuEvent nvtxNameCuEventA 129 | #endif 130 | 131 | #ifdef __cplusplus 132 | } 133 | #endif /* __cplusplus */ 134 | 135 | #ifndef NVTX_NO_IMPL 136 | #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */ 137 | #include "nvtxDetail/nvtxImplCuda_v3.h" 138 | #undef NVTX_IMPL_GUARD_CUDA 139 | #endif /*NVTX_NO_IMPL*/ 140 | 141 | #endif /* NVTOOLSEXT_CUDA_V3 */ 142 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCudaRt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | #include "driver_types.h" 13 | 14 | #ifndef NVTOOLSEXT_CUDART_V3 15 | #define NVTOOLSEXT_CUDART_V3 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | /* ========================================================================= */ 22 | /** \name Functions for CUDA Resource Naming 23 | */ 24 | /** \addtogroup RESOURCE_NAMING 25 | * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming 26 | * 27 | * This section covers the API functions that allow to annotate CUDA resources 28 | * with user-provided names. 29 | * 30 | * @{ 31 | */ 32 | 33 | /* ------------------------------------------------------------------------- */ 34 | /* \cond SHOW_HIDDEN 35 | * \brief Used to build a non-colliding value for resource types separated class 36 | * \version \NVTX_VERSION_2 37 | */ 38 | #define NVTX_RESOURCE_CLASS_CUDART 5 39 | /** \endcond */ 40 | 41 | /* ------------------------------------------------------------------------- */ 42 | /** \brief Resource types for CUDART 43 | */ 44 | typedef enum nvtxResourceCUDARTType_t 45 | { 46 | NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ 47 | NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ 48 | NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ 49 | } nvtxResourceCUDARTType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The id of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 
59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA stream. 68 | * 69 | * Allows the user to associate a CUDA stream with a user-provided name. 70 | * 71 | * \param stream - The handle of the CUDA stream to name. 72 | * \param name - The name of the CUDA stream. 73 | * 74 | * \version \NVTX_VERSION_1 75 | * @{ */ 76 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); 77 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); 78 | /** @} */ 79 | 80 | /* ------------------------------------------------------------------------- */ 81 | /** \brief Annotates a CUDA event. 82 | * 83 | * Allows the user to associate a CUDA event with a user-provided name. 84 | * 85 | * \param event - The handle of the CUDA event to name. 86 | * \param name - The name of the CUDA event. 
87 | * 88 | * \version \NVTX_VERSION_1 89 | * @{ */ 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); 91 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); 92 | /** @} */ 93 | 94 | /** @} */ /* END RESOURCE_NAMING */ 95 | 96 | /* ========================================================================= */ 97 | #ifdef UNICODE 98 | #define nvtxNameCudaDevice nvtxNameCudaDeviceW 99 | #define nvtxNameCudaStream nvtxNameCudaStreamW 100 | #define nvtxNameCudaEvent nvtxNameCudaEventW 101 | #else 102 | #define nvtxNameCudaDevice nvtxNameCudaDeviceA 103 | #define nvtxNameCudaStream nvtxNameCudaStreamA 104 | #define nvtxNameCudaEvent nvtxNameCudaEventA 105 | #endif 106 | 107 | #ifdef __cplusplus 108 | } 109 | #endif /* __cplusplus */ 110 | 111 | #ifndef NVTX_NO_IMPL 112 | #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ 113 | #include "nvtxDetail/nvtxImplCudaRt_v3.h" 114 | #undef NVTX_IMPL_GUARD_CUDART 115 | #endif /*NVTX_NO_IMPL*/ 116 | 117 | #endif /* NVTOOLSEXT_CUDART_V3 */ 118 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDART 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); 18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); 19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); 20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); 21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); 22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); 23 | 24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) 25 | { 26 | #ifndef NVTX_DISABLE 27 | nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; 28 | if(local!=0) 29 | (*local)(device, name); 30 | #endif /*NVTX_DISABLE*/ 31 | } 32 | 33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) 34 | { 35 | #ifndef NVTX_DISABLE 36 | nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; 37 | if(local!=0) 38 | (*local)(device, name); 39 | #endif /*NVTX_DISABLE*/ 40 | } 41 | 42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) 43 | { 44 | #ifndef NVTX_DISABLE 45 | nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; 46 | if(local!=0) 47 | (*local)(stream, name); 48 | #endif /*NVTX_DISABLE*/ 49 | } 50 | 51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) 52 | { 53 | #ifndef NVTX_DISABLE 54 | nvtxNameCudaStreamW_impl_fntype local = 
(nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; 55 | if(local!=0) 56 | (*local)(stream, name); 57 | #endif /*NVTX_DISABLE*/ 58 | } 59 | 60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) 61 | { 62 | #ifndef NVTX_DISABLE 63 | nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; 64 | if(local!=0) 65 | (*local)(event, name); 66 | #endif /*NVTX_DISABLE*/ 67 | } 68 | 69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) 70 | { 71 | #ifndef NVTX_DISABLE 72 | nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; 73 | if(local!=0) 74 | (*local)(event, name); 75 | #endif /*NVTX_DISABLE*/ 76 | } 77 | 78 | #ifdef __cplusplus 79 | } /* extern "C" */ 80 | #endif /* __cplusplus */ 81 | 82 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDA 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | 18 | typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name); 19 | typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name); 20 | typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name); 21 | typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name); 22 | typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name); 23 | typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name); 24 | typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name); 25 | typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name); 26 | 27 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name) 28 | { 29 | #ifndef NVTX_DISABLE 30 | nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; 31 | if(local!=0) 32 | (*local)(device, name); 33 | #endif /*NVTX_DISABLE*/ 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; 40 | if(local!=0) 41 | (*local)(device, name); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; 49 | if(local!=0) 50 | (*local)(context, name); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API 
nvtxNameCuContextW(CUcontext context, const wchar_t* name) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; 58 | if(local!=0) 59 | (*local)(context, name); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; 67 | if(local!=0) 68 | (*local)(stream, name); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; 76 | if(local!=0) 77 | (*local)(stream, name); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) 82 | { 83 | #ifndef NVTX_DISABLE 84 | nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; 85 | if(local!=0) 86 | (*local)(event, name); 87 | #endif /*NVTX_DISABLE*/ 88 | } 89 | 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) 91 | { 92 | #ifndef NVTX_DISABLE 93 | nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; 94 | if(local!=0) 95 | (*local)(event, name); 96 | #endif /*NVTX_DISABLE*/ 97 | } 98 | 99 | #ifdef __cplusplus 100 | } /* extern "C" */ 101 | #endif /* __cplusplus */ 102 | 103 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_SYNC 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | 18 | typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); 19 | typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); 20 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); 21 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); 22 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); 23 | typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); 24 | 25 | NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs) 26 | { 27 | #ifndef NVTX_DISABLE 28 | nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; 29 | if(local!=0) 30 | return (*local)(domain, attribs); 31 | else 32 | #endif /*NVTX_DISABLE*/ 33 | return (nvtxSyncUser_t)0; 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxDomainSyncUserDestroy_impl_fntype local = 
(nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; 40 | if(local!=0) 41 | (*local)(handle); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; 49 | if(local!=0) 50 | (*local)(handle); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; 58 | if(local!=0) 59 | (*local)(handle); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; 67 | if(local!=0) 68 | (*local)(handle); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; 76 | if(local!=0) 77 | (*local)(handle); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | #ifdef __cplusplus 82 | } /* extern "C" */ 83 | #endif /* __cplusplus */ 84 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef __NVTX_LINKONCE_H__ 10 | #define __NVTX_LINKONCE_H__ 11 | 12 | /* This header defines macros to permit making definitions of global variables 13 | * and functions in C/C++ header files which may be included multiple times in 14 | * a translation unit or linkage unit. It allows authoring header-only libraries 15 | * which can be used by multiple other header-only libraries (either as the same 16 | * copy or multiple copies), and does not require any build changes, such as 17 | * adding another .c file, linking a static library, or deploying a dynamic 18 | * library. Globals defined with these macros have the property that they have 19 | * the same address, pointing to a single instance, for the entire linkage unit. 20 | * It is expected but not guaranteed that each linkage unit will have a separate 21 | * instance. 22 | * 23 | * In some situations it is desirable to declare a variable without initializing 24 | * it, refer to it in code or other variables' initializers, and then initialize 25 | * it later. Similarly, functions can be prototyped, have their address taken, 26 | * and then have their body defined later. In such cases, use the FWDDECL macros 27 | * when forward-declaring LINKONCE global variables without initializers and 28 | * function prototypes, and then use the DEFINE macros when later defining them. 29 | * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, 30 | * following this pattern makes code maximally portable. 
31 | */ 32 | 33 | #if defined(__MINGW32__) /* MinGW */ 34 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 35 | #if defined(__cplusplus) 36 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 37 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK 38 | #else 39 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 40 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 41 | #endif 42 | #elif defined(_MSC_VER) /* MSVC */ 43 | #if defined(__cplusplus) 44 | #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) 45 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 46 | #else 47 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 48 | #define NVTX_LINKONCE_DEFINE_FUNCTION __inline 49 | #endif 50 | #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ 51 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 52 | #if defined(__cplusplus) 53 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 54 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK 55 | #else 56 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 57 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 58 | #endif 59 | #elif defined(__CYGWIN__) /* Assume GCC or compatible */ 60 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 61 | #if defined(__cplusplus) 62 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 63 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 64 | #else 65 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 66 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 67 | #endif 68 | #else /* All others: Assume GCC, clang, or compatible */ 69 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 70 | #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) 71 | #if defined(__cplusplus) 72 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 73 | #define 
NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline 74 | #else 75 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 76 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 77 | #endif 78 | #endif 79 | 80 | #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern 81 | #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION 82 | 83 | #endif /* __NVTX_LINKONCE_H__ */ 84 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | struct ncclP2Pinfo { 13 | void* buff; 14 | ssize_t nbytes; 15 | }; 16 | 17 | typedef ncclRecyclableList ncclP2Plist; 18 | 19 | static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) { 20 | if (p2p == NULL) p2p = new ncclP2Plist(); 21 | struct ncclP2Pinfo* next; 22 | NCCLCHECK(p2p->getNewElem(&next)); 23 | next->buff = buff; 24 | next->nbytes = nBytes; 25 | return ncclSuccess; 26 | } 27 | #endif 28 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PARAM_H_ 8 | #define NCCL_PARAM_H_ 9 | 10 | #include 11 | 12 | const char* userHomeDir(); 13 | void setEnvFile(const char* fileName); 14 | void initEnv(); 15 | 16 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); 17 | 18 | #define NCCL_PARAM(name, env, deftVal) \ 19 | int64_t ncclParam##name() { \ 20 | constexpr int64_t uninitialized = INT64_MIN; \ 21 | static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ 22 | static int64_t cache = uninitialized; \ 23 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ 24 | ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ 25 | } \ 26 | return cache; \ 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | #include "proxy.h" 11 | 12 | enum ncclProxyProfileState { 13 | ncclProxyProfileBegin = 0, 14 | 15 | ncclProxyProfileSendGPUWait = 1, 16 | ncclProxyProfileSendWait = 2, 17 | 18 | ncclProxyProfileRecvWait = 1, 19 | ncclProxyProfileRecvFlushWait = 2, 20 | ncclProxyProfileRecvGPUWait = 3, 21 | 22 | ncclProxyProfileEnd = 4, 23 | 24 | ncclProxyProfileSleep = 8, 25 | ncclProxyProfileWakeup = 9, 26 | 27 | ncclProxyProfileIdle = 16, 28 | ncclProxyProfileActive = 17, 29 | 30 | ncclProxyProfileAppend = 24, 31 | ncclProxyProfileAppendEnd = 25 32 | }; 33 | 34 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); 35 | void ncclProfilingDump(); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHM_H_ 8 | #define NCCL_SHM_H_ 9 | 10 | #include "nccl.h" 11 | 12 | ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create); 13 | ncclResult_t ncclShmUnlink(const char* shmname); 14 | ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize); 15 | #endif 16 | -------------------------------------------------------------------------------- /src/include/socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_H_ 8 | #define NCCL_SOCKET_H_ 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define MAX_IFS 16 19 | #define MAX_IF_NAME_SIZE 16 20 | #define SLEEP_INT 1000 // connection retry sleep interval in usec 21 | #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) 22 | #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) 23 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) 24 | 25 | /* Common socket address storage structure for IPv4/IPv6 */ 26 | union ncclSocketAddress { 27 | struct sockaddr sa; 28 | struct sockaddr_in sin; 29 | struct sockaddr_in6 sin6; 30 | }; 31 | 32 | enum ncclSocketState { 33 | ncclSocketConnecting = 0, 34 | ncclSocketConnected = 1, 35 | ncclSocketError = 2, 36 | ncclSocketStateNum = 3 37 | } ; 38 | 39 | struct ncclSocket { 40 | int fd; 41 | union ncclSocketAddress addr; 42 | volatile uint32_t* abortFlag; 43 | int asyncFlag; 44 | enum 
ncclSocketState state; 45 | }; 46 | 47 | const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); 48 | ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); 49 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); 50 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); 51 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call 52 | ncclResult_t ncclSocketListen(struct ncclSocket* sock); 53 | // Connect to sock->addr. sock->fd is set after a successful call. 54 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock); 55 | // Return socket connection state. 56 | ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state); 57 | // Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 58 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket); 59 | 60 | #define NCCL_SOCKET_SEND 0 61 | #define NCCL_SOCKET_RECV 1 62 | 63 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 64 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 65 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); 66 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); 67 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed); 68 | /* initialize a socket. 
*/ 69 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); 70 | #endif 71 | -------------------------------------------------------------------------------- /src/include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TIMER_H_ 8 | #define NCCL_TIMER_H_ 9 | #if ENABLE_TIMER 10 | #include 11 | #include 12 | #include 13 | static double freq = -1; 14 | static void calibrate() { 15 | struct timeval tv; 16 | gettimeofday(&tv, NULL); 17 | uint64_t timeCycles = __rdtsc(); 18 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 19 | uint64_t total = 0ULL; 20 | for (int i=0; i<10000; i++) total += __rdtsc(); 21 | gettimeofday(&tv, NULL); 22 | timeCycles = __rdtsc() - timeCycles; 23 | time += tv.tv_sec*1E6 + tv.tv_usec; 24 | freq = timeCycles/time; 25 | } 26 | static inline double gettime() { 27 | if (freq == -1) calibrate(); 28 | return __rdtsc()/freq; 29 | } 30 | static uint64_t counts[8]; 31 | static double times[8]; 32 | static double startTimes[8]; 33 | #define TIME_START(index) do { \ 34 | counts[index]++; \ 35 | startTimes[index] = gettime(); \ 36 | } while (0); 37 | 38 | #define TIME_STOP(index) do { \ 39 | times[index] += gettime() - startTimes[index]; \ 40 | } while (0); 41 | 42 | #define TIME_CANCEL(index) do { \ 43 | counts[index]--; \ 44 | } while (0); 45 | 46 | #define TIME_PRINT(name) do { \ 47 | printf("%s stats", name); \ 48 | for (int i=0; i<8; i++) { \ 49 | if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ 50 | counts[i] = 0; \ 51 | } \ 52 | printf("\n"); \ 53 | } while (0); 54 | #else 55 | 
#define TIME_START(index) while(0); 56 | #define TIME_STOP(index) while(0); 57 | #define TIME_CANCEL(index) while(0); 58 | #define TIME_PRINT(name) 59 | #endif 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/transport.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TRANSPORT_H_ 8 | #define NCCL_TRANSPORT_H_ 9 | 10 | #include "devcomm.h" 11 | #include "graph.h" 12 | #include "nvmlwrap.h" 13 | #include "core.h" 14 | 15 | #define NTRANSPORTS 4 16 | #define TRANSPORT_P2P 0 17 | #define TRANSPORT_SHM 1 18 | #define TRANSPORT_NET 2 19 | #define TRANSPORT_COLLNET 3 20 | 21 | #include "proxy.h" 22 | 23 | extern struct ncclTransport ncclTransports[]; 24 | 25 | // Forward declarations 26 | struct ncclRing; 27 | struct ncclConnector; 28 | struct ncclComm; 29 | 30 | struct ncclPeerInfo { 31 | int rank; 32 | int cudaDev; 33 | int netDev; 34 | int gdrSupport; 35 | uint64_t hostHash; 36 | uint64_t pidHash; 37 | dev_t shmDev; 38 | int64_t busId; 39 | struct ncclComm* comm; 40 | int cudaCompCap; 41 | }; 42 | 43 | #define CONNECT_SIZE 128 44 | struct ncclConnect { 45 | char data[CONNECT_SIZE]; 46 | }; 47 | 48 | struct ncclTransportComm { 49 | ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); 50 | ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); 51 | ncclResult_t (*free)(struct ncclConnector*); 52 | ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* 
comm, int nChannels); 53 | ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); 54 | ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); 55 | ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm); 56 | ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*); 57 | }; 58 | 59 | struct ncclTransport { 60 | const char name[4]; 61 | ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); 62 | struct ncclTransportComm send; 63 | struct ncclTransportComm recv; 64 | }; 65 | 66 | ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); 67 | ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); 68 | 69 | enum { collNetRecv=0, collNetSend=1 }; 70 | int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); 71 | ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); 72 | ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); 73 | #endif 74 | -------------------------------------------------------------------------------- /src/include/trees.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TREES_H_ 8 | #define NCCL_TREES_H_ 9 | 10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); 11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/include/utils.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_UTILS_H_ 8 | #define NCCL_UTILS_H_ 9 | 10 | #include "nccl.h" 11 | #include "checks.h" 12 | #include 13 | 14 | int ncclCudaCompCap(); 15 | 16 | // PCI Bus ID <-> int64 conversion functions 17 | ncclResult_t int64ToBusId(int64_t id, char* busId); 18 | ncclResult_t busIdToInt64(const char* busId, int64_t* id); 19 | 20 | ncclResult_t getBusId(int cudaDev, int64_t *busId); 21 | 22 | ncclResult_t getHostName(char* hostname, int maxlen, const char delim); 23 | uint64_t getHash(const char* string, int n); 24 | uint64_t getHostHash(); 25 | uint64_t getPidHash(); 26 | 27 | struct netIf { 28 | char prefix[64]; 29 | int port; 30 | }; 31 | 32 | int parseStringList(const char* string, struct netIf* ifList, int maxList); 33 | bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); 34 | 35 | static long log2i(long n) { 36 | long l = 0; 37 | while (n>>=1) l++; 38 | return l; 39 | } 40 | 41 | // Recyclable list that avoids frequent malloc/free 42 | template 43 | struct ncclListElem { 44 | T 
data; 45 | struct ncclListElem* next; 46 | }; 47 | 48 | template 49 | class ncclRecyclableList { 50 | private: 51 | struct ncclListElem* head; 52 | struct ncclListElem* tail; 53 | struct ncclListElem* cursor; 54 | int n; 55 | 56 | public: 57 | ncclRecyclableList() { 58 | tail = cursor = head = NULL; 59 | n = 0; 60 | } 61 | 62 | int count() const { return n; } 63 | 64 | // Get a new element from the list and return pointer 65 | ncclResult_t getNewElem(T** dataOut) { 66 | if (tail != NULL) { 67 | *dataOut = &tail->data; 68 | memset(*dataOut, 0, sizeof(T)); 69 | } else { 70 | NCCLCHECK(ncclCalloc(&tail, 1)); 71 | *dataOut = &tail->data; 72 | cursor = head = tail; 73 | } 74 | if (tail->next == NULL) { 75 | NCCLCHECK(ncclCalloc(&tail->next, 1)); 76 | } 77 | tail = tail->next; 78 | n += 1; 79 | return ncclSuccess; 80 | } 81 | 82 | T* begin() { 83 | if (head == NULL || head == tail) return NULL; 84 | cursor = head->next; 85 | return &head->data; 86 | } 87 | 88 | // Get next element from the list during an iteration 89 | T* getNext() { 90 | // tail always points to the next element to be enqueued 91 | // hence does not contain valid data 92 | if (cursor == NULL || cursor == tail) return NULL; 93 | T* rv = &cursor->data; 94 | cursor = cursor->next; 95 | return rv; 96 | } 97 | 98 | T* peakNext() { 99 | if (cursor == NULL || cursor == tail) return NULL; 100 | return &cursor->data; 101 | } 102 | 103 | // Recycle the list without freeing the space 104 | void recycle() { 105 | tail = cursor = head; 106 | n = 0; 107 | } 108 | 109 | ~ncclRecyclableList() { 110 | while (head != NULL) { 111 | struct ncclListElem* temp = head; 112 | head = head->next; 113 | free(temp); 114 | } 115 | } 116 | }; 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /src/misc/argcheck.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "argcheck.h" 8 | #include "comm.h" 9 | 10 | static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { 11 | cudaPointerAttributes attr; 12 | cudaError_t err = cudaPointerGetAttributes(&attr, pointer); 13 | if (err != cudaSuccess || attr.devicePointer == NULL) { 14 | WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); 15 | return ncclInvalidArgument; 16 | } 17 | #if CUDART_VERSION >= 10000 18 | if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 19 | #else 20 | if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 21 | #endif 22 | WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); 23 | return ncclInvalidArgument; 24 | } 25 | return ncclSuccess; 26 | } 27 | 28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { 29 | if (ptr == NULL) { 30 | WARN("%s : %s argument is NULL", opname, ptrname); 31 | return ncclInvalidArgument; 32 | } 33 | return ncclSuccess; 34 | } 35 | 36 | ncclResult_t ArgsCheck(struct ncclInfo* info) { 37 | // First, the easy ones 38 | if (info->root < 0 || info->root >= info->comm->nRanks) { 39 | WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); 40 | return ncclInvalidArgument; 41 | } 42 | if (info->datatype < 0 || info->datatype >= ncclNumTypes) { 43 | WARN("%s : invalid type %d", info->opName, info->datatype); 44 | return ncclInvalidArgument; 45 | } 46 | // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
47 | info->nBytes = info->count * ncclTypeSize(info->datatype); 48 | if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAll) { 49 | info->count = info->nBytes; 50 | info->datatype = ncclInt8; 51 | } 52 | if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter || info->coll == ncclFuncAllToAll) info->nBytes *= info->comm->nRanks; // count is per rank 53 | 54 | if (info->op < 0 || ncclMaxRedOp < info->op) { 55 | WARN("%s : invalid reduction operation %d", info->opName, info->op); 56 | return ncclInvalidArgument; 57 | } 58 | int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); 59 | if (ncclNumOps <= info->op && 60 | (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { 61 | WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); 62 | return ncclInvalidArgument; 63 | } 64 | 65 | if (info->comm->checkPointers) { 66 | if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { 67 | if (info->count >0) 68 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); 69 | } else { 70 | // Check CUDA device pointers 71 | if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { 72 | NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); 73 | } 74 | if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { 75 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); 76 | } 77 | } 78 | } 79 | return ncclSuccess; 80 | } 81 | -------------------------------------------------------------------------------- /src/misc/param.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "param.h" 8 | #include "debug.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const char* userHomeDir() { 21 | struct passwd *pwUser = getpwuid(getuid()); 22 | return pwUser == NULL ? NULL : pwUser->pw_dir; 23 | } 24 | 25 | void setEnvFile(const char* fileName) { 26 | FILE * file = fopen(fileName, "r"); 27 | if (file == NULL) return; 28 | 29 | char *line = NULL; 30 | char envVar[1024]; 31 | char envValue[1024]; 32 | size_t n = 0; 33 | ssize_t read; 34 | while ((read = getline(&line, &n, file)) != -1) { 35 | if (line[read-1] == '\n') line[read-1] = '\0'; 36 | int s=0; // Env Var Size 37 | while (line[s] != '\0' && line[s] != '=') s++; 38 | if (line[s] == '\0') continue; 39 | strncpy(envVar, line, std::min(1023,s)); 40 | envVar[s] = '\0'; 41 | s++; 42 | strncpy(envValue, line+s, 1023); 43 | envValue[1023]='\0'; 44 | setenv(envVar, envValue, 0); 45 | //printf("%s : %s->%s\n", fileName, envVar, envValue); 46 | } 47 | if (line) free(line); 48 | fclose(file); 49 | } 50 | 51 | void initEnv() { 52 | char confFilePath[1024]; 53 | const char * userDir = userHomeDir(); 54 | if (userDir) { 55 | sprintf(confFilePath, "%s/.nccl.conf", userDir); 56 | setEnvFile(confFilePath); 57 | } 58 | sprintf(confFilePath, "/etc/nccl.conf"); 59 | setEnvFile(confFilePath); 60 | } 61 | 62 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { 63 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 64 | pthread_mutex_lock(&mutex); 65 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { 66 | char* str = getenv(env); 67 | int64_t value = deftVal; 68 | if (str && strlen(str) > 0) { 69 | errno = 0; 70 | value = strtoll(str, nullptr, 0); 71 | if (errno) { 72 | value = deftVal; 73 | 
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); 74 | } else { 75 | INFO(NCCL_ALL,"%s set by environment to %lld.", env, (long long)value); 76 | } 77 | } 78 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 79 | } 80 | pthread_mutex_unlock(&mutex); 81 | } 82 | -------------------------------------------------------------------------------- /src/misc/profiler.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "profiler.h" 8 | 9 | //#define PROFILE_PROXY 1 10 | #ifdef PROFILE_PROXY 11 | #include "timer.h" 12 | #include "alloc.h" 13 | 14 | static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; 15 | static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; 16 | static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; 17 | struct ncclProxyProfileEvent { 18 | double timestamp[6]; 19 | uint64_t opCount; 20 | int peer; 21 | int step; 22 | uint16_t channel; 23 | uint8_t type; // send / recv 24 | uint8_t opIndex; 25 | }; 26 | 27 | struct ncclProxyProfileEvent* profilingEvents = NULL; 28 | int profilingIndex = 0; 29 | double profilingStart = 0; 30 | #define MAX_EVENTS 200000 31 | 32 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { 33 | if (profilingEvents == NULL) { 34 | NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); 35 | profilingStart = gettime(); 36 | } 37 | struct ncclProxyProfileEvent* event = NULL; 38 | if (state%8 == 0) { 39 | if (profilingIndex == MAX_EVENTS) return ncclSuccess; 40 | args->subs[sub].profilingEvents[step%NCCL_STEPS] = 
event = profilingEvents+profilingIndex++; 41 | if (state == ncclProxyProfileBegin) { 42 | // Proxy operation information 43 | event->opCount = args->opCount; 44 | event->channel = args->subs[sub].channelId; 45 | event->peer = args->subs[sub].peer; 46 | event->type = args->pattern; 47 | event->step = step; 48 | event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; 49 | } else event->peer = -state; 50 | } else { 51 | event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; 52 | if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; 53 | if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; 54 | } 55 | // Timestamp 56 | event->timestamp[state%8] = gettime()-profilingStart; 57 | return ncclSuccess; 58 | } 59 | 60 | void ncclProfilingDump() { 61 | static int dumpDone = 0; 62 | if (dumpDone) return; 63 | dumpDone = 1; 64 | const char* str = getenv("NCCL_PROXY_PROFILE"); 65 | if (!str) { free(profilingEvents); return; } 66 | FILE* f = fopen(str, "w"); 67 | fprintf(f, "[\n"); 68 | 69 | for (int i=0; ipeer >= 0; 72 | const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") : 73 | profilingEventStr[-(e->peer/8)]; 74 | 75 | 76 | if (sendrecv) { 77 | int state = ncclProxyProfileBegin; 78 | const char** stateStr = e->type == ncclPatternSend ? 
profilingStateSendStr : profilingStateRecvStr; 79 | fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", 80 | typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); 81 | 82 | while (statetimestamp[state]) { 84 | const char* name = stateStr[state]; 85 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", 86 | name, i, e->channel, e->timestamp[state]); 87 | state++; 88 | while (e->timestamp[state] == 0) state++; 89 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", 90 | name, i, e->channel, e->timestamp[state]); 91 | } 92 | } 93 | 94 | fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", 95 | typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); 96 | } else { 97 | if (e->peer == -ncclProxyProfileAppend) { 98 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n", 99 | typeStr, i, e->timestamp[0], e->opCount); 100 | } else { 101 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", 102 | typeStr, i, e->timestamp[0]); 103 | } 104 | fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", 105 | typeStr, i, e->timestamp[1]); 106 | } 107 | } 108 | fprintf(f, "{} ]\n"); 109 | fclose(f); 110 | free(profilingEvents); 111 | } 112 | #else 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } 114 | void ncclProfilingDump() {} 115 | #endif 116 | 
-------------------------------------------------------------------------------- /src/misc/shmutils.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "shm.h" 8 | #include "checks.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | // Change functions behavior to match other SYS functions 19 | static int shm_allocate(int fd, const int shmSize) { 20 | int err = posix_fallocate(fd, 0, shmSize); 21 | if (err) { errno = err; return -1; } 22 | return 0; 23 | } 24 | static int shm_map(int fd, const int shmSize, void** ptr) { 25 | *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 26 | return (*ptr == MAP_FAILED) ? 
-1 : 0; 27 | } 28 | 29 | static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) { 30 | if (create) { 31 | if (shmPath[0] == '\0') { 32 | sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); 33 | *fd = mkstemp(shmPath); 34 | } else { 35 | SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); 36 | } 37 | if (ftruncate(*fd, shmSize) != 0) { 38 | WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize); 39 | return ncclSystemError; 40 | } 41 | } else { 42 | SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); 43 | } 44 | *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); 45 | if (*ptr == NULL) { 46 | WARN("Could not map %s\n", shmPath); 47 | return ncclSystemError; 48 | } 49 | close(*fd); 50 | *fd = -1; 51 | if (create) memset(*ptr, 0, shmSize); 52 | return ncclSuccess; 53 | } 54 | 55 | ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) { 56 | int fd = -1; 57 | void* ptr = MAP_FAILED; 58 | ncclResult_t res = ncclSuccess; 59 | 60 | NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError); 61 | if (devShmPtr) { 62 | CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError); 63 | CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError); 64 | } 65 | 66 | *shmPtr = ptr; 67 | return ncclSuccess; 68 | sysError: 69 | WARN("Error while %s shared memory segment %s (size %d)", create ? 
"creating" : "attaching to", shmPath, shmSize); 70 | cudaError: 71 | if (fd != -1) close(fd); 72 | if (create) shm_unlink(shmPath); 73 | if (ptr != MAP_FAILED) munmap(ptr, shmSize); 74 | *shmPtr = NULL; 75 | return res; 76 | } 77 | 78 | ncclResult_t ncclShmUnlink(const char* shmPath) { 79 | if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink"); 80 | return ncclSuccess; 81 | } 82 | 83 | ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { 84 | if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); 85 | if (munmap(shmPtr, shmSize) != 0) { 86 | WARN("munmap of shared memory failed"); 87 | return ncclSystemError; 88 | } 89 | return ncclSuccess; 90 | } 91 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | --------------------------------------------------------------------------------