├── pkg ├── debian │ ├── compat │ ├── copyright │ ├── source │ │ └── format │ ├── .gitignore │ ├── libnccl2.install.in │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── changelog.in │ ├── rules │ ├── control.in │ └── Makefile ├── Makefile ├── txz │ ├── create_txz.sh.in │ └── Makefile ├── srctxz │ ├── create_srctxz.sh.in │ └── Makefile └── redhat │ ├── Makefile │ └── nccl.spec.in ├── ext-tuner └── example │ ├── src │ ├── include │ │ ├── optimizers │ │ │ ├── co_optimizer.h │ │ │ ├── het_optimizer.h │ │ │ ├── unified_optimizer.h │ │ │ └── optimizer.h │ │ ├── timer.h │ │ ├── jobs │ │ │ ├── bruteforce_job.h │ │ │ └── native_job.h │ │ └── internal │ │ │ ├── threadsafe_queue.h │ │ │ ├── env.h │ │ │ └── database.h │ └── cuda │ │ ├── collectives │ │ └── util.h │ │ ├── nccl_params.h │ │ └── nccl_socket.h │ ├── example │ └── cuda │ │ └── pytorch │ │ ├── whitelistrules.txt │ │ ├── whitelistcases.txt │ │ ├── run.sh │ │ └── demo.py │ ├── test │ └── cuda │ │ ├── test_loader.cc │ │ └── tuner_loader.h │ ├── README.md │ ├── plugin │ └── cuda │ │ ├── wrapper.cc │ │ ├── wrapper.py │ │ └── plugin.h │ ├── .clang-tidy │ ├── utils │ └── get_candidates.py │ └── .clang-format ├── .gitignore ├── makefiles ├── version.mk ├── formatting.mk └── common.mk ├── .gitmodules ├── src ├── nccl.pc.in ├── graph │ ├── rings.h │ ├── rings.cc │ └── trees.cc ├── collectives │ ├── device │ │ ├── reduce.cu │ │ ├── sendrecv.cu │ │ ├── all_gather.cu │ │ ├── all_reduce.cu │ │ ├── broadcast.cu │ │ ├── reduce_scatter.cu │ │ ├── gen_rules.sh │ │ ├── onerank_reduce.cu │ │ ├── Makefile │ │ ├── broadcast.h │ │ └── reduce.h │ ├── all_gather.cc │ ├── all_reduce.cc │ ├── reduce_scatter.cc │ ├── reduce.cc │ ├── broadcast.cc │ └── sendrecv.cc ├── include │ ├── npkit │ │ ├── npkit_struct.h │ │ └── npkit.h │ ├── argcheck.h │ ├── trees.h │ ├── shm.h │ ├── net.h │ ├── tuner.h │ ├── p2p.h │ ├── profiler.h │ ├── param.h │ ├── ipcsocket.h │ ├── enqueue.h │ ├── align.h │ ├── nvtx3 │ │ ├── nvtxExtDetail │ │ │ ├── nvtxExtTypes.h │ │ 
│ ├── nvtxExtImpl.h │ │ │ └── nvtxExtImplPayload_v1.h │ │ ├── nvtxDetail │ │ │ ├── nvtxImplCudaRt_v3.h │ │ │ ├── nvtxImplSync_v3.h │ │ │ ├── nvtxImplCuda_v3.h │ │ │ └── nvtxLinkOnce.h │ │ └── nvToolsExtCudaRt.h │ ├── debug.h │ ├── cpuset.h │ ├── bootstrap.h │ ├── timer.h │ ├── core.h │ ├── channel.h │ ├── ibvsymbols.h │ ├── coll_net.h │ ├── nvtx.h │ ├── nccl_tuner.h │ ├── socket.h │ └── info.h ├── init_nvtx.cc ├── enhcompat.cc └── misc │ ├── param.cc │ ├── argcheck.cc │ └── tuner.cc ├── ext-net ├── example │ ├── Makefile │ └── nccl │ │ ├── err.h │ │ ├── types.h │ │ ├── net.h │ │ ├── net_v3.h │ │ ├── net_v2.h │ │ ├── net_v5.h │ │ ├── net_v4.h │ │ └── net_v6.h └── google-fastsocket │ └── Makefile ├── Makefile ├── LICENSE.txt └── README.md /pkg/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | ../../LICENSE.txt -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/co_optimizer.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/het_optimizer.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/unified_optimizer.h: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /ext-tuner/example/example/cuda/pytorch/whitelistrules.txt: -------------------------------------------------------------------------------- 1 | ;;;0 16; -------------------------------------------------------------------------------- /ext-tuner/example/example/cuda/pytorch/whitelistcases.txt: -------------------------------------------------------------------------------- 1 | -1;-1 4 1 ; 2 | -------------------------------------------------------------------------------- /pkg/debian/.gitignore: -------------------------------------------------------------------------------- 1 | /*.debhelper.log 2 | /*.debhelper 3 | /*.substvars 4 | /tmp/ 5 | /files 6 | /libnccl1/ 7 | /libnccl-dev/ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 
2 | /build 3 | *.gcov 4 | /coverage/ 5 | build 6 | __pycache__ 7 | -------------------------------------------------------------------------------- /makefiles/version.mk: -------------------------------------------------------------------------------- 1 | ##### version 2 | NCCL_MAJOR := 2 3 | NCCL_MINOR := 18 4 | NCCL_PATCH := 3 5 | NCCL_SUFFIX := 6 | PKG_REVISION := 1 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext-tuner/example/3rdparty/cereal"] 2 | path = ext-tuner/example/3rdparty/cereal 3 | url = https://github.com/USCiLab/cereal 4 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | include/nccl.h /usr/include 2 | include/nccl_net.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | -------------------------------------------------------------------------------- /pkg/debian/changelog.in: -------------------------------------------------------------------------------- 1 | nccl 
(${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium 2 | 3 | * Automatic Debian package from build 4 | 5 | -- cudatools ${pkg:Timestamp} 6 | -------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /src/graph/rings.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next); 8 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(Reduce); 12 | -------------------------------------------------------------------------------- /ext-tuner/example/test/cuda/test_loader.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "plugin/cuda/plugin.h" 5 | #include "test/cuda/tuner_loader.h" 6 | 7 | void testLoader() { 8 | ncclTuner_t *tuner = nullptr; 9 | ncclLoadTunerPlugin(&tuner); 10 | ncclCloseTunerPlugin(&tuner); 11 | } 12 | 13 | int main() { 14 | testLoader(); 15 | printf("test_loader pass\n"); 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /src/collectives/device/sendrecv.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "sendrecv.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_P(SendRecv); 12 | -------------------------------------------------------------------------------- /src/collectives/device/all_gather.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "all_gather.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_C(AllGather); 12 | -------------------------------------------------------------------------------- /src/collectives/device/all_reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "all_reduce.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(AllReduce); 12 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "broadcast.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_C(Broadcast); 12 | -------------------------------------------------------------------------------- /src/collectives/device/reduce_scatter.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce_scatter.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(ReduceScatter); 12 | -------------------------------------------------------------------------------- /ext-net/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_struct.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_STRUCT_H_ 2 | #define NPKIT_STRUCT_H_ 3 | 4 | #include 5 | 6 | #pragma pack(push, 1) 7 | 8 | union NpKitEvent { 9 | uint64_t bits[2]; 10 | struct { 11 | uint64_t type : 8; 12 | uint64_t size : 32; 13 | uint64_t rsvd : 24; 14 | uint64_t timestamp; 15 | } fields; 16 | }; 17 | 18 | struct NpKitEventCollectContext { 19 | NpKitEvent* event_buffer; 20 | uint64_t event_buffer_head; 21 | }; 22 | 23 | #pragma pack(pop) 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /ext-net/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclRemoteError = 6 } ncclResult_t; 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/include/argcheck.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ARGCHECK_H_ 8 | #define NCCL_ARGCHECK_H_ 9 | 10 | #include "core.h" 11 | #include "info.h" 12 | 13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); 14 | ncclResult_t ArgsCheck(struct ncclInfo* info); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "src/include/datatype.h" 4 | 5 | class Timer { 6 | public: 7 | virtual void begin(const RecordKey &recordKey, bool blocking, 8 | void *context) = 0; 9 | virtual void end(GIDTYPE groupID, bool blocking) = 0; 10 | virtual void start() = 0; 11 | virtual void stop() = 0; 12 | virtual void tryGetRecords(std::vector *records, bool blocking) = 0; 13 | virtual void setProfiling(const Workload &workload, 14 | int32_t askedProfiling) = 0; 15 | }; 16 | -------------------------------------------------------------------------------- /ext-net/google-fastsocket/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME?=/usr/local/cuda 2 | INC:=-I$(CUDA_HOME)/include 3 | PLUGIN_SO:=libnccl-net.so 4 | 5 | default: $(PLUGIN_SO) 6 | 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc 8 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 9 | 10 | nccl-fastsocket/*.cc: 11 | git clone https://github.com/google/nccl-fastsocket.git 12 | 13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) 14 | 15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) 16 | @printf "Grabbing %-35s > %s\n" $< $@ 17 | mkdir -p $(BUILDDIR)/lib 18 | install -m 644 $< $@ 19 | 20 | clean: 21 | rm -f $(PLUGIN_SO) 22 | rm -Rf nccl-fastsocket 23 | -------------------------------------------------------------------------------- /src/include/trees.h: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TREES_H_ 8 | #define NCCL_TREES_H_ 9 | 10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); 11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /pkg/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : build 9 | build : debian.build txz.build 10 | 11 | BUILDDIR ?= $(abspath ../build) 12 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 13 | TARGETS := debian txz 14 | all: ${TARGETS:%=%.build} 15 | prep: ${TARGETS:%=%.prep} 16 | build: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.prep: 20 | ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} 21 | 22 | %.build: 23 | ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} 24 | 25 | %.clean: 26 | ${MAKE} -C $* clean 27 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHM_H_ 8 | #define NCCL_SHM_H_ 9 | 10 | #include "nccl.h" 11 | 12 | typedef void* ncclShmHandle_t; 13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); 14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle); 15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | } ncclDataType_t; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 
13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /ext-tuner/example/example/cuda/pytorch/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | THIS_PATH=$(readlink -f "$0") 4 | THIS_DIR=$(dirname "$THIS_PATH") 5 | NCCL_HOME="$THIS_DIR/../../../../../build" 6 | TUNER_HOME="$THIS_DIR/../../../build" 7 | 8 | export TUNER_MAXCHANNELS=32 9 | export TUNER_P2P_NCHANNELS=2 10 | export 
TUNER_WHITELIST_CASES_FILE="./whitelistcases.txt" 11 | export TUNER_WHITELIST_RULES_FILE="./whitelistrules.txt" 12 | export NCCL_TIMEOUT=3600 13 | export TUNER_PRETRAIN_STEPS=360 14 | export TUNER_TRAIN_STEPS=240 15 | export TUNER_PROFILE_REPEAT=5 16 | export TUNER_COORDINATOR=localhost:12449 17 | export TUNER_WORLDSIZE=8 18 | export NCCL_TUNER_PLUGIN=${TUNER_HOME}/libnccl-plugin.so 19 | export LD_PRELOAD=${NCCL_HOME}/lib/libnccl.so:$LD_PRELOAD 20 | export LD_LIBRARY_PATH=${NCCL_HOME}/lib:${TUNER_HOME}:$LD_LIBRARY_PATH 21 | 22 | torchrun --nproc_per_node 8 --nnodes 1 --node_rank 0 demo.py 23 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "comm.h" 13 | #include "checks.h" 14 | 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetPluginInit(); 18 | ncclResult_t ncclNetInit(struct ncclComm* comm); 19 | int ncclNetVersion(struct ncclComm* comm); 20 | 21 | // Test whether the current GPU support GPU Direct RDMA. 22 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); 23 | 24 | extern ncclNet_t ncclNetIb; 25 | extern ncclNet_t ncclNetSocket; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /src/include/tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INT_TUNER_H_ 9 | #define NCCL_INT_TUNER_H_ 10 | 11 | #include "nccl_tuner.h" 12 | 13 | // Tuning plugin to override NCCL's default algorithm/protocol tuning. 14 | 15 | // Attempts to load NCCL tuner from environmental variable. 16 | // Returns ncclSuccess if the correct tuner symbol has been found and 17 | // successully loaded. Otherwise returns an error and also logs the error. 18 | ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); 19 | 20 | // Cleans up NCCL tuner plugin. 21 | ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); 22 | #endif -------------------------------------------------------------------------------- /src/init_nvtx.cc: -------------------------------------------------------------------------------- 1 | #include "nccl.h" 2 | #include "nvtx.h" 3 | 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { 5 | {"Sum", ncclSum}, 6 | {"Product", ncclProd}, 7 | {"Max", ncclMax}, 8 | {"Min", ncclMin}, 9 | {"Avg", ncclAvg} 10 | }; 11 | 12 | // Must be called before the first call to any reduction operation. 
13 | void initNvtxRegisteredEnums() { 14 | // Register schemas and strings 15 | constexpr const nvtxPayloadEnumAttr_t eAttr { 16 | .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | 17 | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, 18 | .name = NULL, 19 | .entries = NvtxEnumRedSchema, 20 | .numEntries = std::extent::value, 21 | .sizeOfEnum = sizeof(ncclRedOp_t), 22 | .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP 23 | }; 24 | 25 | nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); 26 | } 27 | -------------------------------------------------------------------------------- /ext-tuner/example/test/cuda/tuner_loader.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INT_TUNER_H_ 9 | #define NCCL_INT_TUNER_H_ 10 | 11 | #include "plugin/cuda/plugin.h" 12 | 13 | // Tuning plugin to override NCCL's default algorithm/protocol tuning. 14 | 15 | // Attempts to load NCCL tuner from environmental variable. 16 | // Returns ncclSuccess if the correct tuner symbol has been found and 17 | // successully loaded. Otherwise returns an error and also logs the error. 18 | ncclResult_t ncclLoadTunerPlugin(ncclTuner_t **tuner); 19 | 20 | // Cleans up NCCL tuner plugin. 21 | ncclResult_t ncclCloseTunerPlugin(ncclTuner_t **tuner); 22 | #endif 23 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. 11 | NCCLDIR=`basename $PWD` 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommited files ..." 17 | if [ "`git status -s | wc -l`" != "0" ]; then 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read 21 | fi 22 | 23 | cd .. 24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR 36 | -------------------------------------------------------------------------------- /ext-tuner/example/src/cuda/collectives/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/cuda/nccl_params.h" 6 | 7 | int effective2Wire(int effectiveChunksize, int proto) { 8 | int wireChunksize = -1; 9 | if (proto == NCCL_PROTO_LL) 10 | wireChunksize = static_cast(effectiveChunksize * 2); 11 | else if (proto == NCCL_PROTO_LL128) 12 | wireChunksize = static_cast(effectiveChunksize * 16 / 15); 13 | else if (proto == NCCL_PROTO_SIMPLE) 14 | wireChunksize = effectiveChunksize; 15 | return wireChunksize; 16 | } 17 | int wire2Effective(int wireChunksize, int proto) { 18 | int effectiveChunksize = -1; 19 | if (proto == NCCL_PROTO_LL) 20 | effectiveChunksize = static_cast(wireChunksize / 2); 21 | else if (proto == NCCL_PROTO_LL128) 22 | effectiveChunksize = static_cast(wireChunksize * 15 / 16); 23 | else if (proto == NCCL_PROTO_SIMPLE) 24 | effectiveChunksize = wireChunksize; 25 | return 
effectiveChunksize; 26 | } 27 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | #define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR 13 | 14 | typedef struct { 15 | int data; // Currently only support an fd based descriptor 16 | } ncclCuDesc; 17 | 18 | typedef union { 19 | // Legacy CUDA IPC 20 | cudaIpcMemHandle_t devIpc; 21 | // cuMem API support 22 | ncclCuDesc cuDesc; 23 | } ncclIpcDesc; 24 | 25 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); 26 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); 27 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | #include "proxy.h" 11 | 12 | enum ncclProxyProfileState { 13 | ncclProxyProfileBegin = 0, 14 | 15 | ncclProxyProfileSendGPUWait = 1, 16 | ncclProxyProfileSendWait = 2, 17 | 18 | ncclProxyProfileRecvWait = 1, 19 | ncclProxyProfileRecvFlushWait = 2, 20 | ncclProxyProfileRecvGPUWait = 3, 21 | 22 | ncclProxyProfileEnd = 4, 23 | 24 | ncclProxyProfileSleep = 8, 25 | ncclProxyProfileWakeup = 9, 26 | 27 | ncclProxyProfileIdle = 16, 28 | ncclProxyProfileActive = 17, 29 | 30 | ncclProxyProfileAppend = 24, 31 | ncclProxyProfileAppendEnd = 25 32 | }; 33 | 34 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); 35 | void ncclProfilingDump(); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
/*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_PARAM_H_
#define NCCL_PARAM_H_

#include <stdint.h>  // NOTE(review): include target lost in extraction; int64_t/INT64_MIN come from <stdint.h> -- confirm against upstream.

// Returns the current user's home directory path.
const char* userHomeDir();
// Apply environment settings read from `fileName`.
void setEnvFile(const char* fileName);
// One-time environment initialization.
void initEnv();

// Resolve parameter `env` into `*cache`: reads the environment, falling back
// to `deftVal`; `uninitialized` is the sentinel marking "not yet loaded".
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);

// NCCL_PARAM(name, env, deftVal) defines `int64_t ncclParam<name>()`:
// a lazily-initialized, cached read of the "NCCL_<env>" environment variable
// with default `deftVal`. The fast path is a single relaxed atomic load of
// the cache; first use may race, and ncclLoadParam is expected to make
// concurrent initialization safe (see its definition).
#define NCCL_PARAM(name, env, deftVal) \
  int64_t ncclParam##name() { \
    constexpr int64_t uninitialized = INT64_MIN; \
    static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
    static int64_t cache = uninitialized; \
    if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
      ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \
    } \
    return cache; \
  }

#endif
3 | */ 4 | 5 | #ifndef NCCL_NET_H_ 6 | #define NCCL_NET_H_ 7 | 8 | #include 9 | #include 10 | 11 | #include "err.h" 12 | 13 | #define NCCL_NET_HANDLE_MAXSIZE 128 14 | 15 | #define NCCL_PTR_HOST 0x1 16 | #define NCCL_PTR_CUDA 0x2 17 | #define NCCL_PTR_DMABUF 0x4 18 | 19 | // Maximum number of requests per comm object 20 | #define NCCL_NET_MAX_REQUESTS 8 21 | 22 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 23 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; 24 | 25 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 26 | 27 | #include "net_v6.h" 28 | #include "net_v5.h" 29 | #include "net_v4.h" 30 | #include "net_v3.h" 31 | #include "net_v2.h" 32 | 33 | #endif // end include guard 34 | -------------------------------------------------------------------------------- /src/include/ipcsocket.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See COPYRIGHT for license information 5 | */ 6 | 7 | #ifndef NCCL_IPCSOCKET_H 8 | #define NCCL_IPCSOCKET_H 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define NCCL_IPC_SOCKNAME_LEN 64 24 | 25 | struct ncclIpcSocket { 26 | int fd; 27 | char socketName[NCCL_IPC_SOCKNAME_LEN]; 28 | volatile uint32_t* abortFlag; 29 | }; 30 | 31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); 32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); 33 | 34 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); 35 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); 36 | 37 | #endif /* NCCL_IPCSOCKET_H */ 38 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /src/collectives/all_gather.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, 11 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, 13 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { 14 | // Just pass the size of one message and not the total bytes sent/received. 15 | constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { 16 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} 17 | }; 18 | size_t msgsize = sendcount * ncclTypeSize(datatype); 19 | NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) 20 | 21 | struct ncclInfo info = { ncclFuncAllGather, "AllGather", 22 | sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ 23 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; 24 | return ncclEnqueueCheck(&info); 25 | } 26 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 
10 | 11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none 12 | ASTYLEDIR := $(BUILDDIR)/contrib 13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz 14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle 15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ 16 | ASTYLEVER := 3.1 17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" 18 | 19 | $(ASTYLEDIR) : 20 | @mkdir -p $(ASTYLEDIR) 21 | 22 | $(ASTYLETAR) : $(ASTYLEDIR) 23 | @wget -q -O $(ASTYLETAR) $(ASTYLEURL) 24 | 25 | $(ASTYLEBLD) : $(ASTYLETAR) 26 | @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) 27 | 28 | $(ASTYLEBIN) : $(ASTYLEBLD) 29 | ${MAKE} -C $(ASTYLEBLD) 30 | 31 | .PHONY : format 32 | format : $(ASTYLEBIN) 33 | @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) 34 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | #include "utils.h" 14 | 15 | #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) 16 | #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ 17 | 18 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); 19 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 20 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); 21 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 22 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); 23 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 24 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm); 25 | ncclResult_t chooseTransport(struct ncclComm* comm, int channelId, int peer, uint8_t isCopyEngineNotSmCopy, uint8_t p2pLevel, uint8_t* transportIndex); 26 | 27 | #endif // End include guard 28 | -------------------------------------------------------------------------------- /pkg/debian/control.in: -------------------------------------------------------------------------------- 1 | Source: nccl 2 | Section: libs 3 | Maintainer: cudatools 4 | Priority: optional 5 | Build-depends: debhelper(>=9) 6 | Standards-Version: 3.9.5 7 | 8 | Package: libnccl${nccl:Major} 9 | Section: libs 10 | Architecture: ${pkg:Arch} 11 | Depends: ${misc:Depends}, ${shlibs:Depends} 12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | broadcast, and reduce-scatter. 
16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | Package: libnccl-dev 21 | Section: libdevel 22 | Architecture: ${pkg:Arch} 23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) 24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files 25 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 26 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 27 | broadcast, and reduce-scatter. 28 | It has been optimized to achieve high bandwidth on any platform using PCIe, 29 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 30 | sockets. 31 | -------------------------------------------------------------------------------- /src/include/align.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALIGN_H_ 8 | #define NCCL_ALIGN_H_ 9 | 10 | #define DIVUP(x, y) \ 11 | (((x)+(y)-1)/(y)) 12 | 13 | #define ROUNDUP(x, y) \ 14 | (DIVUP((x), (y))*(y)) 15 | 16 | #define ALIGN_POWER(x, y) \ 17 | ((x) > (y) ? 
// NOTE(review): the extraction that produced this dump stripped all
// angle-bracket text in this header (template parameter lists, #include
// targets, and the first line of ALIGN_POWER). The template headers below
// are reconstructed -- confirm against the upstream file.
#ifndef NCCL_ALIGN_H_
#define NCCL_ALIGN_H_

// If x > y, round x up to a multiple of y; otherwise return the largest
// value <= y of the form y/k that is >= x (used for power-of-two alignment).
#define ALIGN_POWER(x, y) \
  ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))

// Statement macro: rounds `size` up to a multiple of `align` in place.
#define ALIGN_SIZE(size, align) \
  size = ((size + (align) - 1) / (align)) * (align);

// Host-only builds: make __host__/__device__ no-ops so the helpers below
// compile with a plain C++ compiler.
#if !__CUDA_ARCH__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif

// divUp(x, y): ceiling division of x by y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z divUp(X x, Y y) {
  return (x+y-1)/y;
}

// roundUp(x, y): round x up to the next multiple of y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundUp(X x, Y y) {
  return (x+y-1) - (x+y-1)%y;
}

// alignUp(x, a): round x up to the next multiple of a.
// Assumes the second argument is a power of 2.
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignUp(X x, int a) {
  return (x+a-1) & Z(-a);
}

#endif
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /src/collectives/device/gen_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
#!/bin/bash
#
# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

# Emit (to stdout) makefile rules that instantiate each collective source
# file once per (reduction op, datatype) pair.
#   $1         : output/object directory
# Environment:
#   CUDA_MAJOR : CUDA major version; bf16 is added only when >= 11.

dir=$1

datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
# Default to 0 when CUDA_MAJOR is unset/empty, otherwise `[ -ge ]` aborts
# with "integer expression expected".
if [ "${CUDA_MAJOR:-0}" -ge 11 ]
then
  datatypes+=" bf16"
fi

targets="GENOBJS := \\\\\n"

for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
  opn=0
  for op in sum prod min max premulsum sumpostdiv; do
    dtn=0
    # Order must match that of the ncclDataType_t enum
    for dt in ${datatypes}; do
      # Generate a unique filename for each compilation unit,
      # otherwise the __nv_module_id may conflict at link time
      echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
      echo "	@printf \"Copying %-35s > %s\\\\n\" \$< \$@"
      echo "	cp \$< \$@"
      echo ""
      # Compile the file
      echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
      echo "	@printf \"Compiling  %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
      echo "	mkdir -p ${dir}"
      echo "	\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
      echo ""
      targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
      dtn=$(($dtn + 1))
    done
    opn=$(($opn + 1))
  done
done
echo -e "$targets"
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "nccl.h" 9 | 10 | NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, 11 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 12 | ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, 13 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 14 | struct NvtxParamsAllReduce { 15 | size_t bytes; 16 | ncclRedOp_t op; 17 | }; 18 | // Just pass the size of one message and not the total bytes sent/received. 19 | static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { 20 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, 21 | {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, 22 | offsetof(NvtxParamsAllReduce, op)} 23 | }; 24 | NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; 25 | NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) 26 | 27 | struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", 28 | sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ 29 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; 30 | return ncclEnqueueCheck(&info); 31 | } 32 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
/*************************************************************************
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

/* Define weak symbols used to allow libnccl_static.a to work with older
 * libcudart_static.a: a real runtime that provides these entry points
 * overrides the weak stubs; otherwise the stubs return cudaErrorStubLibrary
 * so callers can detect the missing functionality. */

enum cudaError_t { cudaErrorStubLibrary = 34 };

extern "C" {

/* One weak, hidden stub per CUDA runtime entry point that may be absent
 * from older libcudart_static.a. The macro keeps the attribute spelling
 * identical for every stub (previously five hand-copied pairs). */
#define NCCL_WEAK_CUDART_STUB(fn) \
  cudaError_t fn(...) __attribute__((visibility("hidden"))) __attribute((weak)); \
  cudaError_t fn(...) { return cudaErrorStubLibrary; }

NCCL_WEAK_CUDART_STUB(cudaStreamGetCaptureInfo_v2)
NCCL_WEAK_CUDART_STUB(cudaUserObjectCreate)
NCCL_WEAK_CUDART_STUB(cudaGraphRetainUserObject)
NCCL_WEAK_CUDART_STUB(cudaStreamUpdateCaptureDependencies)
NCCL_WEAK_CUDART_STUB(cudaGetDriverEntryPoint)

#undef NCCL_WEAK_CUDART_STUB

}
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "nccl.h" 10 | 11 | NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, 12 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 13 | ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, 14 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 15 | struct NvtxParamsReduceScatter { 16 | size_t bytes; 17 | ncclRedOp_t op; 18 | }; 19 | constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { 20 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, 21 | {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, 22 | offsetof(NvtxParamsReduceScatter, op)} 23 | }; 24 | NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; 25 | NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) 26 | 27 | struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", 28 | sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ 29 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; 30 | return ncclEnqueueCheck(&info); 31 | } 32 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /* This header defines types which are used by the internal implementation 10 | * of NVTX and callback subscribers. 
API clients do not use these types, 11 | * so they are defined here instead of in nvToolsExt.h to clarify they are 12 | * not part of the NVTX client API. */ 13 | 14 | #ifndef NVTXEXTTYPES_H 15 | #define NVTXEXTTYPES_H 16 | 17 | #ifndef NVTX_EXT_TYPES_GUARD 18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. 19 | #endif 20 | 21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); 22 | 23 | typedef struct nvtxExtModuleSegment_t 24 | { 25 | size_t segmentId; 26 | size_t slotCount; 27 | intptr_t* functionSlots; 28 | } nvtxExtModuleSegment_t; 29 | 30 | typedef struct nvtxExtModuleInfo_t 31 | { 32 | uint16_t nvtxVer; 33 | uint16_t structSize; 34 | uint16_t moduleId; 35 | uint16_t compatId; 36 | size_t segmentsCount; 37 | nvtxExtModuleSegment_t* segments; 38 | NvtxExtGetExportFunction_t getExportFunction; 39 | const void* extInfo; 40 | } nvtxExtModuleInfo_t; 41 | 42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); 43 | 44 | #endif /* NVTXEXTTYPES_H */ -------------------------------------------------------------------------------- /src/collectives/reduce.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "nccl.h" 10 | 11 | NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, 12 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); 13 | ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, 14 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 15 | struct NvtxParamsReduce { 16 | size_t bytes; 17 | int root; 18 | ncclRedOp_t op; 19 | }; 20 | constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { 21 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, 22 | {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, 23 | {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, 24 | offsetof(NvtxParamsReduce, op)} 25 | }; 26 | NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; 27 | NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) 28 | 29 | struct ncclInfo info = { ncclFuncReduce, "Reduce", 30 | sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ 31 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; 32 | return ncclEnqueueCheck(&info); 33 | } 34 | -------------------------------------------------------------------------------- /ext-tuner/example/README.md: -------------------------------------------------------------------------------- 1 | # Contribution 2 | 3 | ## docker 4 | ``` 5 | docker pull nvcr.io/nvidia/pytorch:23.08-py3 6 | ``` 7 | 8 | ## Code Style 9 | ```sh 10 | # download clang-format 13.0.0 11 | wget https://gh-proxy.com/https://github.com/llvm/llvm-project/releases/download/llvmorg-13.0.0/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz 12 | # uncompress 13 | tar -xf clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz 14 | 
clangtidy=./clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04/bin/clang-tidy 15 | clangformat=./clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04/bin/clang-format 16 | 17 | # use bear to generate compile_commands.json 18 | sudo apt install bear 19 | bear make 20 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs $clangtidy -p . -fix-errors 21 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs $clangtidy -p . -checks=cppcoreguidelines-init-variables > tidy_result.txt 2>&1 22 | 23 | pip3 install cpplint 24 | 25 | # check the format 26 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs cpplint > lint_result.txt 2>&1 27 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs $clangformat -i 28 | ``` 29 | 30 | ## performance debug 31 | ```sh 32 | apt update 33 | apt install linux-tools-`uname -r | cut -d- -f1-2`-`uname -r | cut -d- -f3` -y 34 | apt install linux-tools-common 35 | apt install linux-tools-generic -y 36 | make perf 37 | perf report -g -i perf.data 38 | ``` 39 | * flamegraph 40 | ``` 41 | git clone https://github.com/brendangregg/FlameGraph.git 42 | perf script -i perf.data | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl > out.svg 43 | ``` 44 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/jobs/bruteforce_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/include/datatype.h" 3 | #include "src/include/internal/logging.h" 4 | #include "src/include/jobs/job.h" 5 | 6 | struct BFJob : public Job { 7 | static bool isDecentralized() { return false; } 8 | BFJob(Tuner *tuner, const GIDTYPE &groupID, const Workload &workload, 9 | const Candidate &startCandidate, 10 | std::vector &&validCandidates, std::vector configRanges, const int32_t warmupSteps, const int32_t nativeSteps, 11 | const int32_t pretrainSteps, const int32_t trainSteps, 12 | const int32_t roundMaxSteps, const 
int32_t optimumExpireSteps, 13 | const int32_t expireSteps) : Job(tuner, groupID, workload, startCandidate, std::move(validCandidates), configRanges, 14 | warmupSteps, nativeSteps, pretrainSteps+trainSteps, 0, roundMaxSteps, 15 | optimumExpireSteps, expireSteps) { 16 | if (this->roundMaxSteps > this->pretrainSteps) { 17 | WARN(Logger::LogSubSys::OPTIMIZER) << "set pretrainSteps at least: " << this->roundMaxSteps; 18 | } 19 | this->accessByRand = Environment::get()->find("TUNER_BRUTEFOROCE_RAND", 1) > 0; 20 | } 21 | std::string debugStr() const override; 22 | 23 | JobType getType() const override { 24 | return JobType::BFJob; 25 | } 26 | 27 | protected: 28 | void addTrain(Result *jobResult) override {}; 29 | }; 30 | 31 | std::string BFJob::debugStr() const { 32 | std::stringstream ss; // NOLINT 33 | ss << Job::debugStr(); 34 | ss << " + BFJob["; 35 | ss << "]"; 36 | return ss.str(); 37 | } 38 | -------------------------------------------------------------------------------- /ext-tuner/example/plugin/cuda/wrapper.cc: -------------------------------------------------------------------------------- 1 | #include "src/cuda/collectives/nccl_candidate.h" 2 | 3 | #define WORKLOAD_SIZE 3 4 | #define CANDIDATE_SIZE 10 5 | 6 | // Wrapper function to match Python ctypes expectations 7 | extern "C" void ncclGetValidCandidatesWrapper( 8 | const GroupInfo* groupInfo, 9 | const std::pair* pairs, size_t pairs_size, 10 | const uint64_t* workloadElemPtr, bool scale2, int32_t** candidateElemPtr, size_t* candidateElemCount) { 11 | Info myInfo; 12 | 13 | std::map tunerEnvs; 14 | for (size_t i = 0; i < pairs_size; ++i) { 15 | tunerEnvs[pairs[i].first] = pairs[i].second; 16 | } 17 | std::unordered_map allGroupInfos; 18 | allGroupInfos[groupInfo->groupID] = GroupInfo( 19 | groupInfo->groupID, 20 | groupInfo->root, 21 | groupInfo->rank, 22 | groupInfo->nrank, 23 | groupInfo->nnode, 24 | tunerEnvs); 25 | 26 | Workload workload(workloadElemPtr, workloadElemPtr+WORKLOAD_SIZE); 27 | std::vector 
candidates; 28 | std::vector configRanges; 29 | 30 | ncclGetValidCandidates(myInfo, allGroupInfos, workload, scale2, &candidates, &configRanges); 31 | 32 | *candidateElemCount = candidates.size() * CANDIDATE_SIZE; 33 | *candidateElemPtr = new int32_t[*candidateElemCount]; 34 | 35 | size_t index = 0; 36 | for (const auto& candidate : candidates) { 37 | std::copy(candidate.begin(), candidate.end(), *candidateElemPtr + index); 38 | index += CANDIDATE_SIZE; 39 | } 40 | } 41 | 42 | // Function to free allocated candidate memory 43 | extern "C" void freeCandidates(int32_t* candidateElemPtr) { 44 | delete[] candidateElemPtr; 45 | } 46 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | 10 | #include "nccl_net.h" 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | // Conform to pthread and NVTX standard 20 | #define NCCL_THREAD_NAMELEN 16 21 | 22 | extern int ncclDebugLevel; 23 | extern uint64_t ncclDebugMask; 24 | extern pthread_mutex_t ncclDebugLock; 25 | extern FILE *ncclDebugFile; 26 | extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); 27 | 28 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 29 | 30 | // Let code temporarily downgrade WARN into INFO 31 | extern thread_local int ncclDebugNoWarn; 32 | extern char ncclLastError[]; 33 | 34 | #define WARN(...) 
ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 35 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 36 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) 37 | 38 | #ifdef ENABLE_TRACE 39 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 40 | extern std::chrono::steady_clock::time_point ncclEpoch; 41 | #else 42 | #define TRACE(...) 43 | #endif 44 | 45 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
// Parse one hexadecimal digit of a cpumask string (e.g. "0003ff,f0003fff").
// Returns the digit's value (0-15), or -1 if `c` is not a hex digit.
// Generalized (backward-compatibly) to also accept upper-case digits:
// the example masks in this header are lower-case, but user-supplied masks
// may be upper-case; lower-case inputs behave exactly as before.
// NOTE(review): the remainder of this header (ncclStrToCpuset /
// ncclCpusetToStr) is garbled in this extraction -- interior lines were
// lost -- and is not reconstructed here; restore it from upstream.
static int hexToInt(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_BOOTSTRAP_H_ 8 | #define NCCL_BOOTSTRAP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | struct ncclBootstrapHandle { 14 | uint64_t magic; 15 | union ncclSocketAddress addr; 16 | }; 17 | static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); 18 | 19 | ncclResult_t bootstrapNetInit(); 20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); 21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); 22 | ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); 23 | ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); 24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); 25 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); 26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); 27 | ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); 28 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); 29 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); 30 | ncclResult_t bootstrapClose(void* commState); 31 | ncclResult_t bootstrapAbort(void* commState); 32 | #endif 33 | -------------------------------------------------------------------------------- /src/collectives/broadcast.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA 
CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 11 | ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 13 | ncclComm_t comm, cudaStream_t stream) { 14 | struct NvtxParamsBroadcast { 15 | size_t bytes; 16 | int root; 17 | }; 18 | constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { 19 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, 20 | {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} 21 | }; 22 | NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; 23 | NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) 24 | 25 | struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", 26 | sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ 27 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; 28 | return ncclEnqueueCheck(&info); 29 | } 30 | /* Deprecated original "in place" function, similar to MPI */ 31 | NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, 32 | ncclComm_t comm, cudaStream_t stream); 33 | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, 34 | ncclComm_t comm, cudaStream_t stream) { 35 | return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); 36 | } 37 | 38 | -------------------------------------------------------------------------------- /pkg/debian/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | DEBPREPDIR := $(BUILDDIR)/debian 11 | PKGDIR := $(BUILDDIR)/pkg/deb/ 12 | 13 | DEBGEN_IN := $(wildcard *.in) 14 | DEBGEN := $(DEBGEN_IN:.in=) 15 | DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) 16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) 20 | PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) 21 | 22 | prep : $(DEBTARGETS) 23 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 24 | 25 | build : prep 26 | $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) 27 | @printf "Building Debian package\n" 28 | (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) 29 | mkdir -p $(PKGDIR) 30 | mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ 31 | 32 | clean: 33 | rm -Rf $(DEBPREPDIR) $(PKGDIR) 34 | 35 | $(DEBPREPDIR)/% : %.in 36 | @printf "Generating %-35s > %s\n" $< $@ 37 | mkdir -p $(DEBPREPDIR) 38 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 39 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 40 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 41 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 42 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 43 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 44 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 45 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 46 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 47 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 48 | $< > $@ 49 | 50 | $(DEBPREPDIR)/% : % 51 | @printf "Grabbing %-35s > %s\n" $< $@ 52 | mkdir -p $(DEBPREPDIR) 53 | cp -f $< $@ 54 | -------------------------------------------------------------------------------- /src/include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 
| * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TIMER_H_ 8 | #define NCCL_TIMER_H_ 9 | #if ENABLE_TIMER 10 | #include 11 | #include 12 | #include 13 | static double freq = -1; 14 | static void calibrate() { 15 | struct timeval tv; 16 | gettimeofday(&tv, NULL); 17 | uint64_t timeCycles = __rdtsc(); 18 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 19 | uint64_t total = 0ULL; 20 | for (int i=0; i<10000; i++) total += __rdtsc(); 21 | gettimeofday(&tv, NULL); 22 | timeCycles = __rdtsc() - timeCycles; 23 | time += tv.tv_sec*1E6 + tv.tv_usec; 24 | freq = timeCycles/time; 25 | } 26 | static inline double gettime() { 27 | if (freq == -1) calibrate(); 28 | return __rdtsc()/freq; 29 | } 30 | static uint64_t counts[8]; 31 | static double times[8]; 32 | static double startTimes[8]; 33 | #define TIME_START(index) do { \ 34 | counts[index]++; \ 35 | startTimes[index] = gettime(); \ 36 | } while (0); 37 | 38 | #define TIME_STOP(index) do { \ 39 | times[index] += gettime() - startTimes[index]; \ 40 | } while (0); 41 | 42 | #define TIME_CANCEL(index) do { \ 43 | counts[index]--; \ 44 | } while (0); 45 | 46 | #define TIME_PRINT(name) do { \ 47 | printf("%s stats", name); \ 48 | for (int i=0; i<8; i++) { \ 49 | if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ 50 | counts[i] = 0; \ 51 | } \ 52 | printf("\n"); \ 53 | } while (0); 54 | #else 55 | #define TIME_START(index) while(0); 56 | #define TIME_STOP(index) while(0); 57 | #define TIME_CANCEL(index) while(0); 58 | #define TIME_PRINT(name) 59 | #endif 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CORE_H_ 8 | #define NCCL_CORE_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // For std::min/std::max 15 | #include "nccl.h" 16 | 17 | #ifdef PROFAPI 18 | #define NCCL_API(ret, func, args...) \ 19 | __attribute__ ((visibility("default"))) \ 20 | __attribute__ ((alias(#func))) \ 21 | ret p##func (args); \ 22 | extern "C" \ 23 | __attribute__ ((visibility("default"))) \ 24 | __attribute__ ((weak)) \ 25 | ret func(args) 26 | #else 27 | #define NCCL_API(ret, func, args...) \ 28 | extern "C" \ 29 | __attribute__ ((visibility("default"))) \ 30 | ret func(args) 31 | #endif // end PROFAPI 32 | 33 | static __inline__ int ncclTypeSize(ncclDataType_t type) { 34 | switch (type) { 35 | case ncclInt8: 36 | case ncclUint8: 37 | return 1; 38 | case ncclFloat16: 39 | #if defined(__CUDA_BF16_TYPES_EXIST__) 40 | case ncclBfloat16: 41 | #endif 42 | return 2; 43 | case ncclInt32: 44 | case ncclUint32: 45 | case ncclFloat32: 46 | return 4; 47 | case ncclInt64: 48 | case ncclUint64: 49 | case ncclFloat64: 50 | return 8; 51 | default: 52 | return -1; 53 | } 54 | } 55 | 56 | #include "debug.h" 57 | #include "checks.h" 58 | #include "cudawrap.h" 59 | #include "alloc.h" 60 | #include "utils.h" 61 | #include "param.h" 62 | #include "nvtx.h" 63 | 64 | #endif // end include guard 65 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/jobs/native_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/include/datatype.h" 3 | #include "src/include/internal/logging.h" 4 | #include "src/include/jobs/job.h" 5 | 6 | #define DEBUG_DIST 0 7 | struct NativeJob : public Job { 8 | NativeJob(Tuner *tuner, const GIDTYPE &groupID, const 
Workload &workload, 9 | const Candidate &startCandidate, 10 | std::vector &&validCandidates, std::vector configRanges, 11 | const int32_t warmupSteps, const int32_t nativeSteps, const int32_t pretrainSteps, 12 | const int32_t trainSteps, const int32_t roundMaxSteps, 13 | const int32_t optimumExpireSteps, const int32_t expireSteps) 14 | : Job(tuner, groupID, workload, startCandidate, 15 | std::move(validCandidates), configRanges, 0, 0, 0, 16 | 0, roundMaxSteps, optimumExpireSteps, expireSteps) { 17 | } 18 | JobType getType() const override { 19 | return JobType::NativeJob; 20 | } 21 | static bool isDecentralized() { return DEBUG_DIST == 0; } 22 | void createResult(const SimpleQueryStatus &status, Result *result) override; 23 | 24 | protected: 25 | void addTrain(Result *jobResult) override {}; 26 | }; 27 | 28 | void NativeJob::createResult(const SimpleQueryStatus &status, Result *result) { 29 | if (this->isDone) 30 | return; 31 | this->isDone = true; 32 | result->workload = this->workload; 33 | result->roundCandidates.version = this->nextVersion++; 34 | result->roundCandidates.roundExpire = ExpireCandidate::FOREVER; 35 | auto nativeEC = 36 | ExpireCandidate(ExpireCandidate::FOREVER, this->startCandidate); 37 | this->optimum = nativeEC.candidate; 38 | result->roundCandidates.expireCandidates.push_back(std::move(nativeEC)); 39 | this->currTotalExpire = ExpireCandidate::FOREVER; 40 | INFO(Logger::LogSubSys::OPTIMIZER) << " add permanent:" << toDebugStr(this->workload) 41 | << " candidate:" << toDebugStr(this->startCandidate); 42 | } 43 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 13 | Laboratory, the U.S. Department of Energy, nor the names of their 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | The U.S. Department of Energy funded the development of this software 30 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 31 | 32 | 33 | This code also includes files from the NVIDIA Tools Extension SDK project. 34 | 35 | See: 36 | 37 | https://github.com/NVIDIA/NVTX 38 | 39 | for more information and license details. 
40 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | #define MAXWIDTH 20 10 | #define PREFIXLEN 15 11 | #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) 12 | void dumpLine(int* values, int nranks, const char* prefix) { 13 | int prefixlen = strlen(prefix); 14 | char line[STRLENGTH+1]; 15 | line[STRLENGTH] = '\0'; 16 | memset(line, ' ', STRLENGTH); 17 | strncpy(line, prefix, PREFIXLEN); 18 | for (int i=0; i 3 | #include 4 | #include 5 | #include 6 | 7 | template class ThreadsafeQueue { 8 | public: 9 | ThreadsafeQueue() = default; 10 | 11 | void push(T &&value) { 12 | // std::lock_guard lock(mutex_); 13 | pthread_mutex_lock(&this->mutex); 14 | queue_.push(std::move(value)); 15 | // condition_.notify_one(); 16 | pthread_cond_signal(&this->cond); 17 | pthread_mutex_unlock(&this->mutex); 18 | } 19 | 20 | void waitPop(T *value) { 21 | // std::unique_lock lock(mutex_); 22 | pthread_mutex_lock(&this->mutex); 23 | // condition_.wait(lock, [this] { return !queue_.empty(); }); 24 | while (queue_.empty()) { 25 | pthread_cond_wait(&this->cond, &this->mutex); 26 | } 27 | *value = std::move(queue_.front()); 28 | queue_.pop(); 29 | pthread_mutex_unlock(&this->mutex); 30 | } 31 | 32 | void merge(std::vector *v) { 33 | for (auto &item : *v) { 34 | // std::lock_guard lock(mutex_); 35 | pthread_mutex_lock(&this->mutex); 36 | queue_.push(std::move(item)); 37 | // condition_.notify_one(); 38 | pthread_cond_signal(&this->cond); 39 | pthread_mutex_unlock(&this->mutex); 40 | } 41 | } 42 | bool empty() { 43 | // std::lock_guard lock(mutex_); 44 | 
pthread_mutex_lock(&this->mutex); 45 | bool flag = queue_.empty(); // NOLINT 46 | pthread_mutex_unlock(&this->mutex); 47 | return flag; 48 | } 49 | 50 | void wait(const bool &flag = false) { 51 | pthread_mutex_lock(&this->mutex); 52 | while (!flag && queue_.empty()) { 53 | pthread_cond_wait(&this->cond, &this->mutex); 54 | } 55 | pthread_mutex_unlock(&this->mutex); 56 | } 57 | 58 | private: 59 | std::queue queue_; 60 | pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 61 | pthread_cond_t cond = PTHREAD_COND_INITIALIZER; 62 | // mutable std::mutex mutex_; 63 | // std::condition_variable condition_; 64 | }; 65 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/internal/env.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class Environment { 10 | public: 11 | static Environment *get() { 12 | static std::shared_ptr instance(new Environment()); 13 | return instance.get(); 14 | } 15 | 16 | template T find(const char *key, const T defaultValue) { 17 | const char *value = this->find(key); 18 | if (value == nullptr) { 19 | return defaultValue; 20 | } else { 21 | return T(value); 22 | } 23 | } 24 | 25 | int32_t find(const char *key, const int32_t defaultValue) { 26 | const char *value = this->find(key); 27 | if (value == nullptr) { 28 | return defaultValue; 29 | } else { 30 | return std::stoi(value); 31 | } 32 | } 33 | 34 | const char *find(const char *key) { 35 | if (kvs.count(std::string(key)) > 0) { 36 | return kvs[std::string(key)].c_str(); 37 | } else { 38 | const char *value = getenv(key); 39 | if (value == nullptr || strlen(value) == 0) 40 | return nullptr; 41 | return value; 42 | } 43 | } 44 | 45 | void set(const std::string &key, const std::string &value) { 46 | kvs[key] = value; 47 | } 48 | 49 | private: 50 | std::unordered_map kvs; 51 | }; 52 | 53 | #if 0 54 | 
int32_t main() { 55 | // Test the Environment class 56 | Environment& env = Environment::get(); 57 | 58 | // Test find() with string value 59 | std::string strValue = env.find("STRING_KEY", "default_string"); 60 | std::cout << "String value: " << strValue << std::endl; 61 | 62 | // Test find() with int32_t value 63 | int32_t intValue = env.find("INT_KEY", 123); 64 | std::cout << "Int value: " << intValue << std::endl; 65 | 66 | // Test set() and find() with newly set value 67 | env.set("NEW_KEY", "new_value"); 68 | std::string newValue = env.find("NEW_KEY", "default_new_value"); 69 | std::cout << "New value: " << newValue << std::endl; 70 | 71 | return 0; 72 | } 73 | #endif 74 | -------------------------------------------------------------------------------- /src/include/npkit/npkit.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_H_ 2 | #define NPKIT_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "npkit/npkit_event.h" 10 | #include "npkit/npkit_struct.h" 11 | 12 | class NpKit { 13 | public: 14 | static const uint64_t kNumGpuEventBuffers = 64; // [32, 64) for TreeSlipt 15 | 16 | static const uint64_t kNumCpuEventBuffers = 64; // [32, 64) for TreeSlipt 17 | 18 | static ncclResult_t Init(int rank); 19 | 20 | static ncclResult_t Dump(const std::string& dump_dir); 21 | 22 | static ncclResult_t Shutdown(); 23 | 24 | static NpKitEventCollectContext* GetGpuEventCollectContexts(); 25 | 26 | static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, 27 | NpKitEventCollectContext* ctx) { 28 | uint64_t event_buffer_head = ctx->event_buffer_head; 29 | if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { 30 | NpKitEvent& event = ctx->event_buffer[event_buffer_head]; 31 | event.fields.type = type; 32 | event.fields.size = size; 33 | event.fields.rsvd = rsvd; 34 | event.fields.timestamp = timestamp; 35 | ctx->event_buffer_head++; 36 | } 37 | } 38 | 39 | 
static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); 40 | 41 | static uint64_t* GetCpuTimestamp(); 42 | 43 | private: 44 | static void CpuTimestampUpdateThread(); 45 | 46 | // max: 2M * 32 * 16B = 1GB per GPU 47 | static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 21; // 2M 48 | 49 | // max: 4M * 32 * 16B = 2GB per CPU 50 | static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 22; // 2M * 2 (send/recv) * (32/32) = 4M 51 | 52 | static NpKitEvent** gpu_event_buffers_; 53 | static NpKitEvent** cpu_event_buffers_; 54 | 55 | static NpKitEventCollectContext* gpu_collect_contexts_; 56 | static NpKitEventCollectContext* cpu_collect_contexts_; 57 | static uint64_t* cpu_timestamp_; 58 | 59 | static uint64_t rank_; 60 | 61 | static std::thread* cpu_timestamp_update_thread_; 62 | static volatile bool cpu_timestamp_update_thread_should_stop_; 63 | }; 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/collectives/sendrecv.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "argcheck.h" // Need some checks here since we access comm 10 | 11 | struct NvtxParamsSendRecv { 12 | size_t bytes; 13 | int peer; 14 | }; 15 | constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { 16 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, 17 | {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} 18 | }; 19 | 20 | NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 21 | ncclComm_t comm, cudaStream_t stream); 22 | ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 23 | ncclComm_t comm, cudaStream_t stream) { 24 | NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; 25 | NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) 26 | 27 | struct ncclInfo info = { ncclFuncSend, "Send", 28 | NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 29 | 1, 1 }; 30 | ncclResult_t ret; 31 | NCCLCHECK(ncclGroupStart()); 32 | ret = ncclEnqueueCheck(&info); 33 | NCCLCHECK(ncclGroupEnd()); 34 | return ret; 35 | } 36 | 37 | NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 38 | ncclComm_t comm, cudaStream_t stream); 39 | ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 40 | ncclComm_t comm, cudaStream_t stream) { 41 | NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; 42 | NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) 43 | 44 | struct ncclInfo info = { ncclFuncRecv, "Recv", 45 | NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 46 | 1, 1 }; 47 | ncclResult_t ret; 48 | NCCLCHECK(ncclGroupStart()); 49 | ret = ncclEnqueueCheck(&info); 50 | NCCLCHECK(ncclGroupEnd()); 51 | 
return ret; 52 | } 53 | -------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /ext-tuner/example/.clang-tidy: -------------------------------------------------------------------------------- 1 | # refer to https://clang.llvm.org/extra/clang-tidy/checks/list.html 2 | Checks: -*, clang-analyzer-core.*, clang-analyzer-cplusplus.*, clang-analyzer-deadcode.*, clang-analyzer-nullability.*, clang-analyzer-security.*, clang-analyzer-unix.*, clang-analyzer-valist.*, cppcoreguidelines-macro-usage, cppcoreguidelines-interfaces-global-init, cppcoreguidelines-narrowing-conversions, cppcoreguidelines-no-malloc, cppcoreguidelines-prefer-member-initializer, cppcoreguidelines-special-member-functions, cppcoreguidelines-slicing, google-build-explicit-make-pair, google-default-arguments, google-explicit-constructor, modernize-avoid-bind, 
modernize-loop-convert, modernize-macro-to-enum, modernize-make-shared, modernize-make-unique, modernize-pass-by-value, modernize-redundant-void-arg, modernize-return-braced-init-list, modernize-use-auto, modernize-use-bool-literals, modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete, modernize-use-nullptr, modernize-use-override, modernize-use-using, performance-faster-string-find, performance-for-range-copy, performance-implicit-conversion-in-loop, performance-inefficient-algorithm, performance-inefficient-vector-operation, performance-move-const-arg, performance-move-constructor-init, performance-no-automatic-move, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization, performance-unnecessary-value-param 3 | 4 | WarningsAsErrors: clang-analyzer-*, -clang-analyzer-security.insecureAPI.rand, cppcoreguidelines-interfaces-global-init, cppcoreguidelines-no-malloc, cppcoreguidelines-slicing, google-*, modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete, performance-implicit-conversion-in-loop, performance-inefficient-algorithm, performance-move-constructor-init, performance-no-automatic-move, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization 5 | 6 | CheckOptions: 7 | - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor 8 | value: True 9 | - key: cppcoreguidelines-special-member-functions.AllowMissingMoveFunctionsWhenCopyIsDeleted 10 | value: True 11 | - key: performance-move-const-arg.CheckTriviallyCopyableMove 12 | value: False 13 | -------------------------------------------------------------------------------- /src/collectives/device/onerank_reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "common_kernel.h" 10 | #include "common.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void oneRankReduce() { 15 | ncclWork *w = &ncclShmem.work; 16 | int tid = threadIdx.x; 17 | int tn = blockDim.x; 18 | #pragma unroll 1 19 | for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) { 20 | ncclWorkElem *we = &w->elems[e]; 21 | intptr_t eltN = we->count; 22 | int bid = we->bid; 23 | int bn = we->nChannels; 24 | T const *src = (T const*)we->sendbuff; 25 | T *dst = (T*)we->recvbuff; 26 | 27 | // each block/channel gets a roughly equal segment of 16 byte packs 28 | constexpr int EltPerPack = 16/sizeof(T); 29 | intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; 30 | intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); 31 | intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); 32 | i0 *= EltPerPack; 33 | i0 = i0 < eltN ? i0 : eltN; 34 | i1 *= EltPerPack; 35 | i1 = i1 < eltN ? 
i1 : eltN; 36 | src += i0; 37 | dst += i0; 38 | void *vsrc = (void*)src; 39 | void *vdst = (void*)dst; 40 | reduceCopy 41 | (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0); 42 | } 43 | } 44 | } 45 | 46 | #define INSTANTIATE(devredop, type) \ 47 | __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ 48 | oneRankReduce>(); \ 49 | } 50 | 51 | INSTANTIATE(PreMulSum, int8_t) 52 | INSTANTIATE(PreMulSum, uint8_t) 53 | INSTANTIATE(PreMulSum, int32_t) 54 | INSTANTIATE(PreMulSum, uint32_t) 55 | INSTANTIATE(PreMulSum, int64_t) 56 | INSTANTIATE(PreMulSum, uint64_t) 57 | INSTANTIATE(PreMulSum, half) 58 | #if defined(__CUDA_BF16_TYPES_EXIST__) 59 | INSTANTIATE(PreMulSum, __nv_bfloat16) 60 | #endif 61 | INSTANTIATE(PreMulSum, float) 62 | INSTANTIATE(PreMulSum, double) 63 | -------------------------------------------------------------------------------- /src/include/channel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHANNEL_H_ 8 | #define NCCL_CHANNEL_H_ 9 | #include "comm.h" 10 | 11 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 12 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 13 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 14 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); 15 | static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { 16 | int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; 17 | int peerNode = comm->rankToNode[peer]; 18 | int peerIndex = comm->rankToLocalRank[peer]; 19 | int nsteps = comm->maxLocalRanks; 20 | int rankIndex = comm->rankToLocalRank[comm->rank]; 21 | int step, delta; 22 | if (coll == ncclFuncSend) { 23 | step = (nsteps + peerIndex - rankIndex)%nsteps; 24 | delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; 25 | } else if (coll == ncclFuncRecv) { 26 | step = (nsteps + rankIndex - peerIndex)%nsteps; 27 | delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; 28 | } else { 29 | return ncclInternalError; 30 | } 31 | *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; 32 | TRACE(NCCL_COLL, "native=1, peer=%d %s -> channelBase=%d", peer, (coll == ncclFuncSend ? 
"send" : "recv"), *channelBase); 33 | return ncclSuccess; 34 | } 35 | 36 | static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { 37 | //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; 38 | *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; 39 | return ncclSuccess; 40 | } 41 | 42 | static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { 43 | int base; 44 | NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); 45 | NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); 46 | return ncclSuccess; 47 | } 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V3_H_ 6 | #define NCCL_NET_V3_H_ 7 | 8 | #define NCCL_NET_MAX_REQUESTS_V3 16 9 | 10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; 11 | typedef struct { 12 | // Name of the network (mainly for logs) 13 | const char* name; 14 | // Initialize the network. 15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 16 | // Return the number of adapters. 17 | ncclResult_t (*devices)(int* ndev); 18 | // Get various device properties. 19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); 20 | // Create a receiving object and provide a handle to connect to it. The 21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 22 | // between ranks to create a connection. 23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 24 | // Connect to a handle and return a sending comm object for that peer. 
25 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 26 | // Finalize connection establishment after remote peer has called connectHandle 27 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 28 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 29 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 
43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v3_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-tuner/example/src/cuda/nccl_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* 4 | #define NCCL_ALGO_UNDEF -1 5 | #define NCCL_ALGO_TREE 0 6 | #define NCCL_ALGO_RING 1 7 | #define NCCL_ALGO_COLLNET_DIRECT 2 8 | #define NCCL_ALGO_COLLNET_CHAIN 3 9 | #define NCCL_ALGO_NVLS 4 10 | #define NCCL_ALGO_NVLS_TREE 5 11 | 12 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 13 | #define NCCL_PROTO_UNDEF -1 14 | #define NCCL_PROTO_LL 0 15 | #define NCCL_PROTO_LL128 1 16 | #define NCCL_PROTO_SIMPLE 2 17 | 18 | #define NCCL_SM_COPY 0 19 | #define NCCL_COPY_ENGINE 1 20 | 21 | #define NCCL_STEPS 8 22 | #define WARP_SIZE 32 23 | #define SIZEELEM (sizeof(int8_t)) 24 | // LL 25 | #define EltPerLine (sizeof(uint64_t)/SIZEELEM) 26 | // LL128 27 | #define NCCL_LL128_LINESIZE 128 28 | #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) 29 | #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS - 1) 30 | #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 31 | #define WireWordPerSlice WARP_SIZE *NCCL_LL128_SHMEM_ELEMS_PER_THREAD 32 | 33 | #define DataEltPerSlice \ 34 | ((WireWordPerSlice-WireWordPerSlice/NCCL_LL128_LINEELEMS)*(sizeof(uint64_t)/SIZEELEM)) 35 | 36 | #define NCCL_MAX_NTHREADS 640 37 | 38 | using ncclFunc_t = enum { 39 | ncclFuncBroadcast, 40 | ncclFuncReduce, 41 | ncclFuncAllGather, 42 | ncclFuncReduceScatter, 43 | ncclFuncAllReduce, 44 | ncclFuncSendRecv, 45 | ncclFuncSend, 46 | ncclFuncRecv, 47 | ncclNumFuncs, 48 | ncclFuncAll2All, 49 | 
ncclFuncAll2Allv 50 | }; 51 | 52 | using LinkType = enum { 53 | // Local (myself) 54 | PATH_LOC = 0, 55 | // Connection traversing NVLink 56 | PATH_NVL = 1, 57 | // Connection through NVLink using an intermediate GPU 58 | PATH_NVB = 2, 59 | // Connection traversing at most a single PCIe bridge 60 | PATH_PIX = 3, 61 | // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 62 | PATH_PXB = 4, 63 | // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 64 | PATH_PXN = 5, 65 | // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 66 | PATH_PHB = 6, 67 | // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 68 | PATH_SYS = 7, 69 | // Connection through the network 70 | PATH_NET = 8 71 | }; 72 | 73 | using Transports = enum { 74 | TRANSPORT_P2P = 0, 75 | TRANSPORT_P2P_CE = 1, 76 | TRANSPORT_SHM = 2, 77 | TRANSPORT_NET = 3, 78 | TRANSPORT_COLLNET = 4 79 | }; 80 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifndef NVTX_EXT_IMPL_H 14 | #define NVTX_EXT_IMPL_H 15 | /* ---- Include required platform headers ---- */ 16 | 17 | #if defined(_WIN32) 18 | 19 | #include 20 | 21 | #else 22 | #include 23 | 24 | #if defined(__ANDROID__) 25 | #include 26 | #endif 27 | 28 | #if defined(__linux__) || defined(__CYGWIN__) 29 | #include 30 | #endif 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | 47 | #endif 48 | 49 | /* ---- Define macros used in this file ---- */ 50 | 51 | #ifdef NVTX_DEBUG_PRINT 52 | #ifdef __ANDROID__ 53 | #include 54 | #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); 55 | #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); 56 | #else 57 | #include 58 | #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) 59 | #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) 60 | #endif 61 | #else /* !defined(NVTX_DEBUG_PRINT) */ 62 | #define NVTX_ERR(...) 63 | #define NVTX_INFO(...) 
64 | #endif 65 | 66 | #ifdef __cplusplus 67 | extern "C" { 68 | #endif /* __cplusplus */ 69 | 70 | // #ifdef __GNUC__ 71 | // #pragma GCC visibility push(hidden) 72 | // #endif 73 | 74 | #define NVTX_EXTENSION_FRESH 0 75 | #define NVTX_EXTENSION_DISABLED 1 76 | #define NVTX_EXTENSION_STARTING 2 77 | #define NVTX_EXTENSION_LOADED 3 78 | 79 | NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0; 80 | 81 | #define NVTX_EXT_INIT_GUARD 82 | #include "nvtxExtInit.h" 83 | #undef NVTX_EXT_INIT_GUARD 84 | 85 | // #ifdef __GNUC__ 86 | // #pragma GCC visibility pop 87 | // #endif 88 | 89 | #ifdef __cplusplus 90 | } /* extern "C" */ 91 | #endif /* __cplusplus */ 92 | 93 | #endif /* NVTX_EXT_IMPL_H */ -------------------------------------------------------------------------------- /src/misc/param.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "param.h" 8 | #include "debug.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const char* userHomeDir() { 21 | struct passwd *pwUser = getpwuid(getuid()); 22 | return pwUser == NULL ? 
NULL : pwUser->pw_dir; 23 | } 24 | 25 | void setEnvFile(const char* fileName) { 26 | FILE * file = fopen(fileName, "r"); 27 | if (file == NULL) return; 28 | 29 | char *line = NULL; 30 | char envVar[1024]; 31 | char envValue[1024]; 32 | size_t n = 0; 33 | ssize_t read; 34 | while ((read = getline(&line, &n, file)) != -1) { 35 | if (line[read-1] == '\n') line[read-1] = '\0'; 36 | int s=0; // Env Var Size 37 | while (line[s] != '\0' && line[s] != '=') s++; 38 | if (line[s] == '\0') continue; 39 | strncpy(envVar, line, std::min(1023,s)); 40 | envVar[s] = '\0'; 41 | s++; 42 | strncpy(envValue, line+s, 1023); 43 | envValue[1023]='\0'; 44 | setenv(envVar, envValue, 0); 45 | //printf("%s : %s->%s\n", fileName, envVar, envValue); 46 | } 47 | if (line) free(line); 48 | fclose(file); 49 | } 50 | 51 | void initEnv() { 52 | char confFilePath[1024]; 53 | const char * userDir = userHomeDir(); 54 | if (userDir) { 55 | sprintf(confFilePath, "%s/.nccl.conf", userDir); 56 | setEnvFile(confFilePath); 57 | } 58 | sprintf(confFilePath, "/etc/nccl.conf"); 59 | setEnvFile(confFilePath); 60 | } 61 | 62 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { 63 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 64 | pthread_mutex_lock(&mutex); 65 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { 66 | char* str = getenv(env); 67 | int64_t value = deftVal; 68 | if (str && strlen(str) > 0) { 69 | errno = 0; 70 | value = strtoll(str, nullptr, 0); 71 | if (errno) { 72 | value = deftVal; 73 | INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); 74 | } else { 75 | INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); 76 | } 77 | } 78 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 79 | } 80 | pthread_mutex_unlock(&mutex); 81 | } 82 | -------------------------------------------------------------------------------- /src/include/ibvsymbols.h: 
-------------------------------------------------------------------------------- 1 | #ifndef NCCL_IBV_SYMBOLS_H_ 2 | #define NCCL_IBV_SYMBOLS_H_ 3 | 4 | #ifdef NCCL_BUILD_RDMA_CORE 5 | #include 6 | #else 7 | #include "ibvcore.h" 8 | #endif 9 | 10 | #include "nccl.h" 11 | 12 | /* IB Verbs Function Pointers*/ 13 | struct ncclIbvSymbols { 14 | int (*ibv_internal_fork_init)(void); 15 | struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); 16 | void (*ibv_internal_free_device_list)(struct ibv_device **list); 17 | const char * (*ibv_internal_get_device_name)(struct ibv_device *device); 18 | struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); 19 | int (*ibv_internal_close_device)(struct ibv_context *context); 20 | int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); 21 | void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); 22 | int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); 23 | int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); 24 | int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); 25 | int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); 26 | struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); 27 | int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); 28 | struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); 29 | struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); 30 | /* DMA-BUF support */ 31 | struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 32 | int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); 33 | struct 
ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); 34 | int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); 35 | struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); 36 | int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 37 | int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); 38 | const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); 39 | }; 40 | 41 | /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ 42 | ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); 43 | 44 | #endif // NCCL_IBV_SYMBOLS_H_ 45 | -------------------------------------------------------------------------------- /src/collectives/device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../../makefiles/common.mk 8 | include ../../../makefiles/version.mk 9 | 10 | BUILDDIR ?= $(abspath ../../../build) 11 | OBJDIR := $(BUILDDIR)/obj/collectives/device 12 | 13 | LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu 14 | 15 | LIBSRCFILES += functions.cu 16 | 17 | DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) 18 | DEPENDFILES:= $(DEPFILES:%.d=%.dep) 19 | STATICLIB := $(OBJDIR)/colldevice.a 20 | DEVOBJ := $(OBJDIR)/devlink.o 21 | RULESFILE := $(OBJDIR)/Makefile.rules 22 | 23 | NVCUFLAGS += -I. -I.. 
-I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" 24 | 25 | 26 | all: $(STATICLIB) 27 | 28 | # Dummy rule so that the extra dependency (%.dep) files are preserved by make 29 | all_deps: $(DEPENDFILES) 30 | 31 | # Auto-generating the rules per op/reduction/datatype/algorithm 32 | $(RULESFILE) : gen_rules.sh 33 | @printf "Generating %-35s > %s\n" rules $@ 34 | @mkdir -p $(OBJDIR) 35 | @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ 36 | 37 | -include $(RULESFILE) 38 | 39 | LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o 40 | 41 | -include $(DEPFILES) 42 | 43 | $(STATICLIB): $(LIBOBJ) $(DEVOBJ) 44 | @printf "Archiving %-35s > %s\n" objects $@ 45 | ar cr $@ $^ 46 | 47 | # We do not want make to build *.d when running make clean. 48 | # So we only provide targets for .dep which will produce .dep and .d, 49 | # with only .d being included, and .dep keeping track of what needs to 50 | # be regenerated. 51 | $(OBJDIR)/%.dep : %.cu 52 | @mkdir -p $(OBJDIR) 53 | @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp 54 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ 55 | @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ 56 | sed -e 's/^ *//' -e 's/$$/:/' >> $@ 57 | @rm -f $@.tmp 58 | @cp $@ $(@:.dep=.d) 59 | 60 | # Compiled kernels and collectives with relocatable device code ... 61 | $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep 62 | @printf "Compiling %-35s > %s\n" $< $@ 63 | mkdir -p `dirname $@` 64 | $(NVCC) $(NVCUFLAGS) -dc $< -o $@ 65 | 66 | $(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep 67 | @printf "Compiling %-35s > %s\n" $< $@ 68 | mkdir -p `dirname $@` 69 | $(NVCC) $(NVCUFLAGS) -dc $< -o $@ 70 | 71 | # ... and create the device-side linked object with all those. 
72 | $(DEVOBJ) : $(LIBOBJ) 73 | $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ 74 | 75 | clean: 76 | rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) 77 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V2_H_ 6 | #define NCCL_NET_V2_H_ 7 | 8 | typedef struct { 9 | // Name of the network (mainly for logs) 10 | const char* name; 11 | // Initialize the network. 12 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 13 | // Return the number of adapters. 14 | ncclResult_t (*devices)(int* ndev); 15 | // Return the device path in /sys. NCCL will call free on this path. 16 | ncclResult_t (*pciPath)(int dev, char** path); 17 | // Return whether this device supports host pointers and/or CUDA pointers 18 | // as data from the current GPU. Supported types should be composed with 19 | // NCCL_PTR_HOST and NCCL_PTR_CUDA. 20 | ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); 21 | // Create a receiving object and provide a handle to connect to it. The 22 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 23 | // between ranks to create a connection. 24 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 25 | // Connect to a handle and return a sending comm object for that peer. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connectHandle 28 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 29 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v2_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /pkg/redhat/nccl.spec.in: -------------------------------------------------------------------------------- 1 | Name: libnccl 2 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} 3 | Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} 4 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 5 | 6 | Group: Development/Libraries 7 | License: BSD 8 | URL: http://developer.nvidia.com/nccl 9 | Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz 10 | Requires(pre,preun): /sbin/ldconfig 11 | 12 | %description 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | 
broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | %package devel 21 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 22 | Group: Development/Libraries 23 | %description devel 24 | NCCL development files 25 | 26 | %package static 27 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 28 | Group: Development/Libraries 29 | %description static 30 | NCCL static library 31 | 32 | %define debug_package %{nil} 33 | 34 | %prep 35 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q 36 | 37 | %build 38 | 39 | %install 40 | rm -rf $RPM_BUILD_ROOT 41 | install -m 755 -d $RPM_BUILD_ROOT 42 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} 43 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} 44 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} 45 | 46 | # devel 47 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} 48 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} 49 | install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} 50 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so 51 | 52 | # static 53 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} 54 | 55 | %post -p /sbin/ldconfig 56 | %postun -p /sbin/ldconfig 57 | 58 | %post devel -p /sbin/ldconfig 59 | %postun devel -p /sbin/ldconfig 60 | 61 | %clean 62 | rm -rf $RPM_BUILD_ROOT 63 | 64 | %files devel 65 | %doc LICENSE.txt 66 | %defattr(-,root,root,-) 67 | %{_includedir}/nccl.h 68 | %{_includedir}/nccl_net.h 69 | %{_libdir}/libnccl.so 70 | 71 | %files static 72 | %doc LICENSE.txt 73 | %defattr(-,root,root,-) 74 | %{_libdir}/libnccl_static.a 75 | 76 | %files 77 | %doc LICENSE.txt 78 | 
%defattr(-,root,root,-) 79 | %{_libdir}/libnccl.so.${nccl:Major} 80 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} 81 | 82 | %changelog 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Prepare the docker 2 | ``` 3 | docker pull nvcr.io/nvidia/pytorch:23.08-py3 4 | ``` 5 | 6 | ## Download source code 7 | ``` 8 | git clone --recursive https://github.com/gbxu/autoccl.git 9 | ``` 10 | 11 | ## To build the library : 12 | 13 | ```shell 14 | cd autoccl 15 | make -j src.build 16 | ``` 17 | 18 | If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with : 19 | 20 | ```shell 21 | make src.build CUDA_HOME= 22 | ``` 23 | 24 | AutoCCL will be compiled and installed in `build/` unless `BUILDDIR` is set. 25 | 26 | By default, AutoCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform : 27 | ```shell 28 | $ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" 29 | ``` 30 | 31 | ## Build AutoCCL Tuner 32 | ```shell 33 | $ cd ext-tuner/example && make clean && make 34 | ``` 35 | 36 | ## Use AutoCCL 37 | We assume that in a distributed scenario, each CPU process is responsible for managing a GPU. 
38 | 39 | * Preload runtime and tuner to bypass Tccl on the system 40 | ```sh 41 | # Setting environment variables on each process 42 | export LD_PRELOAD=path/to/autoccl/build/lib/libnccl.so 43 | export LD_LIBRARY_PATH=path/to/autoccl/build/lib:path/to/autoccl/ext-tuner/example/build/:$LD_LIBRARY_PATH 44 | export NCCL_TUNER_PLUGIN=path/to/autoccl/ext-tuner/example/build/libnccl-plugin.so 45 | ``` 46 | 47 | * Specify the ip and port of the monitoring process 48 | ```sh 49 | # Setting environment variables on each process 50 | export TUNER_COORDINATOR="coordinator_node_ip:port" 51 | export TUNER_WORLDSIZE="YOUR_COMM_GROUP_SIZE" 52 | ``` 53 | 54 | * Specify a process on the coordinator node to create an additional thread to act as a coordinator responsible for listening to the coordinator_node_ip:port. 55 | ```sh 56 | # Setting environment variables only on a certain process 57 | export TUNER_ROLE="COORDINATOR" 58 | ``` 59 | 60 | ## Example 61 | see `autoccl/ext-tuner/example/example/cuda/pytorch/run.sh` 62 | 63 | 64 | ## Citation 65 | If you use autoccl in a scientific publication, we encourage you to add the following reference to the related papers: 66 | ``` 67 | @inproceedings {xu2025autoccl, 68 | author = {Guanbin Xu and Zhihao Le and Yinhe Chen and Zhiqi Lin and Zewen Jin and Youshan Miao and Cheng Li}, 69 | title = {{AutoCCL}: Automated Collective Communication Tuning for Accelerating Distributed and Parallel {DNN} Training}, 70 | booktitle = {22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)}, 71 | year = {2025}, 72 | isbn = {978-1-939133-46-5}, 73 | address = {Philadelphia, PA}, 74 | pages = {667--683}, 75 | url = {https://www.usenix.org/conference/nsdi25/presentation/xu-guanbin}, 76 | publisher = {USENIX Association}, 77 | month = apr 78 | } 79 | ``` 80 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v5.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V5_H_ 6 | #define NCCL_NET_V5_H_ 7 | 8 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; 9 | typedef struct { 10 | // Name of the network (mainly for logs) 11 | const char* name; 12 | // Initialize the network. 13 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 14 | // Return the number of adapters. 15 | ncclResult_t (*devices)(int* ndev); 16 | // Get various device properties. 17 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); 18 | // Create a receiving object and provide a handle to connect to it. The 19 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 20 | // between ranks to create a connection. 21 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 22 | // Connect to a handle and return a sending comm object for that peer. 23 | // This call must not block for the connection to be established, and instead 24 | // should return successfully with sendComm == NULL with the expectation that 25 | // it will be called again until sendComm != NULL. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connect. 28 | // This call must not block for the connection to be established, and instead 29 | // should return successfully with recvComm == NULL with the expectation that 30 | // it will be called again until recvComm != NULL. 31 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 32 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 33 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 34 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 35 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 36 | // Asynchronous send to a peer. 
37 | // May return request == NULL if the call cannot be performed (or would block) 38 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 39 | // Asynchronous recv from a peer. 40 | // May return request == NULL if the call cannot be performed (or would block) 41 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 42 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 43 | // visible to the GPU 44 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 45 | // Test whether a request is complete. If size is not NULL, it returns the 46 | // number of bytes sent/received. 47 | ncclResult_t (*test)(void* request, int* done, int* sizes); 48 | // Close and free send/recv comm objects 49 | ncclResult_t (*closeSend)(void* sendComm); 50 | ncclResult_t (*closeRecv)(void* recvComm); 51 | ncclResult_t (*closeListen)(void* listenComm); 52 | } ncclNet_v5_t; 53 | 54 | #endif // end include guard 55 | -------------------------------------------------------------------------------- /ext-tuner/example/plugin/cuda/wrapper.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import sys 3 | import os 4 | 5 | class Pair(ctypes.Structure): 6 | _fields_ = [("key", ctypes.c_char_p), 7 | ("value", ctypes.c_int)] 8 | 9 | WORKLOAD_SIZE = 3 10 | CANDIDATE_SIZE = 10 11 | 12 | class GroupInfo(ctypes.Structure): 13 | _fields_ = [("groupID", ctypes.c_int64), 14 | ("root", ctypes.c_int32), 15 | ("rank", ctypes.c_int32), 16 | ("nrank", ctypes.c_int32), 17 | ("nnode", ctypes.c_int32)] 18 | 19 | class NCCLCandidateWrapper: 20 | def __init__(self, lib_path=None): 21 | try: 22 | # get current file directory 23 | current_dir = os.path.dirname(os.path.abspath(__file__)) 24 | # get dynamic library path 25 | lib_path = os.path.join(current_dir, 
"../../build/libwrapper.so") 26 | self.lib = ctypes.CDLL(lib_path) 27 | print("Library loaded successfully") 28 | except OSError as e: 29 | print(f"Failed to load library: {e}") 30 | sys.exit(1) 31 | 32 | self.lib.ncclGetValidCandidatesWrapper.argtypes = [ 33 | ctypes.POINTER(GroupInfo), # pointer 34 | ctypes.POINTER(Pair), # pointer 35 | ctypes.c_size_t, 36 | ctypes.POINTER(ctypes.c_uint64), # pointer 37 | ctypes.c_bool, 38 | ctypes.POINTER(ctypes.POINTER(ctypes.c_int32)), # pointer of pointer 39 | ctypes.POINTER(ctypes.c_size_t) # pointer 40 | ] 41 | self.lib.ncclGetValidCandidatesWrapper.restype = None 42 | 43 | self.lib.freeCandidates.argtypes = [ctypes.POINTER(ctypes.c_int32)] 44 | self.lib.freeCandidates.restype = None 45 | 46 | def nccl_get_valid_candidates(self, nrank, nnode, coll, size, tunerEnvs, scale2): 47 | pairs = [(key.encode('utf-8'), value) for key, value in tunerEnvs.items()] 48 | pairCount = len(pairs) 49 | pairPtr = (Pair * pairCount)(*pairs) 50 | 51 | group_info = GroupInfo(groupID=0, root=0, rank=0, nrank=nrank, nnode=nnode) 52 | 53 | workloadElemPtr = (ctypes.c_uint64 * WORKLOAD_SIZE)(*[0, coll, size]) 54 | 55 | candidateElemPtr = ctypes.POINTER(ctypes.c_int32)() # allocate the pointer 56 | candidateElemCount = ctypes.c_size_t() 57 | 58 | self.lib.ncclGetValidCandidatesWrapper( 59 | ctypes.byref(group_info), # get the pointer 60 | pairPtr, 61 | pairCount, 62 | workloadElemPtr, 63 | scale2, 64 | ctypes.byref(candidateElemPtr), # pointer of pointer 65 | ctypes.byref(candidateElemCount) # get the pointer 66 | ) 67 | 68 | candidates = [] 69 | for i in range(int(candidateElemCount.value/CANDIDATE_SIZE)): 70 | candidate = [] 71 | for j in range(CANDIDATE_SIZE): 72 | candidate.append(candidateElemPtr[CANDIDATE_SIZE*i+j]) 73 | candidates.append(candidate) 74 | 75 | self.lib.freeCandidates(candidateElemPtr) 76 | 77 | return candidates 78 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v4.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V4_H_ 6 | #define NCCL_NET_V4_H_ 7 | 8 | #define NCCL_NET_HANDLE_MAXSIZE_V4 64 9 | 10 | typedef struct { 11 | char* name; // Used mostly for logging. 12 | char* pciPath; // Path to the PCI device in /sys. 13 | uint64_t guid; // Unique identifier for the NIC chip. Important for 14 | // cards with multiple PCI functions (Physical or virtual). 15 | int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA 16 | int speed; // Port speed in Mbps. 17 | int port; // Port number. 18 | int maxComms; // Maximum number of comms we can create 19 | } ncclNetProperties_v4_t; 20 | 21 | // v4 struct for backwards compatibility 22 | typedef struct { 23 | // Name of the network (mainly for logs) 24 | const char* name; 25 | // Initialize the network. 26 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 27 | // Return the number of adapters. 28 | ncclResult_t (*devices)(int* ndev); 29 | // Get various device properties. 30 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); 31 | // Create a receiving object and provide a handle to connect to it. The 32 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 33 | // between ranks to create a connection. 34 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 35 | // Connect to a handle and return a sending comm object for that peer. 36 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 37 | // Finalize connection establishment after remote peer has called connectHandle 38 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 39 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 40 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
41 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 42 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 43 | // Asynchronous send to a peer. 44 | // May return request == NULL if the call cannot be performed (or would block) 45 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 46 | // Asynchronous recv from a peer. 47 | // May return request == NULL if the call cannot be performed (or would block) 48 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 49 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 50 | // visible to the GPU 51 | ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); 52 | // Test whether a request is complete. If size is not NULL, it returns the 53 | // number of bytes sent/received. 54 | ncclResult_t (*test)(void* request, int* done, int* size); 55 | // Close and free send/recv comm objects 56 | ncclResult_t (*closeSend)(void* sendComm); 57 | ncclResult_t (*closeRecv)(void* recvComm); 58 | ncclResult_t (*closeListen)(void* listenComm); 59 | } ncclNet_v4_t; 60 | 61 | #endif // end include guard 62 | -------------------------------------------------------------------------------- /src/misc/argcheck.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "argcheck.h" 8 | #include "comm.h" 9 | 10 | static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { 11 | cudaPointerAttributes attr; 12 | cudaError_t err = cudaPointerGetAttributes(&attr, pointer); 13 | if (err != cudaSuccess || attr.devicePointer == NULL) { 14 | WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); 15 | return ncclInvalidArgument; 16 | } 17 | #if CUDART_VERSION >= 10000 18 | if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 19 | #else 20 | if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 21 | #endif 22 | WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); 23 | return ncclInvalidArgument; 24 | } 25 | return ncclSuccess; 26 | } 27 | 28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { 29 | if (ptr == NULL) { 30 | WARN("%s : %s argument is NULL", opname, ptrname); 31 | return ncclInvalidArgument; 32 | } 33 | return ncclSuccess; 34 | } 35 | 36 | ncclResult_t ArgsCheck(struct ncclInfo* info) { 37 | // First, the easy ones 38 | if (info->root < 0 || info->root >= info->comm->nRanks) { 39 | WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); 40 | return ncclInvalidArgument; 41 | } 42 | if (info->datatype < 0 || info->datatype >= ncclNumTypes) { 43 | WARN("%s : invalid type %d", info->opName, info->datatype); 44 | return ncclInvalidArgument; 45 | } 46 | // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
47 | NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); 48 | 49 | if (info->op < 0 || ncclMaxRedOp < info->op) { 50 | WARN("%s : invalid reduction operation %d", info->opName, info->op); 51 | return ncclInvalidArgument; 52 | } 53 | int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); 54 | if (ncclNumOps <= info->op && 55 | (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { 56 | WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); 57 | return ncclInvalidArgument; 58 | } 59 | 60 | if (info->comm->checkPointers) { 61 | if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { 62 | if (info->count >0) 63 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); 64 | } else { 65 | // Check CUDA device pointers 66 | if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { 67 | NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); 68 | } 69 | if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { 70 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); 71 | } 72 | } 73 | } 74 | return ncclSuccess; 75 | } 76 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/internal/database.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "src/include/datatype.h" 8 | #include "src/include/internal/logging.h" 9 | #include "src/include/timer.h" 10 | 11 | class Database { 12 | public: 13 | // TODO: reload from previous database 14 | RecordValue fetct(const RecordKey &key) { 15 | std::lock_guard lock(this->mutex); 16 | // pthread_mutex_lock(&this->mutex); 17 | if (table.count(key) == 0) { 18 | // pthread_mutex_unlock(&this->mutex); 19 | return {-1, 0}; 20 | } else { 21 | auto item = 
table[key]; 22 | // pthread_mutex_unlock(&this->mutex); 23 | return item; 24 | } 25 | } 26 | 27 | void add(const Record &record) { 28 | std::lock_guard lock(this->mutex); 29 | // pthread_mutex_lock(&this->mutex); 30 | if (table.count(record.key) == 0) { 31 | table[record.key] = record.value; 32 | } else { 33 | table[record.key] = 34 | RecordValue((table[record.key].duration * table[record.key].repeat + 35 | record.value.duration * record.value.repeat) / 36 | (table[record.key].repeat + record.value.repeat), 37 | (table[record.key].repeat + record.value.repeat)); 38 | } 39 | log.push_back(record); 40 | // pthread_mutex_unlock(&this->mutex); 41 | } 42 | std::string dataDebugStr(); 43 | std::string logDebugStr(); 44 | 45 | private: 46 | std::map table; 47 | std::vector log; 48 | std::mutex mutex; 49 | // pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 50 | }; 51 | 52 | std::string Database::dataDebugStr() { 53 | std::lock_guard lock(this->mutex); 54 | // pthread_mutex_lock(&this->mutex); 55 | std::stringstream ss; // NOLINT 56 | // the sorted records 57 | std::vector> sortedRecords; 58 | 59 | // group records by map 60 | std::map>> groupedRecords; 61 | for (const auto& entry : this->table) { 62 | groupedRecords[entry.first.workload].emplace_back(entry); 63 | } 64 | // sort each group by duration value 65 | for (auto& group : groupedRecords) { 66 | auto& records = group.second; 67 | std::sort(records.begin(), records.end(), [](const auto& a, const auto& b) { 68 | return a.second.duration < b.second.duration; 69 | }); 70 | // update sortedRecords 71 | sortedRecords.insert(sortedRecords.end(), records.begin(), records.end()); 72 | } 73 | 74 | for (const auto &pair : sortedRecords) { 75 | ss << pair.first.debugStr(); 76 | ss << pair.second.debugStr() << "\n"; 77 | } 78 | // pthread_mutex_unlock(&this->mutex); 79 | return ss.str(); 80 | } 81 | 82 | std::string Database::logDebugStr() { 83 | std::lock_guard lock(this->mutex); 84 | // pthread_mutex_lock(&this->mutex); 85 
| std::stringstream ss; // NOLINT 86 | for (const auto &record : log) { 87 | ss << record.debugStr() << "\n"; 88 | } 89 | // pthread_mutex_unlock(&this->mutex); 90 | return ss.str(); 91 | 92 | // TODO: chrome timeline 93 | } 94 | -------------------------------------------------------------------------------- /src/misc/tuner.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "debug.h" 13 | #include "nccl_tuner.h" 14 | 15 | pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; 16 | static int tunerPluginRefCount = -1; 17 | static void* tunerPluginLib = nullptr; 18 | ncclTuner_t* tunerSymbol = nullptr; 19 | 20 | __attribute__((visibility("default"))) ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) { 21 | // Initialize to nullptr by default if plugin tuner cannot be loaded. 
22 | *tuner = nullptr; 23 | if (tunerPluginRefCount == -2) return ncclSuccess; 24 | 25 | pthread_mutex_lock(&tunerPluginLock); 26 | if (tunerPluginRefCount == -1) { 27 | tunerPluginRefCount = -2; // Default: no plugin, don't try again later 28 | 29 | const char* name = getenv("NCCL_TUNER_PLUGIN"); 30 | if (name) { 31 | INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name); 32 | tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL); 33 | } 34 | if (tunerPluginLib == nullptr) { 35 | // dlopen does not guarantee to set errno, but dlerror only gives us a 36 | // string, so checking errno doesn't hurt to try to provide a better 37 | // error message 38 | if (errno == ENOENT) { 39 | INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name); 40 | } else { 41 | INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror()); 42 | } 43 | } else { 44 | tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL); 45 | if (tunerSymbol == nullptr) { 46 | INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name); 47 | dlclose(tunerPluginLib); 48 | tunerPluginLib = nullptr; 49 | } else { 50 | INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name); 51 | tunerPluginRefCount = 0; 52 | } 53 | } 54 | } 55 | 56 | if (tunerPluginRefCount >= 0) { 57 | *tuner = tunerSymbol; 58 | INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name); 59 | tunerPluginRefCount++; 60 | } 61 | pthread_mutex_unlock(&tunerPluginLock); 62 | return ncclSuccess; 63 | } 64 | 65 | __attribute__((visibility("default"))) ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) { 66 | if (*tuner == nullptr) return ncclSuccess; 67 | pthread_mutex_lock(&tunerPluginLock); 68 | if (--tunerPluginRefCount == 0) { 69 | if (tunerPluginLib == nullptr) { 70 | WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n"); 71 | } else { 72 | 
INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name); 73 | dlclose(tunerPluginLib); 74 | } 75 | tunerPluginLib = nullptr; 76 | tunerSymbol = nullptr; 77 | *tuner = nullptr; 78 | tunerPluginRefCount = -1; 79 | } 80 | pthread_mutex_unlock(&tunerPluginLock); 81 | return ncclSuccess; 82 | } -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDART 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); 18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); 19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); 20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); 21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); 22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); 23 | 24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) 25 | { 26 | #ifndef NVTX_DISABLE 27 | nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; 28 | if(local!=0) 29 | (*local)(device, name); 30 | #endif 
/*NVTX_DISABLE*/ 31 | } 32 | 33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) 34 | { 35 | #ifndef NVTX_DISABLE 36 | nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; 37 | if(local!=0) 38 | (*local)(device, name); 39 | #endif /*NVTX_DISABLE*/ 40 | } 41 | 42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) 43 | { 44 | #ifndef NVTX_DISABLE 45 | nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; 46 | if(local!=0) 47 | (*local)(stream, name); 48 | #endif /*NVTX_DISABLE*/ 49 | } 50 | 51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) 52 | { 53 | #ifndef NVTX_DISABLE 54 | nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; 55 | if(local!=0) 56 | (*local)(stream, name); 57 | #endif /*NVTX_DISABLE*/ 58 | } 59 | 60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) 61 | { 62 | #ifndef NVTX_DISABLE 63 | nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; 64 | if(local!=0) 65 | (*local)(event, name); 66 | #endif /*NVTX_DISABLE*/ 67 | } 68 | 69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) 70 | { 71 | #ifndef NVTX_DISABLE 72 | nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; 73 | if(local!=0) 74 | (*local)(event, name); 75 | #endif /*NVTX_DISABLE*/ 76 | } 77 | 78 | #ifdef __cplusplus 79 | } /* extern "C" */ 80 | #endif /* __cplusplus */ 81 | 82 | -------------------------------------------------------------------------------- 
/src/include/coll_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COLL_NET_H_ 8 | #define COLL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | 13 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 14 | 15 | // Translation to external API 16 | static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } 17 | static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } 18 | static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } 19 | static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } 20 | static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } 21 | static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } 22 | static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } 23 | /* DMA-BUF support */ 24 | static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* 
comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } 25 | static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } 26 | static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { 27 | NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } 28 | static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } 29 | static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } 30 | static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } 31 | static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } 32 | 33 | static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/include/nvtx.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NVTX_H_ 8 | #define NCCL_NVTX_H_ 9 | 10 | #include "nvtx3/nvtx3.hpp" 11 | 12 | #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) 13 | #define NVTX3_CONSTEXPR_IF_CPP14 constexpr 14 | #else 15 | #define NVTX3_CONSTEXPR_IF_CPP14 16 | #endif 17 | 18 | // Define all NCCL-provided static schema IDs here (avoid duplicates). 19 | #define NVTX_SID_CommInitRank 0 20 | #define NVTX_SID_CommInitAll 1 21 | #define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank 22 | #define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank 23 | #define NVTX_SID_AllGather 4 24 | #define NVTX_SID_AllReduce 5 25 | #define NVTX_SID_Broadcast 6 26 | #define NVTX_SID_ReduceScatter 7 27 | #define NVTX_SID_Reduce 8 28 | #define NVTX_SID_Send 9 29 | #define NVTX_SID_Recv 10 30 | 31 | // Define static schema ID for the reduction operation. 
32 | #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START 33 | 34 | extern const nvtxDomainHandle_t ncclNvtxDomainHandle; 35 | 36 | struct nccl_domain{static constexpr char const* name{"NCCL"};}; 37 | 38 | class payload_schema { 39 | public: 40 | explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept 41 | { 42 | schema_attr.name = schemaName; 43 | schema_attr.entries = entries; 44 | schema_attr.numEntries = numEntries; 45 | schema_attr.schemaId = schemaId; 46 | nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); 47 | } 48 | 49 | payload_schema() = delete; 50 | ~payload_schema() = default; 51 | payload_schema(payload_schema const&) = default; 52 | payload_schema& operator=(payload_schema const&) = default; 53 | payload_schema(payload_schema&&) = default; 54 | payload_schema& operator=(payload_schema&&) = default; 55 | 56 | private: 57 | nvtxPayloadSchemaAttr_t schema_attr{ 58 | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | 59 | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | 60 | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | 61 | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | 62 | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, 63 | nullptr, 64 | NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, 65 | NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 66 | nullptr, 0, 0, 0}; 67 | }; 68 | 69 | // Create NVTX push/pop range with parameters 70 | // @param name of the operation (see `NVTX_SID_*`) 71 | // @param N schema name 72 | // @param S schema (entries) 73 | // @param P payload (struct) 74 | #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ 75 | static const payload_schema schema{S, std::extent::value, \ 76 | NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ 77 | static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ 78 | nvtxPayloadData_t nvtx3_bpl__[] = { \ 79 | {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ 80 | 
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ 81 | ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; 82 | 83 | extern void initNvtxRegisteredEnums(); 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /ext-tuner/example/utils/get_candidates.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import argparse 5 | 6 | current_dir = os.path.dirname(os.path.abspath(__file__)) 7 | wrapper_dir = os.path.join(current_dir, "../plugin/cuda/") 8 | sys.path.append(wrapper_dir) 9 | from wrapper import NCCLCandidateWrapper 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--nrank", type=int, default=8) 14 | parser.add_argument("--nnode", type=int, default=1) 15 | parser.add_argument("--coll", type=str, default="AllReduce") 16 | parser.add_argument("--size", type=int, default=1024, help="nbyte per rank") 17 | parser.add_argument("--expire", type=int, default=10, help="expire per candidate, -1 is forever") 18 | parser.add_argument("--scale2", action='store_true', default=False, help="") 19 | args = parser.parse_args() 20 | coll_name = f"nnode{args.nnode}_nrank{args.nrank}_coll{args.coll}_size{args.size}" 21 | coll_to_key = { 22 | "Broadcast" : 0, 23 | "Reduce" : 1, 24 | "AllGather" : 2, 25 | "ReduceScatter" : 3, 26 | "AllReduce" : 4, 27 | "SendRecv" : 5, 28 | "All2All" : 9 29 | } 30 | 31 | wrapper = NCCLCandidateWrapper() 32 | # p2plevel=7 by default in NCCL 33 | # gdrlevel=4 by default in NCCL 34 | map_dict = { 35 | "tuner_extraP2PCE": 0, 36 | "tuner_extraSHM": 1, 37 | "tuner_p2pLevel": -1, 38 | # set tuner_p2pChunkSize=512*1024 for nvlink-1-node 39 | "tuner_p2pChunkSize": 128*1024, 40 | "tuner_p2pnChannelsPerPeer": 128, 41 | "tuner_p2pnChannels": 128, 42 | "tuner_nChannels": 128, 43 | "tuner_treeupdown_allreduce_simple": 0 44 | } 45 | 
candidates = wrapper.nccl_get_valid_candidates( 46 | nrank=args.nrank, 47 | nnode=args.nnode, 48 | coll=coll_to_key[args.coll], 49 | size=args.size, 50 | tunerEnvs=map_dict, 51 | scale2=args.scale2) 52 | 53 | count_after_filter = 0 54 | file = open(f"{coll_name}.txt", 'w') 55 | print(f"writting {coll_name}.txt") 56 | last_output = None 57 | for candidate in candidates: 58 | # you can filter the candidate here 59 | algo, proto, copytype, p2plevel, nc, nt, chunksize, _, _, _ = candidate 60 | '''example 61 | if algo != 1: 62 | continue 63 | if proto != 2: 64 | continue 65 | if nc & (nc - 1) != 0: 66 | continue 67 | if nt & (nt - 1) != 0: 68 | continue 69 | if chunksize & (chunksize - 1) != 0: 70 | continue 71 | ''' 72 | count_after_filter += 1 73 | specific_candidate = f"{args.nrank}; -1 {coll_to_key[args.coll]} {args.size};{' '.join(map(str, candidate))}" 74 | output = f"{specific_candidate};{args.expire}" 75 | last_output = f"{specific_candidate};-1" 76 | file.write(output + '\n') 77 | file.write(last_output + '\n') 78 | file.close() 79 | 80 | print(f"The communication: nrank={args.nrank}, nnode={args.nnode}, coll={args.coll}, sizePerRank={args.size},\n" 81 | f"tunerEnvs={map_dict},\n" 82 | f"scale2={args.scale2}, has {len(candidates)} candidates, after filter: {count_after_filter} candidates.") 83 | 84 | print(f"set environment variable TUNER_PROFILE_MORE={count_after_filter*args.expire},\n" 85 | f"then call {count_after_filter*args.expire} times {coll_name},\n" 86 | f"add more repeats will be safer.") 87 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/optimizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "src/include/jobs/job.h" 11 | #include "src/include/tuner.h" 12 | #include "src/include/internal/env.h" 13 | 
#include "src/misc/math.h" 14 | 15 | class Tuner; 16 | 17 | class Optimizer { 18 | public: 19 | explicit Optimizer(const CandidateFunc &getValidCandidatesFunc) 20 | : getValidCandidatesFunc(getValidCandidatesFunc) { 21 | initEnv(); 22 | } 23 | void initialize(Tuner *tuner) { this->tuner = tuner; } 24 | virtual void 25 | addJob(const Info &myInfo, 26 | const std::unordered_map &allGroupInfos, 27 | const GIDTYPE &groupID, const Workload &workload, 28 | Candidate &startCandidate) = 0; 29 | // TODO: multithread, workload-specific 30 | virtual void run(const std::map &status, 31 | const std::vector &records, 32 | ResultPack *resultPack) = 0; 33 | virtual void pick(const std::map &status, 34 | DecisionPack *decisionPack) = 0; 35 | 36 | std::string debugStr() const { 37 | std::stringstream ss; // NOLINT 38 | ss << "mode=" << Environment::get()->find("TUNER_MODE", "-1") << "\n"; 39 | ss << "warmupSteps=" << warmupSteps << "\n"; 40 | ss << "pretrainSteps=" << pretrainSteps << "\n"; 41 | ss << "trainSteps=" << trainSteps << "\n"; 42 | ss << "roundMaxSteps=" << roundMaxSteps << "\n"; 43 | ss << "optimumExpireSteps=" << optimumExpireSteps << "\n"; 44 | ss << "expireSteps=" << expireSteps << "\n"; 45 | std::lock_guard lock(this->jobsMutex); 46 | for (auto &pair : this->jobs) { 47 | const auto &workload = pair.first; 48 | const auto &job = pair.second; 49 | ss << job->debugStr() << "\n";; 50 | } 51 | return ss.str(); 52 | } 53 | 54 | protected: 55 | mutable std::mutex jobsMutex; 56 | // pthread_mutex_t jobsMutex = PTHREAD_MUTEX_INITIALIZER; 57 | std::map> jobs; 58 | Tuner *tuner; 59 | CandidateFunc getValidCandidatesFunc; 60 | int32_t warmupSteps; 61 | int32_t pretrainSteps; 62 | int32_t trainSteps; 63 | int32_t roundMaxSteps; 64 | int32_t optimumExpireSteps; 65 | int32_t expireSteps; 66 | 67 | private: 68 | void initEnv(); 69 | }; 70 | 71 | void Optimizer::initEnv() { 72 | warmupSteps = Environment::get()->find("TUNER_WARMUP_STEPS", 5); 73 | CHECK(warmupSteps >= 1); 74 | 
pretrainSteps = Environment::get()->find("TUNER_PRETRAIN_STEPS", 20); 75 | CHECK(pretrainSteps >= 1); 76 | trainSteps = Environment::get()->find("TUNER_TRAIN_STEPS", 50); 77 | CHECK(trainSteps >= 1); 78 | roundMaxSteps = Environment::get()->find("TUNER_ROUND_MAX_STEPS", 10); 79 | CHECK(roundMaxSteps >= 1); 80 | optimumExpireSteps = Environment::get()->find("TUNER_OPTIMUM_EXPIRE", 10); 81 | expireSteps = Environment::get()->find("TUNER_PROFILE_REPEAT", 1); 82 | CHECK(pretrainSteps + trainSteps >= expireSteps); 83 | CHECK(expireSteps > 0); 84 | INFO(Logger::LogSubSys::OPTIMIZER) << "set warmupSteps=" << warmupSteps 85 | << " pretrainSteps=" << pretrainSteps << " trainSteps=" << trainSteps 86 | << " roundMaxSteps=" << roundMaxSteps 87 | << " optimumExpireSteps=" << optimumExpireSteps 88 | << " expireSteps=" << expireSteps; 89 | } 90 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #define NVTX_EXT_IMPL_GUARD 14 | #include "nvtxExtImpl.h" 15 | #undef NVTX_EXT_IMPL_GUARD 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ 22 | NAME##_v##VERSION##_mem##COMPATID 23 | #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ 24 | NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) 25 | #define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ 26 | NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD) 27 | 28 | /* 29 | * Function slots for the binary payload extension. First entry is the module 30 | * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). 31 | */ 32 | NVTX_LINKONCE_DEFINE_GLOBAL intptr_t 33 | NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1] 34 | = {0}; 35 | 36 | NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() 37 | { 38 | intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; 39 | nvtxExtModuleSegment_t segment = { 40 | 0, // unused (only one segment) 41 | NVTX3EXT_CBID_PAYLOAD_FN_NUM, 42 | fnSlots 43 | }; 44 | 45 | nvtxExtModuleInfo_t module = { 46 | NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), 47 | NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD, 48 | 1, &segment, // number of segments, segments 49 | NULL, // no export function needed 50 | // bake type sizes and alignment information into program binary 51 | &nvtxExtPayloadTypeInfo 52 | }; 53 | 54 | NVTX_INFO( "%s\n", __FUNCTION__ ); 55 | 56 | NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, 57 | NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); 58 | } 59 | 60 | #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ 61 | typedef ret_val ( * fn_name##_impl_fntype )signature; \ 62 | NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \ 63 | intptr_t slot = 
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 64 | if (slot != NVTX_EXTENSION_DISABLED) { \ 65 | if (slot) { \ 66 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 67 | } else { \ 68 | NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ 69 | slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 70 | if (slot != NVTX_EXTENSION_DISABLED && slot) { \ 71 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 72 | } \ 73 | } \ 74 | } \ 75 | return ((ret_val)(intptr_t)-1); \ 76 | } 77 | 78 | NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr)) 79 | 80 | NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr)) 81 | 82 | #undef NVTX_EXT_FN_IMPL 83 | 84 | #ifdef __cplusplus 85 | } /* extern "C" */ 86 | #endif /* __cplusplus */ -------------------------------------------------------------------------------- /ext-tuner/example/plugin/cuda/plugin.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_TUNER_H_ 9 | #define NCCL_TUNER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "nccl.h" 15 | #include "src/cuda/nccl_params.h" 16 | 17 | // API to be implemented by external tuner 18 | struct ncclTuner_v1_t { 19 | // Name of the tuner 20 | const char *name; 21 | 22 | // Initializes tuner states. 23 | // nRanks: number of ranks in current communicator. Each communicator 24 | // initialize its own tuner. 
nNodes: number of nodes in current 25 | // communicator. logFunction: a logFunction can be useful to integrate 26 | // logging together with NCCL core. 27 | ncclResult_t (*init)(uint64_t commHash, size_t nRanks, size_t nNodes, 28 | size_t rank, size_t node, size_t device, std::map tunerEnvs, 29 | void *handler); 30 | 31 | // Gets info (algo, protocol, number of ctas and threads) for a given 32 | // collective. Inputs: 33 | // - collType: collective type , e.g., allreduce, allgather… 34 | // - nBytes: collective size in bytes 35 | // - collNetSupport: whether collnet supports this type 36 | // - nvlsSupport: whether nvlink sharp supports this time 37 | // - numPipeOps: number of operations in the group 38 | // 39 | // Outputs: 40 | // - algorithm: selected algorithm to be used for the given collective 41 | // - protocol: selected protocol to be used for the given collective 42 | // - nChannels: number of channels (hence SMs) to be used. 43 | // 44 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 45 | // default tuning for the given collective. 46 | // Also, the plugin is allowed to not set any output, or set only the 47 | // algorithm and protocol, but not only the algorithm or only the protocol. 48 | // Unset fields will be set automatically by NCCL. 49 | ncclResult_t (*getCandidate)(uint64_t commHash, ncclFunc_t collType, 50 | size_t nBytes, int *algorithm, int *protocol, 51 | int *isCopyEngineNotSmCopy, int *p2pLevel, int *nChannels, 52 | int *nThreads, int *chunkSize, int *iteration, 53 | int *lastIterEffectiveChunksize, int *native); 54 | 55 | // Terminates the plugin and cleans up any resources that the plugin 56 | // allocated. 
57 | ncclResult_t (*destroy)(uint64_t commHash); 58 | // Profiles the communication 59 | ncclResult_t (*startProfiling)(uint64_t commHash, cudaStream_t stream, 60 | ncclFunc_t collType, size_t nBytes, 61 | int algorithm, int protocol, 62 | int isCopyEngineNotSmCopy, int p2pLevel, int nChannels, 63 | int nThreads, int chunkSize, int iteration, 64 | int lastIterEffectiveChunksize, int native); 65 | ncclResult_t (*stopProfiling)(uint64_t commHash); 66 | ncclResult_t (*isNewWorkload)(uint64_t commHash, ncclFunc_t collType, size_t nBytes, bool* flag); 67 | // Workload workload; 68 | // Candidate candidate; 69 | }; 70 | 71 | using ncclTuner_t = ncclTuner_v1_t; 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_SYNC 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
#endif


#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* Function-pointer types mirroring the public NVTX sync API. The versioned
 * globals struct holds one slot per entry point; an attached tools backend
 * fills the slots at injection time. */
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);

/* Create a user-defined synchronization object. Forwards to the injected
 * implementation when one is attached; otherwise returns a null handle. */
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    if(local!=0)
        return (*local)(domain, attribs);
    else
#endif /*NVTX_DISABLE*/
        return (nvtxSyncUser_t)0;
}

/* Destroy a sync object. No-op when no tools backend is attached. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

/* Signal the start of an acquire attempt on the sync object. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

NVTX_DECLSPEC void NVTX_API
nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    /* Signal that the acquire attempt did not obtain the object. */
    nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

/* Signal that the acquire attempt succeeded. No-op without a backend. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

/* Signal that the sync object is about to be released. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v6.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

#ifndef NCCL_NET_V6_H_
#define NCCL_NET_V6_H_

/* Per-device properties reported by a v6 network plugin. */
typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
  uint64_t guid;  // Unique identifier for the NIC chip. Important for
                  // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;      // Port speed in Mbps.
  int port;       // Port number.
  float latency;  // Network latency
  int maxComms;   // Maximum number of comms we can create
  int maxRecvs;   // Maximum number of grouped receives.
}ncclNetProperties_v6_t;

typedef ncclNetProperties_v6_t ncclNetProperties_t;

/* v6 network plugin interface: a table of function pointers NCCL calls to
 * drive an external transport. */
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  // NOTE(review): in the v6 ABI the registration size is a plain int;
  // the DMA-BUF variant below takes size_t.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;

#endif // end include guard
--------------------------------------------------------------------------------
/src/include/nccl_tuner.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_

// NOTE(review): the targets of the next two #include directives were stripped
// in this copy of the file (angle-bracket contents lost during extraction);
// restore them (likely <cstdint>/<map> or similar) before building.
#include
#include
#include "nccl.h"
#include "devcomm.h"

// API to be implemented by external tuner
typedef struct {
  // Name of the tuner
  const char* name;

  // Initializes tuner states.
  // nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
  // nNodes: number of nodes in current communicator.
  // logFunction: a logFunction can be useful to integrate logging together with NCCL core.
  // NOTE(review): std::map's template arguments appear stripped here —
  // presumably std::map<std::string, std::string> for tunerEnvs; confirm
  // against the plugin-side declaration before building.
  ncclResult_t (*init)(uint64_t commHash, size_t nRanks, size_t nNodes, size_t rank, size_t node, size_t device, std::map tunerEnvs, void* handler);

  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
  // Inputs:
  //   - collType: collective type , e.g., allreduce, allgather…
  //   - nBytes: collective size in bytes
  //   - collNetTypeSupport: whether collnet supports this type
  //   - nvlsTypeSupport: whether nvlink sharp supports this time
  //   - numPipeOps: number of operations in the group
  //
  // Outputs:
  //   - algorithm: selected algorithm to be used for the given collective
  //   - protocol: selected protocol to be used for the given collective
  //   - nChannels: number of channels (hence SMs) to be used.
  //
  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
  // default tuning for the given collective.
  // Also, the plugin is allowed to not set any output, or set only the
  // algorithm and protocol, but not only the algorithm or only the protocol.
  // Unset fields will be set automatically by NCCL.
45 | ncclResult_t (*getCandidate)(uint64_t commHash, ncclFunc_t collType, size_t nBytes, 46 | int *algorithm, int *protocol, int* isCopyEngineNotSmCopy, int* p2pLevel, 47 | int* nChannels, int* nThreads, 48 | int* chunkSize, int* iteration, int* lastIterEffectiveChunksize, int* native); 49 | 50 | // Terminates the plugin and cleans up any resources that the plugin allocated. 51 | ncclResult_t (*destroy)(uint64_t commHash); 52 | // Profiles the communication 53 | ncclResult_t (*startProfiling)(uint64_t commHash, cudaStream_t stream, ncclFunc_t collType, size_t nBytes, 54 | int algorithm, int protocol, int isCopyEngineNotSmCopy, int p2pLevel, 55 | int nChannels, int nThreads, 56 | int chunkSize, int iteration, int lastIterEffectiveChunksize, int native); 57 | ncclResult_t (*stopProfiling)(uint64_t commHash); 58 | ncclResult_t (*isNewWorkload)(uint64_t commHash, ncclFunc_t collType, size_t nBytes, bool* flag); 59 | } ncclTuner_v1_t; 60 | 61 | typedef ncclTuner_v1_t ncclTuner_t; 62 | 63 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" 64 | 65 | #define TUNER_P2PCHUNKSIZE 128 * 1024 66 | #define SIMPLE_P2PCHUNKSIZE_UPPER_BOUND 128*1024 67 | #define LL128_P2PCHUNKSIZE_UPPER_BOUND 128*1024 68 | #define LL_P2PCHUNKSIZE_UPPER_BOUND 80*1024 69 | static_assert(SIMPLE_P2PCHUNKSIZE_UPPER_BOUND <= TUNER_P2PCHUNKSIZE); 70 | static_assert(LL128_P2PCHUNKSIZE_UPPER_BOUND <= TUNER_P2PCHUNKSIZE); 71 | static_assert(LL_P2PCHUNKSIZE_UPPER_BOUND <= TUNER_P2PCHUNKSIZE); 72 | ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv); 73 | #endif 74 | -------------------------------------------------------------------------------- /ext-tuner/example/src/cuda/nccl_socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_SOCKET_H_
#define NCCL_SOCKET_H_

// NOTE(review): the targets of the next six #include directives were stripped
// in this copy of the file (angle-bracket contents lost during extraction);
// restore the system headers (sockets/netdb/stdint etc.) before building.
#include
#include
#include
#include
#include
#include
#include "nccl.h"

#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES \
  2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES \
  3 // connection timed out retry times (each one can take 20s)
#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL

/* Common socket address storage structure for IPv4/IPv6 */
union ncclSocketAddress {
  struct sockaddr sa;
  struct sockaddr_in sin;
  struct sockaddr_in6 sin6;
};

/* Lifecycle states a socket moves through from init to close/error. */
enum ncclSocketState {
  ncclSocketStateNone = 0,
  ncclSocketStateInitialized = 1,
  ncclSocketStateAccepting = 2,
  ncclSocketStateAccepted = 3,
  ncclSocketStateConnecting = 4,
  ncclSocketStateConnectPolling = 5,
  ncclSocketStateConnected = 6,
  ncclSocketStateReady = 7,
  ncclSocketStateClosed = 8,
  ncclSocketStateError = 9,
  ncclSocketStateNum = 10
};

/* Role of the socket, used to distinguish bootstrap/proxy/net traffic. */
enum ncclSocketType {
  ncclSocketTypeUnknown = 0,
  ncclSocketTypeBootstrap = 1,
  ncclSocketTypeProxy = 2,
  ncclSocketTypeNetSocket = 3,
  ncclSocketTypeNetIb = 4
};

/* A single socket plus its retry counters, peer address and state. */
struct ncclSocket {
  int fd;                       // connected/data fd
  int acceptFd;                 // listening fd used while accepting
  int timedOutRetries;          // retries spent on timeouts so far
  int refusedRetries;           // retries spent on ECONNREFUSED so far
  union ncclSocketAddress addr; // peer (or bind) address
  volatile uint32_t *abortFlag; // set externally to abort blocking calls
  int asyncFlag;                // non-blocking mode when non-zero
  enum ncclSocketState state;
  int salen;                    // valid length of addr
  uint64_t magic;               // handshake magic (NCCL_SOCKET_MAGIC)
  enum ncclSocketType type;
};

// Initialize a socket
ncclResult_t ncclSocketInit(struct ncclSocket *sock,
                            union ncclSocketAddress
*addr = NULL,
                            uint64_t magic = NCCL_SOCKET_MAGIC,
                            enum ncclSocketType type = ncclSocketTypeUnknown,
                            volatile uint32_t *abortFlag = NULL,
                            int asyncFlag = 0);
// Connect to sock->addr. sock->fd is set after a successful call.
ncclResult_t ncclSocketConnect(struct ncclSocket *sock);
// Accept an incoming connection from listenSock->fd and keep the file
// descriptor in sock->fd, with the remote side IP/port in sock->addr.
ncclResult_t ncclSocketAccept(struct ncclSocket *sock,
                              struct ncclSocket *ulistenSock);

/* Direction tags for socket progress calls. */
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1

ncclResult_t ncclSocketSend(struct ncclSocket *sock, void *ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket *sock, void *ptr, int size);
ncclResult_t ncclSocketTryRecv(struct ncclSocket *sock, void *ptr, int size,
                               int *closed, bool blocking);
ncclResult_t ncclSocketClose(struct ncclSocket *sock);

/* Linked-list node for connections that arrived before being expected. */
struct unexConn {
  int peer;
  int tag;
  struct ncclSocket sock;
  struct unexConn *next;
};

/* Bootstrap ring state: listen socket, ring neighbors, peer addresses. */
struct bootstrapState {
  struct ncclSocket listenSock;
  struct ncclSocket ringRecvSocket;
  struct ncclSocket ringSendSocket;
  union ncclSocketAddress *peerCommAddresses;
  union ncclSocketAddress *peerProxyAddresses;
  struct unexConn *unexpectedConnections;
  int cudaDev;
  int rank;
  int nranks;
  uint64_t magic;
  volatile uint32_t *abortFlag;
};

#endif
--------------------------------------------------------------------------------
/ext-tuner/example/example/cuda/pytorch/demo.py:
--------------------------------------------------------------------------------
import torch
import torch.distributed as dist
import argparse
import os
import datetime
import sys
import numpy as np

def profiling(args, nBytes, repeat):
    # Time `repeat` all_reduce calls of nBytes and print the bandwidth.
    data_type = torch.float32
    data_type_size =
torch.finfo(data_type).bits // 8

    # Nothing to measure for zero repeats or sub-element sizes.
    if repeat == 0 or nBytes//data_type_size == 0:
        return
    comm_begin_event = torch.cuda.Event(enable_timing=True)
    comm_end_event = torch.cuda.Event(enable_timing=True)

    tensor = torch.randn([1, nBytes//data_type_size], dtype=data_type).to(args.local_rank)
    # First pass: `repeat` iterations, during which the tuner may still be training.
    comm_begin_event.record()
    for i in range(repeat):
        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    comm_end_event.record()
    comm_end_event.synchronize()  # blocking CPU thread until the event completes.
    comm_gpu_time = comm_begin_event.elapsed_time(comm_end_event)/repeat
    GB_s = nBytes*1.0/1024/1024/1024/(comm_gpu_time/1000)
    if args.rank == 0:
        print(f"avged by repeat: {repeat:>5}, bytes={nBytes:>10}, nrank={args.world_size:>4}: {GB_s:>5.2f} GB/s")

    # Second pass: fixed 100 iterations, measured after pretuning.
    comm_begin_event.record()
    for i in range(100):
        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    comm_end_event.record()
    comm_end_event.synchronize()  # blocking CPU thread until the event completes.
34 | comm_gpu_time = comm_begin_event.elapsed_time(comm_end_event)/100 35 | GB_s = nBytes*1.0/1024/1024/1024/(comm_gpu_time/1000) 36 | if args.rank == 0: 37 | print(f"after pretuning, avged by repeat 100, bytes={nBytes:>10}, nrank={args.world_size:>4}: {GB_s:>5.2f} GB/s") 38 | 39 | torch.cuda.synchronize(args.local_rank) 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--rank", type=int, default=0, help="global rank") 44 | parser.add_argument("--local_rank", type=int, default=0, help="local rank") 45 | parser.add_argument("--world_size", type=int, default=1, help="world size") 46 | parser.add_argument("--master_ip", type=str, default="localhost", help="master ip") 47 | parser.add_argument("--master_port", type=str, default="6000", help="master port") 48 | 49 | args = parser.parse_args() 50 | if os.getenv('OMPI_COMM_WORLD_SIZE') is not None: # for mpirun 51 | args.rank = int(os.environ.get('OMPI_COMM_WORLD_RANK', args.rank)) 52 | args.local_rank = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', args.local_rank)) 53 | args.world_size = int(os.environ.get('OMPI_COMM_WORLD_SIZE', args.world_size)) 54 | os.environ['MASTER_ADDR'] = str(args.master_ip) 55 | os.environ['MASTER_PORT'] = str(args.master_port) 56 | elif os.getenv('WORLD_SIZE') is not None: # for torchrun 57 | args.rank = int(os.environ.get('RANK', args.rank)) 58 | args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) 59 | args.world_size = int(os.environ.get('WORLD_SIZE', args.world_size)) 60 | args.master_ip = str(os.environ.get('MASTER_ADDR', args.master_ip)) 61 | args.master_port = str(os.environ.get('MASTER_PORT', args.master_port)) 62 | 63 | torch.cuda.set_device(args.local_rank) 64 | 65 | tuning_steps = 5 # for warmup 66 | tuning_steps += 5 # for native 67 | tuning_steps += int(os.environ.get('TUNER_PRETRAIN_STEPS')) 68 | tuning_steps += int(os.environ.get('TUNER_TRAIN_STEPS')) 69 | # to avoid the remaing round 70 | tuning_steps += 5 
# in case 71 | 72 | if args.rank == 0: 73 | os.environ['TUNER_ROLE'] = "COORDINATOR" 74 | 75 | if args.world_size > 1: 76 | init_method = 'tcp://' 77 | init_method += args.master_ip + ':' + args.master_port 78 | torch.distributed.init_process_group(backend='nccl', 79 | rank=args.rank, 80 | world_size=args.world_size, 81 | init_method=init_method, 82 | timeout=datetime.timedelta(seconds=14400000)) 83 | 84 | for nBytes in [1024*(2**i) for i in range(19)]: 85 | profiling(args, nBytes, tuning_steps) 86 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 14 | const int tid = threadIdx.x; 15 | const int nthreads = args->nWarps*WARP_SIZE; 16 | const int bid = args->bid; 17 | const int nChannels = args->nChannels; 18 | ncclRing *ring = &ncclShmem.channel.ring; 19 | ssize_t chunkSize; 20 | if (args->native) { 21 | chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); 22 | } else { 23 | chunkSize = args->effectiveChunkSize; // effective chunksize 24 | } 25 | ssize_t minChunkSize; 26 | if (Proto::Id == NCCL_PROTO_LL) 27 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); 28 | if (Proto::Id == NCCL_PROTO_LL128) { 29 | // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 
30 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); // minChunkSizeLL128 31 | } 32 | 33 | const ssize_t loopSize = nChannels*chunkSize; 34 | const ssize_t size = args->count; 35 | const int rank = ring->userRanks[0]; 36 | const int nextRank = ring->userRanks[1]; 37 | const int root = args->root; 38 | 39 | T *inputBuf = (T*)args->sendbuff; 40 | T *outputBuf = (T*)args->recvbuff; 41 | Primitives, 0, Proto, 0> 42 | prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, 0, 0, args->transportIndex); 43 | 44 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 45 | ssize_t realChunkSize; 46 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 47 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 48 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 49 | } 50 | else if (Proto::Id == NCCL_PROTO_LL) { 51 | if (args->native) { 52 | realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; 53 | } else { 54 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize); 55 | } 56 | } 57 | else if (Proto::Id == NCCL_PROTO_LL128) { 58 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize); 59 | } 60 | realChunkSize = int(realChunkSize); 61 | 62 | ssize_t offset = gridOffset + int(bid*realChunkSize); 63 | int nelem = min(realChunkSize, size-offset); 64 | 65 | if (rank == root) { 66 | if (inputBuf == outputBuf) { 67 | prims.send(offset, nelem); 68 | } else { 69 | prims.copySend(offset, offset, nelem); 70 | } 71 | } else if (nextRank == root) { 72 | prims.recv(offset, nelem); 73 | } else { 74 | prims.recvCopySend(offset, nelem); 75 | } 76 | } 77 | } 78 | } 79 | 80 | template 81 | struct RunWorkElement { 82 | __device__ __forceinline__ void run(ncclWorkElem *args) { 83 | using Proto = ProtoSimple; 84 | runRing(args); 85 | } 86 | }; 87 | 88 | template 89 | struct RunWorkElement { 
90 | __device__ __forceinline__ void run(ncclWorkElem *args) { 91 | runRing(args); 92 | } 93 | }; 94 | 95 | template 96 | struct RunWorkElement { 97 | __device__ __forceinline__ void run(ncclWorkElem *args) { 98 | runRing(args); 99 | } 100 | }; 101 | -------------------------------------------------------------------------------- /src/graph/trees.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "nccl.h" 8 | 9 | #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank) 10 | 11 | /* Btree which alternates leaves and nodes. 12 | * Assumes root is 0, which conveniently builds a tree on powers of two, 13 | * (because we have pow2-1 ranks) which lets us manipulate bits. 14 | * Find first non-zero bit, then : 15 | * Find the parent : 16 | * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) 17 | * xx11[0] -> xx10[0] (3,7,11 below) 18 | * Find the children : 19 | * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) 20 | * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) 21 | * 22 | * Illustration : 23 | * 0---------------8 24 | * ______/ \______ 25 | * 4 12 26 | * / \ / \ 27 | * 2 6 10 \ 28 | * / \ / \ / \ \ 29 | * 1 3 5 7 9 11 13 30 | */ 31 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { 32 | int up, down0, down1; 33 | int bit; 34 | for (bit=1; bit 0 so it has to be our child 1, not 0. 42 | *d1 = nranks > 1 ? 
bit >> 1 : -1;
    return ncclSuccess;
  }

  // Parent: flip our lowest set bit and set the next one up...
  up = (rank ^ bit) | (bit << 1);
  // if smaller than the parent, we are his first child, otherwise we're his second
  if (up >= nranks) up = (rank ^ bit);
  *parentChildType = (rank < up) ? 0 : 1;
  *u = up;

  int lowbit = bit >> 1;
  // down0 is always within bounds
  down0 = lowbit == 0 ? -1 : rank-lowbit;

  down1 = lowbit == 0 ? -1 : rank+lowbit;
  // Make sure down1 is within bounds
  // (shrink the offset until rank+lowbit fits; first iteration recomputes with
  // the unchanged lowbit, matching upstream behavior).
  while (down1 >= nranks) {
    down1 = lowbit == 0 ? -1 : rank+lowbit;
    lowbit >>= 1;
  }
  *d0 = down0; *d1 = down1;

  return ncclSuccess;
}

/* Build a double binary tree. Take the previous tree for the first tree.
 * For the second tree, we use a mirror tree (if nranks is even)
 *
 * 0---------------8                   3----------------11
 *          ______/ \                 / \______
 *         4         \               /         7
 *       /   \        \             /        /   \
 *     2       6      10           1        5     9
 *    / \     / \    /  \         / \      / \   / \
 *   1   3   5   7  9   11       0   2    4   6 8   10
 *
 * or shift it by one rank (if nranks is odd).
 *
 * 0---------------8            1---------------9
 *          ______/ \______              ______/ \______
 *         4               12           5               0
 *       /   \            /           /   \            /
 *     2       6       10           3       7       11
 *    / \     / \     /  \         / \     / \     /  \
 *   1   3   5   7   9   11       2   4   6   8  10   12
 */
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
  // First tree ... use a btree
  ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
  // Second tree ... mirror or shift
  if (nranks % 2 == 1) {
    // shift: compute the btree for rank-1, then shift all results back by +1.
    int shiftrank = (rank-1+nranks) % nranks;
    int u, d0, d1;
    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
    *s1 = u == -1 ? -1 : (u+1) % nranks;
    *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
    *d1_1 = d1 == -1 ?
-1 : (d1+1) % nranks;
  } else {
    // mirror: compute the btree for the mirrored rank, then mirror results.
    int u, d0, d1;
    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
    *s1 = u == -1 ? -1 : nranks-1-u;
    *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
    *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
  }
  return ncclSuccess;
}
--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h:
--------------------------------------------------------------------------------
/*
 * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
 *
 * Licensed under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#ifndef NVTX_IMPL_GUARD_CUDA
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif


#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* Function-pointer types mirroring the CUDA resource-naming API; slots are
 * filled by an injected NVTX backend. */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);

/* Name a CUDA device (ASCII). No-op when no tools backend is attached. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
| nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; 31 | if(local!=0) 32 | (*local)(device, name); 33 | #endif /*NVTX_DISABLE*/ 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; 40 | if(local!=0) 41 | (*local)(device, name); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; 49 | if(local!=0) 50 | (*local)(context, name); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; 58 | if(local!=0) 59 | (*local)(context, name); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; 67 | if(local!=0) 68 | (*local)(stream, name); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; 76 | if(local!=0) 77 | (*local)(stream, name); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 
| 81 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) 82 | { 83 | #ifndef NVTX_DISABLE 84 | nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; 85 | if(local!=0) 86 | (*local)(event, name); 87 | #endif /*NVTX_DISABLE*/ 88 | } 89 | 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) 91 | { 92 | #ifndef NVTX_DISABLE 93 | nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; 94 | if(local!=0) 95 | (*local)(event, name); 96 | #endif /*NVTX_DISABLE*/ 97 | } 98 | 99 | #ifdef __cplusplus 100 | } /* extern "C" */ 101 | #endif /* __cplusplus */ 102 | 103 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCudaRt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | #include "driver_types.h" 13 | 14 | #ifndef NVTOOLSEXT_CUDART_V3 15 | #define NVTOOLSEXT_CUDART_V3 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | /* ========================================================================= */ 22 | /** \name Functions for CUDA Resource Naming 23 | */ 24 | /** \addtogroup RESOURCE_NAMING 25 | * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming 26 | * 27 | * This section covers the API functions that allow to annotate CUDA resources 28 | * with user-provided names. 
29 | * 30 | * @{ 31 | */ 32 | 33 | /* ------------------------------------------------------------------------- */ 34 | /* \cond SHOW_HIDDEN 35 | * \brief Used to build a non-colliding value for resource types separated class 36 | * \version \NVTX_VERSION_2 37 | */ 38 | #define NVTX_RESOURCE_CLASS_CUDART 5 39 | /** \endcond */ 40 | 41 | /* ------------------------------------------------------------------------- */ 42 | /** \brief Resource types for CUDART 43 | */ 44 | typedef enum nvtxResourceCUDARTType_t 45 | { 46 | NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ 47 | NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ 48 | NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ 49 | } nvtxResourceCUDARTType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The id of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA stream. 68 | * 69 | * Allows the user to associate a CUDA stream with a user-provided name. 70 | * 71 | * \param stream - The handle of the CUDA stream to name. 72 | * \param name - The name of the CUDA stream. 
73 | * 74 | * \version \NVTX_VERSION_1 75 | * @{ */ 76 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); 77 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); 78 | /** @} */ 79 | 80 | /* ------------------------------------------------------------------------- */ 81 | /** \brief Annotates a CUDA event. 82 | * 83 | * Allows the user to associate a CUDA event with a user-provided name. 84 | * 85 | * \param event - The handle of the CUDA event to name. 86 | * \param name - The name of the CUDA event. 87 | * 88 | * \version \NVTX_VERSION_1 89 | * @{ */ 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); 91 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); 92 | /** @} */ 93 | 94 | /** @} */ /* END RESOURCE_NAMING */ 95 | 96 | /* ========================================================================= */ 97 | #ifdef UNICODE 98 | #define nvtxNameCudaDevice nvtxNameCudaDeviceW 99 | #define nvtxNameCudaStream nvtxNameCudaStreamW 100 | #define nvtxNameCudaEvent nvtxNameCudaEventW 101 | #else 102 | #define nvtxNameCudaDevice nvtxNameCudaDeviceA 103 | #define nvtxNameCudaStream nvtxNameCudaStreamA 104 | #define nvtxNameCudaEvent nvtxNameCudaEventA 105 | #endif 106 | 107 | #ifdef __cplusplus 108 | } 109 | #endif /* __cplusplus */ 110 | 111 | #ifndef NVTX_NO_IMPL 112 | #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ 113 | #include "nvtxDetail/nvtxImplCudaRt_v3.h" 114 | #undef NVTX_IMPL_GUARD_CUDART 115 | #endif /*NVTX_NO_IMPL*/ 116 | 117 | #endif /* NVTOOLSEXT_CUDART_V3 */ 118 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef __NVTX_LINKONCE_H__ 10 | #define __NVTX_LINKONCE_H__ 11 | 12 | /* This header defines macros to permit making definitions of global variables 13 | * and functions in C/C++ header files which may be included multiple times in 14 | * a translation unit or linkage unit. It allows authoring header-only libraries 15 | * which can be used by multiple other header-only libraries (either as the same 16 | * copy or multiple copies), and does not require any build changes, such as 17 | * adding another .c file, linking a static library, or deploying a dynamic 18 | * library. Globals defined with these macros have the property that they have 19 | * the same address, pointing to a single instance, for the entire linkage unit. 20 | * It is expected but not guaranteed that each linkage unit will have a separate 21 | * instance. 22 | * 23 | * In some situations it is desirable to declare a variable without initializing 24 | * it, refer to it in code or other variables' initializers, and then initialize 25 | * it later. Similarly, functions can be prototyped, have their address taken, 26 | * and then have their body defined later. In such cases, use the FWDDECL macros 27 | * when forward-declaring LINKONCE global variables without initializers and 28 | * function prototypes, and then use the DEFINE macros when later defining them. 29 | * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, 30 | * following this pattern makes code maximally portable. 
31 | */ 32 | 33 | #if defined(__MINGW32__) /* MinGW */ 34 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 35 | #if defined(__cplusplus) 36 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 37 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK 38 | #else 39 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 40 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 41 | #endif 42 | #elif defined(_MSC_VER) /* MSVC */ 43 | #if defined(__cplusplus) 44 | #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) 45 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 46 | #else 47 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 48 | #define NVTX_LINKONCE_DEFINE_FUNCTION __inline 49 | #endif 50 | #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ 51 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 52 | #if defined(__cplusplus) 53 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 54 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK 55 | #else 56 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 57 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 58 | #endif 59 | #elif defined(__CYGWIN__) /* Assume GCC or compatible */ 60 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 61 | #if defined(__cplusplus) 62 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 63 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 64 | #else 65 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 66 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 67 | #endif 68 | #else /* All others: Assume GCC, clang, or compatible */ 69 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 70 | #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) 71 | #if defined(__cplusplus) 72 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 73 | #define 
NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline 74 | #else 75 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 76 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 77 | #endif 78 | #endif 79 | 80 | #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern 81 | #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION 82 | 83 | #endif /* __NVTX_LINKONCE_H__ */ 84 | -------------------------------------------------------------------------------- /src/include/socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_H_ 8 | #define NCCL_SOCKET_H_ 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define MAX_IFS 16 19 | #define MAX_IF_NAME_SIZE 16 20 | #define SLEEP_INT 1000 // connection retry sleep interval in usec 21 | #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) 22 | #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) 23 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) 24 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL 25 | 26 | /* Common socket address storage structure for IPv4/IPv6 */ 27 | union ncclSocketAddress { 28 | struct sockaddr sa; 29 | struct sockaddr_in sin; 30 | struct sockaddr_in6 sin6; 31 | }; 32 | 33 | enum ncclSocketState { 34 | ncclSocketStateNone = 0, 35 | ncclSocketStateInitialized = 1, 36 | ncclSocketStateAccepting = 2, 37 | ncclSocketStateAccepted = 3, 38 | ncclSocketStateConnecting = 4, 39 | ncclSocketStateConnectPolling = 5, 40 | 
ncclSocketStateConnected = 6, 41 | ncclSocketStateReady = 7, 42 | ncclSocketStateClosed = 8, 43 | ncclSocketStateError = 9, 44 | ncclSocketStateNum = 10 45 | }; 46 | 47 | enum ncclSocketType { 48 | ncclSocketTypeUnknown = 0, 49 | ncclSocketTypeBootstrap = 1, 50 | ncclSocketTypeProxy = 2, 51 | ncclSocketTypeNetSocket = 3, 52 | ncclSocketTypeNetIb = 4 53 | }; 54 | 55 | struct ncclSocket { 56 | int fd; 57 | int acceptFd; 58 | int timedOutRetries; 59 | int refusedRetries; 60 | union ncclSocketAddress addr; 61 | volatile uint32_t* abortFlag; 62 | int asyncFlag; 63 | enum ncclSocketState state; 64 | int salen; 65 | uint64_t magic; 66 | enum ncclSocketType type; 67 | }; 68 | 69 | const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); 70 | ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); 71 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); 72 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); 73 | 74 | // Initialize a socket 75 | __attribute__((visibility("default"))) ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); 76 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call 77 | ncclResult_t ncclSocketListen(struct ncclSocket* sock); 78 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); 79 | // Connect to sock->addr. sock->fd is set after a successful call. 80 | __attribute__((visibility("default"))) ncclResult_t ncclSocketConnect(struct ncclSocket* sock); 81 | // Return socket connection state. 
82 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); 83 | // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 84 | __attribute__((visibility("default"))) ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); 85 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); 86 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); 87 | 88 | #define NCCL_SOCKET_SEND 0 89 | #define NCCL_SOCKET_RECV 1 90 | 91 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 92 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 93 | __attribute__((visibility("default"))) ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); 94 | __attribute__((visibility("default"))) ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); 95 | __attribute__((visibility("default"))) ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); 96 | __attribute__((visibility("default"))) ncclResult_t ncclSocketClose(struct ncclSocket* sock); 97 | #endif 98 | -------------------------------------------------------------------------------- /makefiles/common.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | CUDA_HOME ?= /usr/local/cuda 8 | PREFIX ?= /usr/local 9 | VERBOSE ?= 0 10 | KEEP ?= 0 11 | DEBUG ?= 0 12 | TRACE ?= 0 13 | PROFAPI ?= 1 14 | NVTX ?= 1 15 | RDMA_CORE ?= 0 16 | TUNER_MAXCHANNELS ?= 128 17 | 18 | NVCC = $(CUDA_HOME)/bin/nvcc 19 | 20 | CUDA_LIB ?= $(CUDA_HOME)/lib64 21 | CUDA_INC ?= $(CUDA_HOME)/include 22 | CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) 23 | #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) 24 | CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) 25 | CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) 26 | #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) 27 | 28 | # You should define NVCC_GENCODE in your environment to the minimal set 29 | # of archs to reduce compile time. 30 | CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ 31 | -gencode=arch=compute_60,code=sm_60 \ 32 | -gencode=arch=compute_61,code=sm_61 33 | ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) 34 | # SM35 is deprecated from CUDA12.0 onwards 35 | CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 36 | endif 37 | CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 38 | CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 39 | CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 40 | 41 | CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 42 | CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 43 | CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 44 | CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 45 | 46 | 47 | ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) 48 | # Include Hopper support if we're using CUDA11.8 or above 49 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) 50 | else ifeq ($(shell test 
"0$(CUDA_MAJOR)" -ge 11; echo $$?),0) 51 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) 52 | # Include Volta support if we're using CUDA9 or above 53 | else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) 54 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) 55 | else 56 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) 57 | endif 58 | $(info NVCC_GENCODE is ${NVCC_GENCODE}) 59 | 60 | CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ 61 | -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ 62 | -I $(CUDA_INC) $(NPKIT_FLAGS) \ 63 | $(CXXFLAGS) 64 | # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) 65 | # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 66 | # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 67 | NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all --resource-usage $(NPKIT_FLAGS) 68 | # Use addprefix so that we can specify more than one path 69 | NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt 70 | 71 | ######### 72 | CXXFLAGS += -DTUNER_MAXCHANNELS=${TUNER_MAXCHANNELS} 73 | NVCUFLAGS += -DTUNER_MAXCHANNELS=${TUNER_MAXCHANNELS} 74 | ########## ########## 75 | 76 | ########## GCOV ########## 77 | GCOV ?= 0 # disable by default. 
GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # coverage is enabled only when both GCOV=1 and DEBUG=1 (a 0 in either disables it)
AllowShortLoopsOnASingleLine: false 22 | AlwaysBreakAfterDefinitionReturnType: None 23 | AlwaysBreakAfterReturnType: None 24 | AlwaysBreakBeforeMultilineStrings: false 25 | AlwaysBreakTemplateDeclarations: MultiLine 26 | BinPackArguments: true 27 | BinPackParameters: true 28 | BraceWrapping: 29 | AfterCaseLabel: false 30 | AfterClass: false 31 | AfterControlStatement: false 32 | AfterEnum: false 33 | AfterFunction: false 34 | AfterNamespace: false 35 | AfterObjCDeclaration: false 36 | AfterStruct: false 37 | AfterUnion: false 38 | AfterExternBlock: false 39 | BeforeCatch: false 40 | BeforeElse: false 41 | IndentBraces: false 42 | SplitEmptyFunction: true 43 | SplitEmptyRecord: true 44 | SplitEmptyNamespace: true 45 | BreakBeforeBinaryOperators: None 46 | BreakBeforeBraces: Attach 47 | BreakBeforeInheritanceComma: false 48 | BreakInheritanceList: BeforeColon 49 | BreakBeforeTernaryOperators: true 50 | BreakConstructorInitializersBeforeComma: false 51 | BreakConstructorInitializers: BeforeColon 52 | BreakAfterJavaFieldAnnotations: false 53 | BreakStringLiterals: true 54 | ColumnLimit: 80 55 | CommentPragmas: '^ IWYU pragma:' 56 | CompactNamespaces: false 57 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 58 | ConstructorInitializerIndentWidth: 4 59 | ContinuationIndentWidth: 4 60 | Cpp11BracedListStyle: true 61 | DeriveLineEnding: true 62 | DerivePointerAlignment: false 63 | DisableFormat: false 64 | ExperimentalAutoDetectBinPacking: false 65 | FixNamespaceComments: true 66 | ForEachMacros: 67 | - foreach 68 | - Q_FOREACH 69 | - BOOST_FOREACH 70 | IncludeBlocks: Preserve 71 | IncludeCategories: 72 | - Regex: '^<.*\.h>' 73 | Priority: 0 74 | SortPriority: 0 75 | CaseSensitive: false 76 | - Regex: '^<.*' 77 | Priority: 1 78 | SortPriority: 0 79 | CaseSensitive: false 80 | - Regex: '.*' 81 | Priority: 2 82 | SortPriority: 0 83 | CaseSensitive: false 84 | IncludeIsMainRegex: '(Test)?$' 85 | IncludeIsMainSourceRegex: '' 86 | IndentAccessModifiers: true 87 | 
IndentCaseLabels: false 88 | IndentGotoLabels: true 89 | IndentPPDirectives: None 90 | IndentWidth: 2 91 | IndentWrappedFunctionNames: false 92 | JavaScriptQuotes: Leave 93 | JavaScriptWrapImports: true 94 | KeepEmptyLinesAtTheStartOfBlocks: true 95 | MacroBlockBegin: '' 96 | MacroBlockEnd: '' 97 | MaxEmptyLinesToKeep: 1 98 | NamespaceIndentation: None 99 | ObjCBinPackProtocolList: Auto 100 | ObjCBlockIndentWidth: 2 101 | ObjCSpaceAfterProperty: false 102 | ObjCSpaceBeforeProtocolList: true 103 | PenaltyBreakAssignment: 2 104 | PenaltyBreakBeforeFirstCallParameter: 19 105 | PenaltyBreakComment: 300 106 | PenaltyBreakFirstLessLess: 120 107 | PenaltyBreakString: 1000 108 | PenaltyBreakTemplateDeclaration: 10 109 | PenaltyExcessCharacter: 1000000 110 | PenaltyReturnTypeOnItsOwnLine: 60 111 | PointerAlignment: Right 112 | ReflowComments: true 113 | SortIncludes: true 114 | SortUsingDeclarations: true 115 | SpaceAfterCStyleCast: false 116 | SpaceAfterLogicalNot: false 117 | SpaceAfterTemplateKeyword: true 118 | SpaceBeforeAssignmentOperators: true 119 | SpaceBeforeCpp11BracedList: false 120 | SpaceBeforeCtorInitializerColon: true 121 | SpaceBeforeInheritanceColon: true 122 | SpaceBeforeParens: ControlStatements 123 | SpaceBeforeRangeBasedForLoopColon: true 124 | SpaceInEmptyBlock: false 125 | SpaceInEmptyParentheses: false 126 | SpacesBeforeTrailingComments: 2 127 | SpacesInAngles: false 128 | SpacesInConditionalStatement: false 129 | SpacesInContainerLiterals: true 130 | SpacesInCStyleCastParentheses: false 131 | SpacesInParentheses: false 132 | SpacesInSquareBrackets: false 133 | SpaceBeforeSquareBrackets: false 134 | Standard: Latest 135 | StatementMacros: 136 | - Q_UNUSED 137 | - QT_REQUIRE_VERSION 138 | TabWidth: 8 139 | UseCRLF: false 140 | UseTab: Never 141 | ... 
142 | 143 | -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INFO_H_ 8 | #define NCCL_INFO_H_ 9 | 10 | #include "nccl.h" 11 | #include "devcomm.h" 12 | #include "collectives.h" 13 | #include "core.h" 14 | #include "utils.h" 15 | #include "strongstream.h" 16 | 17 | typedef enum : uint8_t { 18 | ncclPatternRing, 19 | ncclPatternRingTwice, 20 | ncclPatternPipelineFrom, 21 | ncclPatternPipelineTo, 22 | ncclPatternTreeUp, 23 | ncclPatternTreeDown, 24 | ncclPatternTreeUpDown, 25 | ncclPatternCollnetChain, 26 | ncclPatternCollnetDirect, 27 | ncclPatternNvls, 28 | ncclPatternNvlsTree, 29 | ncclPatternSend, 30 | ncclPatternRecv 31 | } ncclPattern_t; 32 | 33 | // Used to pass NCCL call information between functions 34 | struct ncclInfo { 35 | ncclFunc_t coll; 36 | const char* opName; 37 | // NCCL Coll Args 38 | const void* sendbuff; 39 | void* recvbuff; 40 | size_t count; 41 | ncclDataType_t datatype; 42 | ncclRedOp_t op; 43 | int root; // peer for p2p operations 44 | ncclComm_t comm; 45 | cudaStream_t stream; 46 | // Algorithm details 47 | int chunkSteps; 48 | int sliceSteps; 49 | // Computed later 50 | ncclDevRedOpFull opFull; 51 | int algorithm; 52 | int protocol; 53 | int isCopyEngineNotSmCopy; 54 | int p2pLevel; 55 | ncclPattern_t pattern; 56 | int nChannels; 57 | int nThreads; 58 | size_t nBytes; 59 | int nstepsPerLoop; 60 | int nchunksPerLoop; 61 | int chunkSize; 62 | int channelId; 63 | }; 64 | 65 | inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { 66 | info->nBytes = info->count * ncclTypeSize(info->datatype); 67 
| if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { 68 | info->count = info->nBytes; 69 | info->datatype = ncclInt8; 70 | } 71 | if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank 72 | return ncclSuccess; 73 | } 74 | 75 | struct ncclTaskColl { 76 | struct ncclTaskColl* next; 77 | ncclFunc_t func; 78 | void const* sendbuff; 79 | void* recvbuff; 80 | size_t count; 81 | int root; 82 | ncclDataType_t datatype; 83 | ncclDevRedOpFull op; 84 | int chunkSteps, sliceSteps; 85 | }; 86 | struct ncclTaskP2p { 87 | ncclTaskP2p *next; 88 | void *buff; 89 | size_t bytes; 90 | // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track 91 | // of where it left off. 92 | int chunk; 93 | int peer; 94 | }; 95 | 96 | struct ncclCudaStreamList { 97 | struct ncclCudaStreamList *next; 98 | cudaStream_t stream; 99 | }; 100 | struct ncclTasks { 101 | struct Peer { 102 | bool sendSeen, recvSeen; 103 | struct ncclIntruQueue sendQueue; 104 | struct ncclIntruQueue recvQueue; 105 | }; 106 | struct Backup { 107 | // backup for Tuner 108 | struct ncclIntruQueue collQueue; 109 | size_t collBytesTotal; 110 | struct Peer* peers/*[nRanks]*/; 111 | int nTasksColl, nTasksP2p; 112 | }; 113 | Backup backup; 114 | // for 1 plan: workload, candidate 115 | struct Workload { 116 | uint64_t commHash; 117 | ncclFunc_t collType; 118 | size_t nBytes; 119 | }; 120 | Workload workload; 121 | struct Candidate { 122 | int algorithm; 123 | int protocol; 124 | int isCopyEngineNotSmCopy; 125 | int p2pLevel; 126 | int nChannels; 127 | int nThreads; 128 | int wireChunksize; 129 | int iteration; 130 | int lastIterEffectiveChunksize; 131 | int native; 132 | bool initialized = false; 133 | int nThreadsTotal; 134 | }; 135 | Candidate candidate; // 0 for native; 1 for tuner 136 | 137 | struct ncclIntruQueue collQueue; 138 | size_t collBytesTotal; 139 | struct Peer* peers/*[nRanks]*/; 140 | int *p2pSendOrder, 
*p2pRecvOrder; 141 | int p2pOrderSteps; 142 | int nTasksColl, nTasksP2p; 143 | 144 | // The list of user streams aggregated over all tasks present. 145 | struct ncclCudaStreamList* streams; 146 | // The most recent user stream. Ignored if streams==nullptr 147 | cudaStream_t streamRecent; 148 | // The graph capturing all user streams or invalid if none. Thus we restrict the 149 | // user that all streams must be captured in the same graph or not captured 150 | // at all. Technically we could probably relax this, but that would mean 151 | // collecting a different `ncclTasks` per graph and one for non-graph. 152 | struct ncclCudaGraph capturingGraph; 153 | }; 154 | 155 | #endif 156 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 14 | const int tid = threadIdx.x; 15 | const int nthreads = args->nWarps*WARP_SIZE; 16 | const int bid = args->bid; 17 | const int nChannels = args->nChannels; 18 | ncclRing *ring = &ncclShmem.channel.ring; 19 | ssize_t chunkSize; 20 | if (args->native) { 21 | chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
REDUCE_CHUNKSTEPS : 1)); 22 | } else { 23 | chunkSize = args->effectiveChunkSize; 24 | } 25 | ssize_t minChunkSize; 26 | if (Proto::Id == NCCL_PROTO_LL) 27 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); 28 | if (Proto::Id == NCCL_PROTO_LL128) { 29 | // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 30 | minChunkSize = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); // minChunkSizeLL128 31 | } 32 | 33 | const int nranks = ncclShmem.comm.nRanks; 34 | const ssize_t loopSize = nChannels*chunkSize; 35 | const ssize_t size = args->count; 36 | const int rank = ncclShmem.comm.rank; 37 | const int prevRank = ring->userRanks[nranks-1]; 38 | const int root = args->root; 39 | 40 | Primitives, 0, Proto, 0> 41 | prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, 0, 0, args->transportIndex); 42 | 43 | auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { 44 | int realChunkSize; 45 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 46 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 47 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 48 | } else if (Proto::Id == NCCL_PROTO_LL) { 49 | if (args->native) { 50 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 51 | } else { 52 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize); 53 | } 54 | } else if (Proto::Id == NCCL_PROTO_LL128) { 55 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize); 56 | } 57 | return realChunkSize; 58 | }; 59 | 60 | if (prevRank == root) { 61 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 62 | int realChunkSize; 63 | realChunkSize = calcChunkSize(gridOffset); 64 | ssize_t offset = gridOffset + bid*realChunkSize; 65 | int nelem = min(realChunkSize, size-offset); 66 | prims.send(offset, nelem); 67 | } 68 | } 69 | else if (rank == root) { 70 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 71 | int realChunkSize; 72 | realChunkSize = calcChunkSize(gridOffset); 73 | ssize_t offset = gridOffset + bid*realChunkSize; 74 | int nelem = min(realChunkSize, size-offset); 75 | prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); 76 | } 77 | } 78 | else { 79 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 80 | int realChunkSize; 81 | realChunkSize = calcChunkSize(gridOffset); 82 | ssize_t offset = gridOffset + bid*realChunkSize; 83 | int nelem = min(realChunkSize, size-offset); 84 | prims.recvReduceSend(offset, nelem); 85 | } 86 | } 87 | } 88 | } 89 | 90 | template 91 | struct RunWorkElement { 92 | __device__ __forceinline__ void run(ncclWorkElem *args) { 93 | using Proto = ProtoSimple; 94 | runRing(args); 95 | } 96 | }; 97 | 98 | template 99 | struct RunWorkElement { 100 | __device__ __forceinline__ void run(ncclWorkElem *args) { 101 | runRing(args); 102 | } 103 | }; 104 | 105 | template 106 | struct RunWorkElement { 107 | __device__ __forceinline__ void run(ncclWorkElem *args) { 108 | runRing(args); 109 | } 110 | }; 111 | --------------------------------------------------------------------------------