├── pkg ├── debian │ ├── compat │ ├── copyright │ ├── source │ │ └── format │ ├── .gitignore │ ├── libnccl2.install.in │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── changelog.in │ ├── rules │ ├── control.in │ └── Makefile ├── Makefile ├── txz │ ├── create_txz.sh.in │ └── Makefile ├── srctxz │ ├── create_srctxz.sh.in │ └── Makefile └── redhat │ ├── Makefile │ └── nccl.spec.in ├── ext-tuner └── example │ ├── src │ ├── include │ │ ├── optimizers │ │ │ ├── co_optimizer.h │ │ │ ├── het_optimizer.h │ │ │ ├── unified_optimizer.h │ │ │ └── optimizer.h │ │ ├── timer.h │ │ ├── jobs │ │ │ ├── bruteforce_job.h │ │ │ └── native_job.h │ │ └── internal │ │ │ ├── threadsafe_queue.h │ │ │ ├── env.h │ │ │ └── database.h │ └── cuda │ │ ├── collectives │ │ └── util.h │ │ ├── nccl_params.h │ │ └── nccl_socket.h │ ├── example │ └── cuda │ │ └── pytorch │ │ ├── whitelistrules.txt │ │ ├── whitelistcases.txt │ │ ├── run.sh │ │ └── demo.py │ ├── test │ └── cuda │ │ ├── test_loader.cc │ │ └── tuner_loader.h │ ├── README.md │ ├── plugin │ └── cuda │ │ ├── wrapper.cc │ │ ├── wrapper.py │ │ └── plugin.h │ ├── .clang-tidy │ ├── utils │ └── get_candidates.py │ └── .clang-format ├── .gitignore ├── makefiles ├── version.mk ├── formatting.mk └── common.mk ├── .gitmodules ├── src ├── nccl.pc.in ├── graph │ ├── rings.h │ ├── rings.cc │ └── trees.cc ├── collectives │ ├── device │ │ ├── reduce.cu │ │ ├── sendrecv.cu │ │ ├── all_gather.cu │ │ ├── all_reduce.cu │ │ ├── broadcast.cu │ │ ├── reduce_scatter.cu │ │ ├── gen_rules.sh │ │ ├── onerank_reduce.cu │ │ ├── Makefile │ │ ├── broadcast.h │ │ └── reduce.h │ ├── all_gather.cc │ ├── all_reduce.cc │ ├── reduce_scatter.cc │ ├── reduce.cc │ ├── broadcast.cc │ └── sendrecv.cc ├── include │ ├── npkit │ │ ├── npkit_struct.h │ │ └── npkit.h │ ├── argcheck.h │ ├── trees.h │ ├── shm.h │ ├── net.h │ ├── tuner.h │ ├── p2p.h │ ├── profiler.h │ ├── param.h │ ├── ipcsocket.h │ ├── enqueue.h │ ├── align.h │ ├── nvtx3 │ │ ├── nvtxExtDetail │ │ │ ├── nvtxExtTypes.h │ │ 
│ ├── nvtxExtImpl.h │ │ │ └── nvtxExtImplPayload_v1.h │ │ ├── nvtxDetail │ │ │ ├── nvtxImplCudaRt_v3.h │ │ │ ├── nvtxImplSync_v3.h │ │ │ ├── nvtxImplCuda_v3.h │ │ │ └── nvtxLinkOnce.h │ │ └── nvToolsExtCudaRt.h │ ├── debug.h │ ├── cpuset.h │ ├── bootstrap.h │ ├── timer.h │ ├── core.h │ ├── channel.h │ ├── ibvsymbols.h │ ├── coll_net.h │ ├── nvtx.h │ ├── nccl_tuner.h │ ├── socket.h │ └── info.h ├── init_nvtx.cc ├── enhcompat.cc └── misc │ ├── param.cc │ ├── argcheck.cc │ └── tuner.cc ├── ext-net ├── example │ ├── Makefile │ └── nccl │ │ ├── err.h │ │ ├── types.h │ │ ├── net.h │ │ ├── net_v3.h │ │ ├── net_v2.h │ │ ├── net_v5.h │ │ ├── net_v4.h │ │ └── net_v6.h └── google-fastsocket │ └── Makefile ├── Makefile ├── LICENSE.txt └── README.md /pkg/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | ../../LICENSE.txt -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/co_optimizer.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/het_optimizer.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/unified_optimizer.h: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /ext-tuner/example/example/cuda/pytorch/whitelistrules.txt: -------------------------------------------------------------------------------- 1 | ;;;0 16; -------------------------------------------------------------------------------- /ext-tuner/example/example/cuda/pytorch/whitelistcases.txt: -------------------------------------------------------------------------------- 1 | -1;-1 4 1 ; 2 | -------------------------------------------------------------------------------- /pkg/debian/.gitignore: -------------------------------------------------------------------------------- 1 | /*.debhelper.log 2 | /*.debhelper 3 | /*.substvars 4 | /tmp/ 5 | /files 6 | /libnccl1/ 7 | /libnccl-dev/ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 
2 | /build 3 | *.gcov 4 | /coverage/ 5 | build 6 | __pycache__ 7 | -------------------------------------------------------------------------------- /makefiles/version.mk: -------------------------------------------------------------------------------- 1 | ##### version 2 | NCCL_MAJOR := 2 3 | NCCL_MINOR := 18 4 | NCCL_PATCH := 3 5 | NCCL_SUFFIX := 6 | PKG_REVISION := 1 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext-tuner/example/3rdparty/cereal"] 2 | path = ext-tuner/example/3rdparty/cereal 3 | url = https://github.com/USCiLab/cereal 4 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | include/nccl.h /usr/include 2 | include/nccl_net.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | -------------------------------------------------------------------------------- /pkg/debian/changelog.in: -------------------------------------------------------------------------------- 1 | nccl 
(${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium 2 | 3 | * Automatic Debian package from build 4 | 5 | -- cudatools ${pkg:Timestamp} 6 | -------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /src/graph/rings.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next); 8 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(Reduce); 12 | -------------------------------------------------------------------------------- /ext-tuner/example/test/cuda/test_loader.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "plugin/cuda/plugin.h" 5 | #include "test/cuda/tuner_loader.h" 6 | 7 | void testLoader() { 8 | ncclTuner_t *tuner = nullptr; 9 | ncclLoadTunerPlugin(&tuner); 10 | ncclCloseTunerPlugin(&tuner); 11 | } 12 | 13 | int main() { 14 | testLoader(); 15 | printf("test_loader pass\n"); 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /src/collectives/device/sendrecv.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "sendrecv.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_P(SendRecv); 12 | -------------------------------------------------------------------------------- /src/collectives/device/all_gather.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "all_gather.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_C(AllGather); 12 | -------------------------------------------------------------------------------- /src/collectives/device/all_reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "all_reduce.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(AllReduce); 12 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "broadcast.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_C(Broadcast); 12 | -------------------------------------------------------------------------------- /src/collectives/device/reduce_scatter.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "reduce_scatter.h" 8 | #include "common.h" 9 | #include "collectives.h" 10 | 11 | IMPL_COLL_R(ReduceScatter); 12 | -------------------------------------------------------------------------------- /ext-net/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_struct.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_STRUCT_H_ 2 | #define NPKIT_STRUCT_H_ 3 | 4 | #include 5 | 6 | #pragma pack(push, 1) 7 | 8 | union NpKitEvent { 9 | uint64_t bits[2]; 10 | struct { 11 | uint64_t type : 8; 12 | uint64_t size : 32; 13 | uint64_t rsvd : 24; 14 | uint64_t timestamp; 15 | } fields; 16 | }; 17 | 18 | struct NpKitEventCollectContext { 19 | NpKitEvent* event_buffer; 20 | uint64_t event_buffer_head; 21 | }; 22 | 23 | #pragma pack(pop) 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /ext-net/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclRemoteError = 6 } ncclResult_t; 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/include/argcheck.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ARGCHECK_H_ 8 | #define NCCL_ARGCHECK_H_ 9 | 10 | #include "core.h" 11 | #include "info.h" 12 | 13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); 14 | ncclResult_t ArgsCheck(struct ncclInfo* info); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "src/include/datatype.h" 4 | 5 | class Timer { 6 | public: 7 | virtual void begin(const RecordKey &recordKey, bool blocking, 8 | void *context) = 0; 9 | virtual void end(GIDTYPE groupID, bool blocking) = 0; 10 | virtual void start() = 0; 11 | virtual void stop() = 0; 12 | virtual void tryGetRecords(std::vector *records, bool blocking) = 0; 13 | virtual void setProfiling(const Workload &workload, 14 | int32_t askedProfiling) = 0; 15 | }; 16 | -------------------------------------------------------------------------------- /ext-net/google-fastsocket/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME?=/usr/local/cuda 2 | INC:=-I$(CUDA_HOME)/include 3 | PLUGIN_SO:=libnccl-net.so 4 | 5 | default: $(PLUGIN_SO) 6 | 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc 8 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 9 | 10 | nccl-fastsocket/*.cc: 11 | git clone https://github.com/google/nccl-fastsocket.git 12 | 13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) 14 | 15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) 16 | @printf "Grabbing %-35s > %s\n" $< $@ 17 | mkdir -p $(BUILDDIR)/lib 18 | install -m 644 $< $@ 19 | 20 | clean: 21 | rm -f $(PLUGIN_SO) 22 | rm -Rf nccl-fastsocket 23 | -------------------------------------------------------------------------------- /src/include/trees.h: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TREES_H_ 8 | #define NCCL_TREES_H_ 9 | 10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); 11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /pkg/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : build 9 | build : debian.build txz.build 10 | 11 | BUILDDIR ?= $(abspath ../build) 12 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 13 | TARGETS := debian txz 14 | all: ${TARGETS:%=%.build} 15 | prep: ${TARGETS:%=%.prep} 16 | build: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.prep: 20 | ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} 21 | 22 | %.build: 23 | ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} 24 | 25 | %.clean: 26 | ${MAKE} -C $* clean 27 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHM_H_ 8 | #define NCCL_SHM_H_ 9 | 10 | #include "nccl.h" 11 | 12 | typedef void* ncclShmHandle_t; 13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); 14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle); 15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | } ncclDataType_t; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 
13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /ext-tuner/example/example/cuda/pytorch/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | THIS_PATH=$(readlink -f "$0") 4 | THIS_DIR=$(dirname "$THIS_PATH") 5 | NCCL_HOME="$THIS_DIR/../../../../../build" 6 | TUNER_HOME="$THIS_DIR/../../../build" 7 | 8 | export TUNER_MAXCHANNELS=32 9 | export TUNER_P2P_NCHANNELS=2 10 | export 
TUNER_WHITELIST_CASES_FILE="./whitelistcases.txt" 11 | export TUNER_WHITELIST_RULES_FILE="./whitelistrules.txt" 12 | export NCCL_TIMEOUT=3600 13 | export TUNER_PRETRAIN_STEPS=360 14 | export TUNER_TRAIN_STEPS=240 15 | export TUNER_PROFILE_REPEAT=5 16 | export TUNER_COORDINATOR=localhost:12449 17 | export TUNER_WORLDSIZE=8 18 | export NCCL_TUNER_PLUGIN=${TUNER_HOME}/libnccl-plugin.so 19 | export LD_PRELOAD=${NCCL_HOME}/lib/libnccl.so:$LD_PRELOAD 20 | export LD_LIBRARY_PATH=${NCCL_HOME}/lib:${TUNER_HOME}:$LD_LIBRARY_PATH 21 | 22 | torchrun --nproc_per_node 8 --nnodes 1 --node_rank 0 demo.py 23 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "comm.h" 13 | #include "checks.h" 14 | 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetPluginInit(); 18 | ncclResult_t ncclNetInit(struct ncclComm* comm); 19 | int ncclNetVersion(struct ncclComm* comm); 20 | 21 | // Test whether the current GPU support GPU Direct RDMA. 22 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); 23 | 24 | extern ncclNet_t ncclNetIb; 25 | extern ncclNet_t ncclNetSocket; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /src/include/tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INT_TUNER_H_ 9 | #define NCCL_INT_TUNER_H_ 10 | 11 | #include "nccl_tuner.h" 12 | 13 | // Tuning plugin to override NCCL's default algorithm/protocol tuning. 14 | 15 | // Attempts to load NCCL tuner from environmental variable. 16 | // Returns ncclSuccess if the correct tuner symbol has been found and 17 | // successully loaded. Otherwise returns an error and also logs the error. 18 | ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); 19 | 20 | // Cleans up NCCL tuner plugin. 21 | ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); 22 | #endif -------------------------------------------------------------------------------- /src/init_nvtx.cc: -------------------------------------------------------------------------------- 1 | #include "nccl.h" 2 | #include "nvtx.h" 3 | 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { 5 | {"Sum", ncclSum}, 6 | {"Product", ncclProd}, 7 | {"Max", ncclMax}, 8 | {"Min", ncclMin}, 9 | {"Avg", ncclAvg} 10 | }; 11 | 12 | // Must be called before the first call to any reduction operation. 
13 | void initNvtxRegisteredEnums() { 14 | // Register schemas and strings 15 | constexpr const nvtxPayloadEnumAttr_t eAttr { 16 | .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | 17 | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, 18 | .name = NULL, 19 | .entries = NvtxEnumRedSchema, 20 | .numEntries = std::extent::value, 21 | .sizeOfEnum = sizeof(ncclRedOp_t), 22 | .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP 23 | }; 24 | 25 | nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); 26 | } 27 | -------------------------------------------------------------------------------- /ext-tuner/example/test/cuda/tuner_loader.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INT_TUNER_H_ 9 | #define NCCL_INT_TUNER_H_ 10 | 11 | #include "plugin/cuda/plugin.h" 12 | 13 | // Tuning plugin to override NCCL's default algorithm/protocol tuning. 14 | 15 | // Attempts to load NCCL tuner from environmental variable. 16 | // Returns ncclSuccess if the correct tuner symbol has been found and 17 | // successully loaded. Otherwise returns an error and also logs the error. 18 | ncclResult_t ncclLoadTunerPlugin(ncclTuner_t **tuner); 19 | 20 | // Cleans up NCCL tuner plugin. 21 | ncclResult_t ncclCloseTunerPlugin(ncclTuner_t **tuner); 22 | #endif 23 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. 11 | NCCLDIR=`basename $PWD` 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommited files ..." 17 | if [ "`git status -s | wc -l`" != "0" ]; then 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read 21 | fi 22 | 23 | cd .. 24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR 36 | -------------------------------------------------------------------------------- /ext-tuner/example/src/cuda/collectives/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/cuda/nccl_params.h" 6 | 7 | int effective2Wire(int effectiveChunksize, int proto) { 8 | int wireChunksize = -1; 9 | if (proto == NCCL_PROTO_LL) 10 | wireChunksize = static_cast(effectiveChunksize * 2); 11 | else if (proto == NCCL_PROTO_LL128) 12 | wireChunksize = static_cast(effectiveChunksize * 16 / 15); 13 | else if (proto == NCCL_PROTO_SIMPLE) 14 | wireChunksize = effectiveChunksize; 15 | return wireChunksize; 16 | } 17 | int wire2Effective(int wireChunksize, int proto) { 18 | int effectiveChunksize = -1; 19 | if (proto == NCCL_PROTO_LL) 20 | effectiveChunksize = static_cast(wireChunksize / 2); 21 | else if (proto == NCCL_PROTO_LL128) 22 | effectiveChunksize = static_cast(wireChunksize * 15 / 16); 23 | else if (proto == NCCL_PROTO_SIMPLE) 24 | effectiveChunksize = wireChunksize; 25 | return 
effectiveChunksize; 26 | } 27 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | #define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR 13 | 14 | typedef struct { 15 | int data; // Currently only support an fd based descriptor 16 | } ncclCuDesc; 17 | 18 | typedef union { 19 | // Legacy CUDA IPC 20 | cudaIpcMemHandle_t devIpc; 21 | // cuMem API support 22 | ncclCuDesc cuDesc; 23 | } ncclIpcDesc; 24 | 25 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); 26 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); 27 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | #include "proxy.h" 11 | 12 | enum ncclProxyProfileState { 13 | ncclProxyProfileBegin = 0, 14 | 15 | ncclProxyProfileSendGPUWait = 1, 16 | ncclProxyProfileSendWait = 2, 17 | 18 | ncclProxyProfileRecvWait = 1, 19 | ncclProxyProfileRecvFlushWait = 2, 20 | ncclProxyProfileRecvGPUWait = 3, 21 | 22 | ncclProxyProfileEnd = 4, 23 | 24 | ncclProxyProfileSleep = 8, 25 | ncclProxyProfileWakeup = 9, 26 | 27 | ncclProxyProfileIdle = 16, 28 | ncclProxyProfileActive = 17, 29 | 30 | ncclProxyProfileAppend = 24, 31 | ncclProxyProfileAppendEnd = 25 32 | }; 33 | 34 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); 35 | void ncclProfilingDump(); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
/*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_PARAM_H_
#define NCCL_PARAM_H_

#include <stdint.h>  // NOTE(review): include target lost in extraction; int64_t/INT64_MIN come from <stdint.h> -- confirm against upstream.

// Returns the current user's home directory path.
const char* userHomeDir();
// Apply environment settings read from `fileName`.
void setEnvFile(const char* fileName);
// One-time environment initialization.
void initEnv();

// Resolve parameter `env` into `*cache`: reads the environment, falling back
// to `deftVal`; `uninitialized` is the sentinel marking "not yet loaded".
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);

// NCCL_PARAM(name, env, deftVal) defines `int64_t ncclParam<name>()`:
// a lazily-initialized, cached read of the "NCCL_<env>" environment variable
// with default `deftVal`. The fast path is a single relaxed atomic load of
// the cache; first use may race, and ncclLoadParam is expected to make
// concurrent initialization safe (see its definition).
#define NCCL_PARAM(name, env, deftVal) \
  int64_t ncclParam##name() { \
    constexpr int64_t uninitialized = INT64_MIN; \
    static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
    static int64_t cache = uninitialized; \
    if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
      ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \
    } \
    return cache; \
  }

#endif
3 | */ 4 | 5 | #ifndef NCCL_NET_H_ 6 | #define NCCL_NET_H_ 7 | 8 | #include 9 | #include 10 | 11 | #include "err.h" 12 | 13 | #define NCCL_NET_HANDLE_MAXSIZE 128 14 | 15 | #define NCCL_PTR_HOST 0x1 16 | #define NCCL_PTR_CUDA 0x2 17 | #define NCCL_PTR_DMABUF 0x4 18 | 19 | // Maximum number of requests per comm object 20 | #define NCCL_NET_MAX_REQUESTS 8 21 | 22 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 23 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; 24 | 25 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 26 | 27 | #include "net_v6.h" 28 | #include "net_v5.h" 29 | #include "net_v4.h" 30 | #include "net_v3.h" 31 | #include "net_v2.h" 32 | 33 | #endif // end include guard 34 | -------------------------------------------------------------------------------- /src/include/ipcsocket.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See COPYRIGHT for license information 5 | */ 6 | 7 | #ifndef NCCL_IPCSOCKET_H 8 | #define NCCL_IPCSOCKET_H 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define NCCL_IPC_SOCKNAME_LEN 64 24 | 25 | struct ncclIpcSocket { 26 | int fd; 27 | char socketName[NCCL_IPC_SOCKNAME_LEN]; 28 | volatile uint32_t* abortFlag; 29 | }; 30 | 31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); 32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); 33 | 34 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); 35 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); 36 | 37 | #endif /* NCCL_IPCSOCKET_H */ 38 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /src/collectives/all_gather.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, 11 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, 13 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { 14 | // Just pass the size of one message and not the total bytes sent/received. 15 | constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { 16 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} 17 | }; 18 | size_t msgsize = sendcount * ncclTypeSize(datatype); 19 | NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) 20 | 21 | struct ncclInfo info = { ncclFuncAllGather, "AllGather", 22 | sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ 23 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; 24 | return ncclEnqueueCheck(&info); 25 | } 26 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 
10 | 11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none 12 | ASTYLEDIR := $(BUILDDIR)/contrib 13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz 14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle 15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ 16 | ASTYLEVER := 3.1 17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" 18 | 19 | $(ASTYLEDIR) : 20 | @mkdir -p $(ASTYLEDIR) 21 | 22 | $(ASTYLETAR) : $(ASTYLEDIR) 23 | @wget -q -O $(ASTYLETAR) $(ASTYLEURL) 24 | 25 | $(ASTYLEBLD) : $(ASTYLETAR) 26 | @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) 27 | 28 | $(ASTYLEBIN) : $(ASTYLEBLD) 29 | ${MAKE} -C $(ASTYLEBLD) 30 | 31 | .PHONY : format 32 | format : $(ASTYLEBIN) 33 | @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) 34 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | #include "utils.h" 14 | 15 | #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) 16 | #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ 17 | 18 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); 19 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 20 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); 21 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 22 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); 23 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 24 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm); 25 | ncclResult_t chooseTransport(struct ncclComm* comm, int channelId, int peer, uint8_t isCopyEngineNotSmCopy, uint8_t p2pLevel, uint8_t* transportIndex); 26 | 27 | #endif // End include guard 28 | -------------------------------------------------------------------------------- /pkg/debian/control.in: -------------------------------------------------------------------------------- 1 | Source: nccl 2 | Section: libs 3 | Maintainer: cudatools 4 | Priority: optional 5 | Build-depends: debhelper(>=9) 6 | Standards-Version: 3.9.5 7 | 8 | Package: libnccl${nccl:Major} 9 | Section: libs 10 | Architecture: ${pkg:Arch} 11 | Depends: ${misc:Depends}, ${shlibs:Depends} 12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | broadcast, and reduce-scatter. 
16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | Package: libnccl-dev 21 | Section: libdevel 22 | Architecture: ${pkg:Arch} 23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) 24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files 25 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 26 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 27 | broadcast, and reduce-scatter. 28 | It has been optimized to achieve high bandwidth on any platform using PCIe, 29 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 30 | sockets. 31 | -------------------------------------------------------------------------------- /src/include/align.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALIGN_H_ 8 | #define NCCL_ALIGN_H_ 9 | 10 | #define DIVUP(x, y) \ 11 | (((x)+(y)-1)/(y)) 12 | 13 | #define ROUNDUP(x, y) \ 14 | (DIVUP((x), (y))*(y)) 15 | 16 | #define ALIGN_POWER(x, y) \ 17 | ((x) > (y) ? 
// NOTE(review): the extraction that produced this dump stripped all
// angle-bracket text in this header (template parameter lists, #include
// targets, and the first line of ALIGN_POWER). The template headers below
// are reconstructed -- confirm against the upstream file.
#ifndef NCCL_ALIGN_H_
#define NCCL_ALIGN_H_

// If x > y, round x up to a multiple of y; otherwise return the largest
// value <= y of the form y/k that is >= x (used for power-of-two alignment).
#define ALIGN_POWER(x, y) \
  ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))

// Statement macro: rounds `size` up to a multiple of `align` in place.
#define ALIGN_SIZE(size, align) \
  size = ((size + (align) - 1) / (align)) * (align);

// Host-only builds: make __host__/__device__ no-ops so the helpers below
// compile with a plain C++ compiler.
#if !__CUDA_ARCH__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif

// divUp(x, y): ceiling division of x by y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z divUp(X x, Y y) {
  return (x+y-1)/y;
}

// roundUp(x, y): round x up to the next multiple of y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundUp(X x, Y y) {
  return (x+y-1) - (x+y-1)%y;
}

// alignUp(x, a): round x up to the next multiple of a.
// Assumes the second argument is a power of 2.
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignUp(X x, int a) {
  return (x+a-1) & Z(-a);
}

#endif
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /src/collectives/device/gen_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
#!/bin/bash
#
# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

# Emit (to stdout) makefile rules that instantiate each collective source
# file once per (reduction op, datatype) pair.
#   $1         : output/object directory
# Environment:
#   CUDA_MAJOR : CUDA major version; bf16 is added only when >= 11.

dir=$1

datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
# Default to 0 when CUDA_MAJOR is unset/empty, otherwise `[ -ge ]` aborts
# with "integer expression expected".
if [ "${CUDA_MAJOR:-0}" -ge 11 ]
then
  datatypes+=" bf16"
fi

targets="GENOBJS := \\\\\n"

for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
  opn=0
  for op in sum prod min max premulsum sumpostdiv; do
    dtn=0
    # Order must match that of the ncclDataType_t enum
    for dt in ${datatypes}; do
      # Generate a unique filename for each compilation unit,
      # otherwise the __nv_module_id may conflict at link time
      echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
      echo "	@printf \"Copying %-35s > %s\\\\n\" \$< \$@"
      echo "	cp \$< \$@"
      echo ""
      # Compile the file
      echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
      echo "	@printf \"Compiling  %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
      echo "	mkdir -p ${dir}"
      echo "	\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
      echo ""
      targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
      dtn=$(($dtn + 1))
    done
    opn=$(($opn + 1))
  done
done
echo -e "$targets"
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "nccl.h" 9 | 10 | NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, 11 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 12 | ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, 13 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 14 | struct NvtxParamsAllReduce { 15 | size_t bytes; 16 | ncclRedOp_t op; 17 | }; 18 | // Just pass the size of one message and not the total bytes sent/received. 19 | static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { 20 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, 21 | {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, 22 | offsetof(NvtxParamsAllReduce, op)} 23 | }; 24 | NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; 25 | NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) 26 | 27 | struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", 28 | sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ 29 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; 30 | return ncclEnqueueCheck(&info); 31 | } 32 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
/*************************************************************************
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

/* Define weak symbols used to allow libnccl_static.a to work with older
 * libcudart_static.a: a real runtime that provides these entry points
 * overrides the weak stubs; otherwise the stubs return cudaErrorStubLibrary
 * so callers can detect the missing functionality. */

enum cudaError_t { cudaErrorStubLibrary = 34 };

extern "C" {

/* One weak, hidden stub per CUDA runtime entry point that may be absent
 * from older libcudart_static.a. The macro keeps the attribute spelling
 * identical for every stub (previously five hand-copied pairs). */
#define NCCL_WEAK_CUDART_STUB(fn) \
  cudaError_t fn(...) __attribute__((visibility("hidden"))) __attribute((weak)); \
  cudaError_t fn(...) { return cudaErrorStubLibrary; }

NCCL_WEAK_CUDART_STUB(cudaStreamGetCaptureInfo_v2)
NCCL_WEAK_CUDART_STUB(cudaUserObjectCreate)
NCCL_WEAK_CUDART_STUB(cudaGraphRetainUserObject)
NCCL_WEAK_CUDART_STUB(cudaStreamUpdateCaptureDependencies)
NCCL_WEAK_CUDART_STUB(cudaGetDriverEntryPoint)

#undef NCCL_WEAK_CUDART_STUB

}
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "nccl.h" 10 | 11 | NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, 12 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); 13 | ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, 14 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { 15 | struct NvtxParamsReduceScatter { 16 | size_t bytes; 17 | ncclRedOp_t op; 18 | }; 19 | constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { 20 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, 21 | {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, 22 | offsetof(NvtxParamsReduceScatter, op)} 23 | }; 24 | NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; 25 | NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) 26 | 27 | struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", 28 | sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ 29 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; 30 | return ncclEnqueueCheck(&info); 31 | } 32 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /* This header defines types which are used by the internal implementation 10 | * of NVTX and callback subscribers. 
API clients do not use these types, 11 | * so they are defined here instead of in nvToolsExt.h to clarify they are 12 | * not part of the NVTX client API. */ 13 | 14 | #ifndef NVTXEXTTYPES_H 15 | #define NVTXEXTTYPES_H 16 | 17 | #ifndef NVTX_EXT_TYPES_GUARD 18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. 19 | #endif 20 | 21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); 22 | 23 | typedef struct nvtxExtModuleSegment_t 24 | { 25 | size_t segmentId; 26 | size_t slotCount; 27 | intptr_t* functionSlots; 28 | } nvtxExtModuleSegment_t; 29 | 30 | typedef struct nvtxExtModuleInfo_t 31 | { 32 | uint16_t nvtxVer; 33 | uint16_t structSize; 34 | uint16_t moduleId; 35 | uint16_t compatId; 36 | size_t segmentsCount; 37 | nvtxExtModuleSegment_t* segments; 38 | NvtxExtGetExportFunction_t getExportFunction; 39 | const void* extInfo; 40 | } nvtxExtModuleInfo_t; 41 | 42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); 43 | 44 | #endif /* NVTXEXTTYPES_H */ -------------------------------------------------------------------------------- /src/collectives/reduce.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "nccl.h" 10 | 11 | NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, 12 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); 13 | ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, 14 | ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 15 | struct NvtxParamsReduce { 16 | size_t bytes; 17 | int root; 18 | ncclRedOp_t op; 19 | }; 20 | constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { 21 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, 22 | {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, 23 | {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, 24 | offsetof(NvtxParamsReduce, op)} 25 | }; 26 | NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; 27 | NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) 28 | 29 | struct ncclInfo info = { ncclFuncReduce, "Reduce", 30 | sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ 31 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; 32 | return ncclEnqueueCheck(&info); 33 | } 34 | -------------------------------------------------------------------------------- /ext-tuner/example/README.md: -------------------------------------------------------------------------------- 1 | # Contribution 2 | 3 | ## docker 4 | ``` 5 | docker pull nvcr.io/nvidia/pytorch:23.08-py3 6 | ``` 7 | 8 | ## Code Style 9 | ```sh 10 | # download clang-format 13.0.0 11 | wget https://gh-proxy.com/https://github.com/llvm/llvm-project/releases/download/llvmorg-13.0.0/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz 12 | # uncompress 13 | tar -xf clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz 14 | 
clangtidy=./clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04/bin/clang-tidy 15 | clangformat=./clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04/bin/clang-format 16 | 17 | # use bear to generate compile_commands.json 18 | sudo apt install bear 19 | bear make 20 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs $clangtidy -p . -fix-errors 21 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs $clangtidy -p . -checks=cppcoreguidelines-init-variables > tidy_result.txt 2>&1 22 | 23 | pip3 install cpplint 24 | 25 | # check the format 26 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs cpplint > lint_result.txt 2>&1 27 | find . -name '*.cc' -or -name '*.h' -or -name '*.cu' | xargs $clangformat -i 28 | ``` 29 | 30 | ## performance debug 31 | ```sh 32 | apt update 33 | apt install linux-tools-`uname -r | cut -d- -f1-2`-`uname -r | cut -d- -f3` -y 34 | apt install linux-tools-common 35 | apt install linux-tools-generic -y 36 | make perf 37 | perf report -g -i perf.data 38 | ``` 39 | * flamegraph 40 | ``` 41 | git clone https://github.com/brendangregg/FlameGraph.git 42 | perf script -i perf.data | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl > out.svg 43 | ``` 44 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/jobs/bruteforce_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/include/datatype.h" 3 | #include "src/include/internal/logging.h" 4 | #include "src/include/jobs/job.h" 5 | 6 | struct BFJob : public Job { 7 | static bool isDecentralized() { return false; } 8 | BFJob(Tuner *tuner, const GIDTYPE &groupID, const Workload &workload, 9 | const Candidate &startCandidate, 10 | std::vector &&validCandidates, std::vector configRanges, const int32_t warmupSteps, const int32_t nativeSteps, 11 | const int32_t pretrainSteps, const int32_t trainSteps, 12 | const int32_t roundMaxSteps, const 
int32_t optimumExpireSteps, 13 | const int32_t expireSteps) : Job(tuner, groupID, workload, startCandidate, std::move(validCandidates), configRanges, 14 | warmupSteps, nativeSteps, pretrainSteps+trainSteps, 0, roundMaxSteps, 15 | optimumExpireSteps, expireSteps) { 16 | if (this->roundMaxSteps > this->pretrainSteps) { 17 | WARN(Logger::LogSubSys::OPTIMIZER) << "set pretrainSteps at least: " << this->roundMaxSteps; 18 | } 19 | this->accessByRand = Environment::get()->find("TUNER_BRUTEFOROCE_RAND", 1) > 0; 20 | } 21 | std::string debugStr() const override; 22 | 23 | JobType getType() const override { 24 | return JobType::BFJob; 25 | } 26 | 27 | protected: 28 | void addTrain(Result *jobResult) override {}; 29 | }; 30 | 31 | std::string BFJob::debugStr() const { 32 | std::stringstream ss; // NOLINT 33 | ss << Job::debugStr(); 34 | ss << " + BFJob["; 35 | ss << "]"; 36 | return ss.str(); 37 | } 38 | -------------------------------------------------------------------------------- /ext-tuner/example/plugin/cuda/wrapper.cc: -------------------------------------------------------------------------------- 1 | #include "src/cuda/collectives/nccl_candidate.h" 2 | 3 | #define WORKLOAD_SIZE 3 4 | #define CANDIDATE_SIZE 10 5 | 6 | // Wrapper function to match Python ctypes expectations 7 | extern "C" void ncclGetValidCandidatesWrapper( 8 | const GroupInfo* groupInfo, 9 | const std::pair* pairs, size_t pairs_size, 10 | const uint64_t* workloadElemPtr, bool scale2, int32_t** candidateElemPtr, size_t* candidateElemCount) { 11 | Info myInfo; 12 | 13 | std::map tunerEnvs; 14 | for (size_t i = 0; i < pairs_size; ++i) { 15 | tunerEnvs[pairs[i].first] = pairs[i].second; 16 | } 17 | std::unordered_map allGroupInfos; 18 | allGroupInfos[groupInfo->groupID] = GroupInfo( 19 | groupInfo->groupID, 20 | groupInfo->root, 21 | groupInfo->rank, 22 | groupInfo->nrank, 23 | groupInfo->nnode, 24 | tunerEnvs); 25 | 26 | Workload workload(workloadElemPtr, workloadElemPtr+WORKLOAD_SIZE); 27 | std::vector 
candidates; 28 | std::vector configRanges; 29 | 30 | ncclGetValidCandidates(myInfo, allGroupInfos, workload, scale2, &candidates, &configRanges); 31 | 32 | *candidateElemCount = candidates.size() * CANDIDATE_SIZE; 33 | *candidateElemPtr = new int32_t[*candidateElemCount]; 34 | 35 | size_t index = 0; 36 | for (const auto& candidate : candidates) { 37 | std::copy(candidate.begin(), candidate.end(), *candidateElemPtr + index); 38 | index += CANDIDATE_SIZE; 39 | } 40 | } 41 | 42 | // Function to free allocated candidate memory 43 | extern "C" void freeCandidates(int32_t* candidateElemPtr) { 44 | delete[] candidateElemPtr; 45 | } 46 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | 10 | #include "nccl_net.h" 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | // Conform to pthread and NVTX standard 20 | #define NCCL_THREAD_NAMELEN 16 21 | 22 | extern int ncclDebugLevel; 23 | extern uint64_t ncclDebugMask; 24 | extern pthread_mutex_t ncclDebugLock; 25 | extern FILE *ncclDebugFile; 26 | extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); 27 | 28 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 29 | 30 | // Let code temporarily downgrade WARN into INFO 31 | extern thread_local int ncclDebugNoWarn; 32 | extern char ncclLastError[]; 33 | 34 | #define WARN(...) 
ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 35 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 36 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) 37 | 38 | #ifdef ENABLE_TRACE 39 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 40 | extern std::chrono::steady_clock::time_point ncclEpoch; 41 | #else 42 | #define TRACE(...) 43 | #endif 44 | 45 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
// Parse one hexadecimal digit of a cpumask string (e.g. "0003ff,f0003fff").
// Returns the digit's value (0-15), or -1 if `c` is not a hex digit.
// Generalized (backward-compatibly) to also accept upper-case digits:
// the example masks in this header are lower-case, but user-supplied masks
// may be upper-case; lower-case inputs behave exactly as before.
// NOTE(review): the remainder of this header (ncclStrToCpuset /
// ncclCpusetToStr) is garbled in this extraction -- interior lines were
// lost -- and is not reconstructed here; restore it from upstream.
static int hexToInt(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_BOOTSTRAP_H_ 8 | #define NCCL_BOOTSTRAP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | struct ncclBootstrapHandle { 14 | uint64_t magic; 15 | union ncclSocketAddress addr; 16 | }; 17 | static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); 18 | 19 | ncclResult_t bootstrapNetInit(); 20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); 21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); 22 | ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); 23 | ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); 24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); 25 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); 26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); 27 | ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); 28 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); 29 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); 30 | ncclResult_t bootstrapClose(void* commState); 31 | ncclResult_t bootstrapAbort(void* commState); 32 | #endif 33 | -------------------------------------------------------------------------------- /src/collectives/broadcast.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA 
CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | 10 | NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 11 | ncclComm_t comm, cudaStream_t stream); 12 | ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, 13 | ncclComm_t comm, cudaStream_t stream) { 14 | struct NvtxParamsBroadcast { 15 | size_t bytes; 16 | int root; 17 | }; 18 | constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { 19 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, 20 | {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} 21 | }; 22 | NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; 23 | NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) 24 | 25 | struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", 26 | sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ 27 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; 28 | return ncclEnqueueCheck(&info); 29 | } 30 | /* Deprecated original "in place" function, similar to MPI */ 31 | NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, 32 | ncclComm_t comm, cudaStream_t stream); 33 | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, 34 | ncclComm_t comm, cudaStream_t stream) { 35 | return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); 36 | } 37 | 38 | -------------------------------------------------------------------------------- /pkg/debian/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | DEBPREPDIR := $(BUILDDIR)/debian 11 | PKGDIR := $(BUILDDIR)/pkg/deb/ 12 | 13 | DEBGEN_IN := $(wildcard *.in) 14 | DEBGEN := $(DEBGEN_IN:.in=) 15 | DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) 16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) 20 | PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) 21 | 22 | prep : $(DEBTARGETS) 23 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 24 | 25 | build : prep 26 | $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) 27 | @printf "Building Debian package\n" 28 | (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) 29 | mkdir -p $(PKGDIR) 30 | mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ 31 | 32 | clean: 33 | rm -Rf $(DEBPREPDIR) $(PKGDIR) 34 | 35 | $(DEBPREPDIR)/% : %.in 36 | @printf "Generating %-35s > %s\n" $< $@ 37 | mkdir -p $(DEBPREPDIR) 38 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 39 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 40 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 41 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 42 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 43 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 44 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 45 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 46 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 47 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 48 | $< > $@ 49 | 50 | $(DEBPREPDIR)/% : % 51 | @printf "Grabbing %-35s > %s\n" $< $@ 52 | mkdir -p $(DEBPREPDIR) 53 | cp -f $< $@ 54 | -------------------------------------------------------------------------------- /src/include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 
| * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TIMER_H_ 8 | #define NCCL_TIMER_H_ 9 | #if ENABLE_TIMER 10 | #include 11 | #include 12 | #include 13 | static double freq = -1; 14 | static void calibrate() { 15 | struct timeval tv; 16 | gettimeofday(&tv, NULL); 17 | uint64_t timeCycles = __rdtsc(); 18 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 19 | uint64_t total = 0ULL; 20 | for (int i=0; i<10000; i++) total += __rdtsc(); 21 | gettimeofday(&tv, NULL); 22 | timeCycles = __rdtsc() - timeCycles; 23 | time += tv.tv_sec*1E6 + tv.tv_usec; 24 | freq = timeCycles/time; 25 | } 26 | static inline double gettime() { 27 | if (freq == -1) calibrate(); 28 | return __rdtsc()/freq; 29 | } 30 | static uint64_t counts[8]; 31 | static double times[8]; 32 | static double startTimes[8]; 33 | #define TIME_START(index) do { \ 34 | counts[index]++; \ 35 | startTimes[index] = gettime(); \ 36 | } while (0); 37 | 38 | #define TIME_STOP(index) do { \ 39 | times[index] += gettime() - startTimes[index]; \ 40 | } while (0); 41 | 42 | #define TIME_CANCEL(index) do { \ 43 | counts[index]--; \ 44 | } while (0); 45 | 46 | #define TIME_PRINT(name) do { \ 47 | printf("%s stats", name); \ 48 | for (int i=0; i<8; i++) { \ 49 | if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ 50 | counts[i] = 0; \ 51 | } \ 52 | printf("\n"); \ 53 | } while (0); 54 | #else 55 | #define TIME_START(index) while(0); 56 | #define TIME_STOP(index) while(0); 57 | #define TIME_CANCEL(index) while(0); 58 | #define TIME_PRINT(name) 59 | #endif 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CORE_H_ 8 | #define NCCL_CORE_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // For std::min/std::max 15 | #include "nccl.h" 16 | 17 | #ifdef PROFAPI 18 | #define NCCL_API(ret, func, args...) \ 19 | __attribute__ ((visibility("default"))) \ 20 | __attribute__ ((alias(#func))) \ 21 | ret p##func (args); \ 22 | extern "C" \ 23 | __attribute__ ((visibility("default"))) \ 24 | __attribute__ ((weak)) \ 25 | ret func(args) 26 | #else 27 | #define NCCL_API(ret, func, args...) \ 28 | extern "C" \ 29 | __attribute__ ((visibility("default"))) \ 30 | ret func(args) 31 | #endif // end PROFAPI 32 | 33 | static __inline__ int ncclTypeSize(ncclDataType_t type) { 34 | switch (type) { 35 | case ncclInt8: 36 | case ncclUint8: 37 | return 1; 38 | case ncclFloat16: 39 | #if defined(__CUDA_BF16_TYPES_EXIST__) 40 | case ncclBfloat16: 41 | #endif 42 | return 2; 43 | case ncclInt32: 44 | case ncclUint32: 45 | case ncclFloat32: 46 | return 4; 47 | case ncclInt64: 48 | case ncclUint64: 49 | case ncclFloat64: 50 | return 8; 51 | default: 52 | return -1; 53 | } 54 | } 55 | 56 | #include "debug.h" 57 | #include "checks.h" 58 | #include "cudawrap.h" 59 | #include "alloc.h" 60 | #include "utils.h" 61 | #include "param.h" 62 | #include "nvtx.h" 63 | 64 | #endif // end include guard 65 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/jobs/native_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/include/datatype.h" 3 | #include "src/include/internal/logging.h" 4 | #include "src/include/jobs/job.h" 5 | 6 | #define DEBUG_DIST 0 7 | struct NativeJob : public Job { 8 | NativeJob(Tuner *tuner, const GIDTYPE &groupID, const 
Workload &workload, 9 | const Candidate &startCandidate, 10 | std::vector &&validCandidates, std::vector configRanges, 11 | const int32_t warmupSteps, const int32_t nativeSteps, const int32_t pretrainSteps, 12 | const int32_t trainSteps, const int32_t roundMaxSteps, 13 | const int32_t optimumExpireSteps, const int32_t expireSteps) 14 | : Job(tuner, groupID, workload, startCandidate, 15 | std::move(validCandidates), configRanges, 0, 0, 0, 16 | 0, roundMaxSteps, optimumExpireSteps, expireSteps) { 17 | } 18 | JobType getType() const override { 19 | return JobType::NativeJob; 20 | } 21 | static bool isDecentralized() { return DEBUG_DIST == 0; } 22 | void createResult(const SimpleQueryStatus &status, Result *result) override; 23 | 24 | protected: 25 | void addTrain(Result *jobResult) override {}; 26 | }; 27 | 28 | void NativeJob::createResult(const SimpleQueryStatus &status, Result *result) { 29 | if (this->isDone) 30 | return; 31 | this->isDone = true; 32 | result->workload = this->workload; 33 | result->roundCandidates.version = this->nextVersion++; 34 | result->roundCandidates.roundExpire = ExpireCandidate::FOREVER; 35 | auto nativeEC = 36 | ExpireCandidate(ExpireCandidate::FOREVER, this->startCandidate); 37 | this->optimum = nativeEC.candidate; 38 | result->roundCandidates.expireCandidates.push_back(std::move(nativeEC)); 39 | this->currTotalExpire = ExpireCandidate::FOREVER; 40 | INFO(Logger::LogSubSys::OPTIMIZER) << " add permanent:" << toDebugStr(this->workload) 41 | << " candidate:" << toDebugStr(this->startCandidate); 42 | } 43 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 13 | Laboratory, the U.S. Department of Energy, nor the names of their 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | The U.S. Department of Energy funded the development of this software 30 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 31 | 32 | 33 | This code also includes files from the NVIDIA Tools Extension SDK project. 34 | 35 | See: 36 | 37 | https://github.com/NVIDIA/NVTX 38 | 39 | for more information and license details. 
40 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | #define MAXWIDTH 20 10 | #define PREFIXLEN 15 11 | #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) 12 | void dumpLine(int* values, int nranks, const char* prefix) { 13 | int prefixlen = strlen(prefix); 14 | char line[STRLENGTH+1]; 15 | line[STRLENGTH] = '\0'; 16 | memset(line, ' ', STRLENGTH); 17 | strncpy(line, prefix, PREFIXLEN); 18 | for (int i=0; i 3 | #include 4 | #include 5 | #include 6 | 7 | template class ThreadsafeQueue { 8 | public: 9 | ThreadsafeQueue() = default; 10 | 11 | void push(T &&value) { 12 | // std::lock_guard lock(mutex_); 13 | pthread_mutex_lock(&this->mutex); 14 | queue_.push(std::move(value)); 15 | // condition_.notify_one(); 16 | pthread_cond_signal(&this->cond); 17 | pthread_mutex_unlock(&this->mutex); 18 | } 19 | 20 | void waitPop(T *value) { 21 | // std::unique_lock lock(mutex_); 22 | pthread_mutex_lock(&this->mutex); 23 | // condition_.wait(lock, [this] { return !queue_.empty(); }); 24 | while (queue_.empty()) { 25 | pthread_cond_wait(&this->cond, &this->mutex); 26 | } 27 | *value = std::move(queue_.front()); 28 | queue_.pop(); 29 | pthread_mutex_unlock(&this->mutex); 30 | } 31 | 32 | void merge(std::vector *v) { 33 | for (auto &item : *v) { 34 | // std::lock_guard lock(mutex_); 35 | pthread_mutex_lock(&this->mutex); 36 | queue_.push(std::move(item)); 37 | // condition_.notify_one(); 38 | pthread_cond_signal(&this->cond); 39 | pthread_mutex_unlock(&this->mutex); 40 | } 41 | } 42 | bool empty() { 43 | // std::lock_guard lock(mutex_); 44 | 
pthread_mutex_lock(&this->mutex); 45 | bool flag = queue_.empty(); // NOLINT 46 | pthread_mutex_unlock(&this->mutex); 47 | return flag; 48 | } 49 | 50 | void wait(const bool &flag = false) { 51 | pthread_mutex_lock(&this->mutex); 52 | while (!flag && queue_.empty()) { 53 | pthread_cond_wait(&this->cond, &this->mutex); 54 | } 55 | pthread_mutex_unlock(&this->mutex); 56 | } 57 | 58 | private: 59 | std::queue queue_; 60 | pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 61 | pthread_cond_t cond = PTHREAD_COND_INITIALIZER; 62 | // mutable std::mutex mutex_; 63 | // std::condition_variable condition_; 64 | }; 65 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/internal/env.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class Environment { 10 | public: 11 | static Environment *get() { 12 | static std::shared_ptr instance(new Environment()); 13 | return instance.get(); 14 | } 15 | 16 | template T find(const char *key, const T defaultValue) { 17 | const char *value = this->find(key); 18 | if (value == nullptr) { 19 | return defaultValue; 20 | } else { 21 | return T(value); 22 | } 23 | } 24 | 25 | int32_t find(const char *key, const int32_t defaultValue) { 26 | const char *value = this->find(key); 27 | if (value == nullptr) { 28 | return defaultValue; 29 | } else { 30 | return std::stoi(value); 31 | } 32 | } 33 | 34 | const char *find(const char *key) { 35 | if (kvs.count(std::string(key)) > 0) { 36 | return kvs[std::string(key)].c_str(); 37 | } else { 38 | const char *value = getenv(key); 39 | if (value == nullptr || strlen(value) == 0) 40 | return nullptr; 41 | return value; 42 | } 43 | } 44 | 45 | void set(const std::string &key, const std::string &value) { 46 | kvs[key] = value; 47 | } 48 | 49 | private: 50 | std::unordered_map kvs; 51 | }; 52 | 53 | #if 0 54 | 
int32_t main() { 55 | // Test the Environment class 56 | Environment& env = Environment::get(); 57 | 58 | // Test find() with string value 59 | std::string strValue = env.find("STRING_KEY", "default_string"); 60 | std::cout << "String value: " << strValue << std::endl; 61 | 62 | // Test find() with int32_t value 63 | int32_t intValue = env.find("INT_KEY", 123); 64 | std::cout << "Int value: " << intValue << std::endl; 65 | 66 | // Test set() and find() with newly set value 67 | env.set("NEW_KEY", "new_value"); 68 | std::string newValue = env.find("NEW_KEY", "default_new_value"); 69 | std::cout << "New value: " << newValue << std::endl; 70 | 71 | return 0; 72 | } 73 | #endif 74 | -------------------------------------------------------------------------------- /src/include/npkit/npkit.h: -------------------------------------------------------------------------------- 1 | #ifndef NPKIT_H_ 2 | #define NPKIT_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "npkit/npkit_event.h" 10 | #include "npkit/npkit_struct.h" 11 | 12 | class NpKit { 13 | public: 14 | static const uint64_t kNumGpuEventBuffers = 64; // [32, 64) for TreeSlipt 15 | 16 | static const uint64_t kNumCpuEventBuffers = 64; // [32, 64) for TreeSlipt 17 | 18 | static ncclResult_t Init(int rank); 19 | 20 | static ncclResult_t Dump(const std::string& dump_dir); 21 | 22 | static ncclResult_t Shutdown(); 23 | 24 | static NpKitEventCollectContext* GetGpuEventCollectContexts(); 25 | 26 | static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, 27 | NpKitEventCollectContext* ctx) { 28 | uint64_t event_buffer_head = ctx->event_buffer_head; 29 | if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { 30 | NpKitEvent& event = ctx->event_buffer[event_buffer_head]; 31 | event.fields.type = type; 32 | event.fields.size = size; 33 | event.fields.rsvd = rsvd; 34 | event.fields.timestamp = timestamp; 35 | ctx->event_buffer_head++; 36 | } 37 | } 38 | 39 | 
static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); 40 | 41 | static uint64_t* GetCpuTimestamp(); 42 | 43 | private: 44 | static void CpuTimestampUpdateThread(); 45 | 46 | // max: 2M * 32 * 16B = 1GB per GPU 47 | static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 21; // 2M 48 | 49 | // max: 4M * 32 * 16B = 2GB per CPU 50 | static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 22; // 2M * 2 (send/recv) * (32/32) = 4M 51 | 52 | static NpKitEvent** gpu_event_buffers_; 53 | static NpKitEvent** cpu_event_buffers_; 54 | 55 | static NpKitEventCollectContext* gpu_collect_contexts_; 56 | static NpKitEventCollectContext* cpu_collect_contexts_; 57 | static uint64_t* cpu_timestamp_; 58 | 59 | static uint64_t rank_; 60 | 61 | static std::thread* cpu_timestamp_update_thread_; 62 | static volatile bool cpu_timestamp_update_thread_should_stop_; 63 | }; 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/collectives/sendrecv.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "enqueue.h" 8 | #include "collectives.h" 9 | #include "argcheck.h" // Need some checks here since we access comm 10 | 11 | struct NvtxParamsSendRecv { 12 | size_t bytes; 13 | int peer; 14 | }; 15 | constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { 16 | {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, 17 | {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} 18 | }; 19 | 20 | NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 21 | ncclComm_t comm, cudaStream_t stream); 22 | ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, 23 | ncclComm_t comm, cudaStream_t stream) { 24 | NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; 25 | NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) 26 | 27 | struct ncclInfo info = { ncclFuncSend, "Send", 28 | NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 29 | 1, 1 }; 30 | ncclResult_t ret; 31 | NCCLCHECK(ncclGroupStart()); 32 | ret = ncclEnqueueCheck(&info); 33 | NCCLCHECK(ncclGroupEnd()); 34 | return ret; 35 | } 36 | 37 | NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 38 | ncclComm_t comm, cudaStream_t stream); 39 | ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, 40 | ncclComm_t comm, cudaStream_t stream) { 41 | NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; 42 | NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) 43 | 44 | struct ncclInfo info = { ncclFuncRecv, "Recv", 45 | NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 46 | 1, 1 }; 47 | ncclResult_t ret; 48 | NCCLCHECK(ncclGroupStart()); 49 | ret = ncclEnqueueCheck(&info); 50 | NCCLCHECK(ncclGroupEnd()); 51 | 
return ret; 52 | } 53 | -------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /ext-tuner/example/.clang-tidy: -------------------------------------------------------------------------------- 1 | # refer to https://clang.llvm.org/extra/clang-tidy/checks/list.html 2 | Checks: -*, clang-analyzer-core.*, clang-analyzer-cplusplus.*, clang-analyzer-deadcode.*, clang-analyzer-nullability.*, clang-analyzer-security.*, clang-analyzer-unix.*, clang-analyzer-valist.*, cppcoreguidelines-macro-usage, cppcoreguidelines-interfaces-global-init, cppcoreguidelines-narrowing-conversions, cppcoreguidelines-no-malloc, cppcoreguidelines-prefer-member-initializer, cppcoreguidelines-special-member-functions, cppcoreguidelines-slicing, google-build-explicit-make-pair, google-default-arguments, google-explicit-constructor, modernize-avoid-bind, 
modernize-loop-convert, modernize-macro-to-enum, modernize-make-shared, modernize-make-unique, modernize-pass-by-value, modernize-redundant-void-arg, modernize-return-braced-init-list, modernize-use-auto, modernize-use-bool-literals, modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete, modernize-use-nullptr, modernize-use-override, modernize-use-using, performance-faster-string-find, performance-for-range-copy, performance-implicit-conversion-in-loop, performance-inefficient-algorithm, performance-inefficient-vector-operation, performance-move-const-arg, performance-move-constructor-init, performance-no-automatic-move, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization, performance-unnecessary-value-param 3 | 4 | WarningsAsErrors: clang-analyzer-*, -clang-analyzer-security.insecureAPI.rand, cppcoreguidelines-interfaces-global-init, cppcoreguidelines-no-malloc, cppcoreguidelines-slicing, google-*, modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete, performance-implicit-conversion-in-loop, performance-inefficient-algorithm, performance-move-constructor-init, performance-no-automatic-move, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization 5 | 6 | CheckOptions: 7 | - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor 8 | value: True 9 | - key: cppcoreguidelines-special-member-functions.AllowMissingMoveFunctionsWhenCopyIsDeleted 10 | value: True 11 | - key: performance-move-const-arg.CheckTriviallyCopyableMove 12 | value: False 13 | -------------------------------------------------------------------------------- /src/collectives/device/onerank_reduce.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "common_kernel.h" 10 | #include "common.h" 11 | 12 | namespace { 13 | template 14 | __device__ __forceinline__ void oneRankReduce() { 15 | ncclWork *w = &ncclShmem.work; 16 | int tid = threadIdx.x; 17 | int tn = blockDim.x; 18 | #pragma unroll 1 19 | for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) { 20 | ncclWorkElem *we = &w->elems[e]; 21 | intptr_t eltN = we->count; 22 | int bid = we->bid; 23 | int bn = we->nChannels; 24 | T const *src = (T const*)we->sendbuff; 25 | T *dst = (T*)we->recvbuff; 26 | 27 | // each block/channel gets a roughly equal segment of 16 byte packs 28 | constexpr int EltPerPack = 16/sizeof(T); 29 | intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; 30 | intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); 31 | intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); 32 | i0 *= EltPerPack; 33 | i0 = i0 < eltN ? i0 : eltN; 34 | i1 *= EltPerPack; 35 | i1 = i1 < eltN ? 
i1 : eltN; 36 | src += i0; 37 | dst += i0; 38 | void *vsrc = (void*)src; 39 | void *vdst = (void*)dst; 40 | reduceCopy 41 | (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0); 42 | } 43 | } 44 | } 45 | 46 | #define INSTANTIATE(devredop, type) \ 47 | __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ 48 | oneRankReduce>(); \ 49 | } 50 | 51 | INSTANTIATE(PreMulSum, int8_t) 52 | INSTANTIATE(PreMulSum, uint8_t) 53 | INSTANTIATE(PreMulSum, int32_t) 54 | INSTANTIATE(PreMulSum, uint32_t) 55 | INSTANTIATE(PreMulSum, int64_t) 56 | INSTANTIATE(PreMulSum, uint64_t) 57 | INSTANTIATE(PreMulSum, half) 58 | #if defined(__CUDA_BF16_TYPES_EXIST__) 59 | INSTANTIATE(PreMulSum, __nv_bfloat16) 60 | #endif 61 | INSTANTIATE(PreMulSum, float) 62 | INSTANTIATE(PreMulSum, double) 63 | -------------------------------------------------------------------------------- /src/include/channel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHANNEL_H_ 8 | #define NCCL_CHANNEL_H_ 9 | #include "comm.h" 10 | 11 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 12 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 13 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 14 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); 15 | static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { 16 | int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; 17 | int peerNode = comm->rankToNode[peer]; 18 | int peerIndex = comm->rankToLocalRank[peer]; 19 | int nsteps = comm->maxLocalRanks; 20 | int rankIndex = comm->rankToLocalRank[comm->rank]; 21 | int step, delta; 22 | if (coll == ncclFuncSend) { 23 | step = (nsteps + peerIndex - rankIndex)%nsteps; 24 | delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; 25 | } else if (coll == ncclFuncRecv) { 26 | step = (nsteps + rankIndex - peerIndex)%nsteps; 27 | delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; 28 | } else { 29 | return ncclInternalError; 30 | } 31 | *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; 32 | TRACE(NCCL_COLL, "native=1, peer=%d %s -> channelBase=%d", peer, (coll == ncclFuncSend ? 
"send" : "recv"), *channelBase); 33 | return ncclSuccess; 34 | } 35 | 36 | static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { 37 | //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; 38 | *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; 39 | return ncclSuccess; 40 | } 41 | 42 | static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { 43 | int base; 44 | NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); 45 | NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); 46 | return ncclSuccess; 47 | } 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V3_H_ 6 | #define NCCL_NET_V3_H_ 7 | 8 | #define NCCL_NET_MAX_REQUESTS_V3 16 9 | 10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; 11 | typedef struct { 12 | // Name of the network (mainly for logs) 13 | const char* name; 14 | // Initialize the network. 15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 16 | // Return the number of adapters. 17 | ncclResult_t (*devices)(int* ndev); 18 | // Get various device properties. 19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); 20 | // Create a receiving object and provide a handle to connect to it. The 21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 22 | // between ranks to create a connection. 23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 24 | // Connect to a handle and return a sending comm object for that peer. 
25 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 26 | // Finalize connection establishment after remote peer has called connectHandle 27 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 28 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 29 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 
43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v3_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-tuner/example/src/cuda/nccl_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* 4 | #define NCCL_ALGO_UNDEF -1 5 | #define NCCL_ALGO_TREE 0 6 | #define NCCL_ALGO_RING 1 7 | #define NCCL_ALGO_COLLNET_DIRECT 2 8 | #define NCCL_ALGO_COLLNET_CHAIN 3 9 | #define NCCL_ALGO_NVLS 4 10 | #define NCCL_ALGO_NVLS_TREE 5 11 | 12 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 13 | #define NCCL_PROTO_UNDEF -1 14 | #define NCCL_PROTO_LL 0 15 | #define NCCL_PROTO_LL128 1 16 | #define NCCL_PROTO_SIMPLE 2 17 | 18 | #define NCCL_SM_COPY 0 19 | #define NCCL_COPY_ENGINE 1 20 | 21 | #define NCCL_STEPS 8 22 | #define WARP_SIZE 32 23 | #define SIZEELEM (sizeof(int8_t)) 24 | // LL 25 | #define EltPerLine (sizeof(uint64_t)/SIZEELEM) 26 | // LL128 27 | #define NCCL_LL128_LINESIZE 128 28 | #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) 29 | #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS - 1) 30 | #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 31 | #define WireWordPerSlice WARP_SIZE *NCCL_LL128_SHMEM_ELEMS_PER_THREAD 32 | 33 | #define DataEltPerSlice \ 34 | ((WireWordPerSlice-WireWordPerSlice/NCCL_LL128_LINEELEMS)*(sizeof(uint64_t)/SIZEELEM)) 35 | 36 | #define NCCL_MAX_NTHREADS 640 37 | 38 | using ncclFunc_t = enum { 39 | ncclFuncBroadcast, 40 | ncclFuncReduce, 41 | ncclFuncAllGather, 42 | ncclFuncReduceScatter, 43 | ncclFuncAllReduce, 44 | ncclFuncSendRecv, 45 | ncclFuncSend, 46 | ncclFuncRecv, 47 | ncclNumFuncs, 48 | ncclFuncAll2All, 49 | 
ncclFuncAll2Allv 50 | }; 51 | 52 | using LinkType = enum { 53 | // Local (myself) 54 | PATH_LOC = 0, 55 | // Connection traversing NVLink 56 | PATH_NVL = 1, 57 | // Connection through NVLink using an intermediate GPU 58 | PATH_NVB = 2, 59 | // Connection traversing at most a single PCIe bridge 60 | PATH_PIX = 3, 61 | // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 62 | PATH_PXB = 4, 63 | // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 64 | PATH_PXN = 5, 65 | // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 66 | PATH_PHB = 6, 67 | // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 68 | PATH_SYS = 7, 69 | // Connection through the network 70 | PATH_NET = 8 71 | }; 72 | 73 | using Transports = enum { 74 | TRANSPORT_P2P = 0, 75 | TRANSPORT_P2P_CE = 1, 76 | TRANSPORT_SHM = 2, 77 | TRANSPORT_NET = 3, 78 | TRANSPORT_COLLNET = 4 79 | }; 80 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifndef NVTX_EXT_IMPL_H 14 | #define NVTX_EXT_IMPL_H 15 | /* ---- Include required platform headers ---- */ 16 | 17 | #if defined(_WIN32) 18 | 19 | #include 20 | 21 | #else 22 | #include 23 | 24 | #if defined(__ANDROID__) 25 | #include 26 | #endif 27 | 28 | #if defined(__linux__) || defined(__CYGWIN__) 29 | #include 30 | #endif 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | 47 | #endif 48 | 49 | /* ---- Define macros used in this file ---- */ 50 | 51 | #ifdef NVTX_DEBUG_PRINT 52 | #ifdef __ANDROID__ 53 | #include 54 | #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); 55 | #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); 56 | #else 57 | #include 58 | #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) 59 | #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) 60 | #endif 61 | #else /* !defined(NVTX_DEBUG_PRINT) */ 62 | #define NVTX_ERR(...) 63 | #define NVTX_INFO(...) 
64 | #endif 65 | 66 | #ifdef __cplusplus 67 | extern "C" { 68 | #endif /* __cplusplus */ 69 | 70 | // #ifdef __GNUC__ 71 | // #pragma GCC visibility push(hidden) 72 | // #endif 73 | 74 | #define NVTX_EXTENSION_FRESH 0 75 | #define NVTX_EXTENSION_DISABLED 1 76 | #define NVTX_EXTENSION_STARTING 2 77 | #define NVTX_EXTENSION_LOADED 3 78 | 79 | NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0; 80 | 81 | #define NVTX_EXT_INIT_GUARD 82 | #include "nvtxExtInit.h" 83 | #undef NVTX_EXT_INIT_GUARD 84 | 85 | // #ifdef __GNUC__ 86 | // #pragma GCC visibility pop 87 | // #endif 88 | 89 | #ifdef __cplusplus 90 | } /* extern "C" */ 91 | #endif /* __cplusplus */ 92 | 93 | #endif /* NVTX_EXT_IMPL_H */ -------------------------------------------------------------------------------- /src/misc/param.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "param.h" 8 | #include "debug.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const char* userHomeDir() { 21 | struct passwd *pwUser = getpwuid(getuid()); 22 | return pwUser == NULL ? 
NULL : pwUser->pw_dir; 23 | } 24 | 25 | void setEnvFile(const char* fileName) { 26 | FILE * file = fopen(fileName, "r"); 27 | if (file == NULL) return; 28 | 29 | char *line = NULL; 30 | char envVar[1024]; 31 | char envValue[1024]; 32 | size_t n = 0; 33 | ssize_t read; 34 | while ((read = getline(&line, &n, file)) != -1) { 35 | if (line[read-1] == '\n') line[read-1] = '\0'; 36 | int s=0; // Env Var Size 37 | while (line[s] != '\0' && line[s] != '=') s++; 38 | if (line[s] == '\0') continue; 39 | strncpy(envVar, line, std::min(1023,s)); 40 | envVar[s] = '\0'; 41 | s++; 42 | strncpy(envValue, line+s, 1023); 43 | envValue[1023]='\0'; 44 | setenv(envVar, envValue, 0); 45 | //printf("%s : %s->%s\n", fileName, envVar, envValue); 46 | } 47 | if (line) free(line); 48 | fclose(file); 49 | } 50 | 51 | void initEnv() { 52 | char confFilePath[1024]; 53 | const char * userDir = userHomeDir(); 54 | if (userDir) { 55 | sprintf(confFilePath, "%s/.nccl.conf", userDir); 56 | setEnvFile(confFilePath); 57 | } 58 | sprintf(confFilePath, "/etc/nccl.conf"); 59 | setEnvFile(confFilePath); 60 | } 61 | 62 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { 63 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 64 | pthread_mutex_lock(&mutex); 65 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { 66 | char* str = getenv(env); 67 | int64_t value = deftVal; 68 | if (str && strlen(str) > 0) { 69 | errno = 0; 70 | value = strtoll(str, nullptr, 0); 71 | if (errno) { 72 | value = deftVal; 73 | INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); 74 | } else { 75 | INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); 76 | } 77 | } 78 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 79 | } 80 | pthread_mutex_unlock(&mutex); 81 | } 82 | -------------------------------------------------------------------------------- /src/include/ibvsymbols.h: 
-------------------------------------------------------------------------------- 1 | #ifndef NCCL_IBV_SYMBOLS_H_ 2 | #define NCCL_IBV_SYMBOLS_H_ 3 | 4 | #ifdef NCCL_BUILD_RDMA_CORE 5 | #include 6 | #else 7 | #include "ibvcore.h" 8 | #endif 9 | 10 | #include "nccl.h" 11 | 12 | /* IB Verbs Function Pointers*/ 13 | struct ncclIbvSymbols { 14 | int (*ibv_internal_fork_init)(void); 15 | struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); 16 | void (*ibv_internal_free_device_list)(struct ibv_device **list); 17 | const char * (*ibv_internal_get_device_name)(struct ibv_device *device); 18 | struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); 19 | int (*ibv_internal_close_device)(struct ibv_context *context); 20 | int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); 21 | void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); 22 | int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); 23 | int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); 24 | int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); 25 | int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); 26 | struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); 27 | int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); 28 | struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); 29 | struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); 30 | /* DMA-BUF support */ 31 | struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 32 | int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); 33 | struct 
ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); 34 | int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); 35 | struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); 36 | int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 37 | int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); 38 | const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); 39 | }; 40 | 41 | /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ 42 | ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); 43 | 44 | #endif // NCCL_IBV_SYMBOLS_H_ 45 | -------------------------------------------------------------------------------- /src/collectives/device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../../makefiles/common.mk 8 | include ../../../makefiles/version.mk 9 | 10 | BUILDDIR ?= $(abspath ../../../build) 11 | OBJDIR := $(BUILDDIR)/obj/collectives/device 12 | 13 | LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu 14 | 15 | LIBSRCFILES += functions.cu 16 | 17 | DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) 18 | DEPENDFILES:= $(DEPFILES:%.d=%.dep) 19 | STATICLIB := $(OBJDIR)/colldevice.a 20 | DEVOBJ := $(OBJDIR)/devlink.o 21 | RULESFILE := $(OBJDIR)/Makefile.rules 22 | 23 | NVCUFLAGS += -I. -I.. 
-I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" 24 | 25 | 26 | all: $(STATICLIB) 27 | 28 | # Dummy rule so that the extra dependency (%.dep) files are preserved by make 29 | all_deps: $(DEPENDFILES) 30 | 31 | # Auto-generating the rules per op/reduction/datatype/algorithm 32 | $(RULESFILE) : gen_rules.sh 33 | @printf "Generating %-35s > %s\n" rules $@ 34 | @mkdir -p $(OBJDIR) 35 | @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ 36 | 37 | -include $(RULESFILE) 38 | 39 | LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o 40 | 41 | -include $(DEPFILES) 42 | 43 | $(STATICLIB): $(LIBOBJ) $(DEVOBJ) 44 | @printf "Archiving %-35s > %s\n" objects $@ 45 | ar cr $@ $^ 46 | 47 | # We do not want make to build *.d when running make clean. 48 | # So we only provide targets for .dep which will produce .dep and .d, 49 | # with only .d being included, and .dep keeping track of what needs to 50 | # be regenerated. 51 | $(OBJDIR)/%.dep : %.cu 52 | @mkdir -p $(OBJDIR) 53 | @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp 54 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ 55 | @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ 56 | sed -e 's/^ *//' -e 's/$$/:/' >> $@ 57 | @rm -f $@.tmp 58 | @cp $@ $(@:.dep=.d) 59 | 60 | # Compiled kernels and collectives with relocatable device code ... 61 | $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep 62 | @printf "Compiling %-35s > %s\n" $< $@ 63 | mkdir -p `dirname $@` 64 | $(NVCC) $(NVCUFLAGS) -dc $< -o $@ 65 | 66 | $(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep 67 | @printf "Compiling %-35s > %s\n" $< $@ 68 | mkdir -p `dirname $@` 69 | $(NVCC) $(NVCUFLAGS) -dc $< -o $@ 70 | 71 | # ... and create the device-side linked object with all those. 
72 | $(DEVOBJ) : $(LIBOBJ) 73 | $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ 74 | 75 | clean: 76 | rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) 77 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V2_H_ 6 | #define NCCL_NET_V2_H_ 7 | 8 | typedef struct { 9 | // Name of the network (mainly for logs) 10 | const char* name; 11 | // Initialize the network. 12 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 13 | // Return the number of adapters. 14 | ncclResult_t (*devices)(int* ndev); 15 | // Return the device path in /sys. NCCL will call free on this path. 16 | ncclResult_t (*pciPath)(int dev, char** path); 17 | // Return whether this device supports host pointers and/or CUDA pointers 18 | // as data from the current GPU. Supported types should be composed with 19 | // NCCL_PTR_HOST and NCCL_PTR_CUDA. 20 | ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); 21 | // Create a receiving object and provide a handle to connect to it. The 22 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 23 | // between ranks to create a connection. 24 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 25 | // Connect to a handle and return a sending comm object for that peer. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connectHandle 28 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 29 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v2_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /pkg/redhat/nccl.spec.in: -------------------------------------------------------------------------------- 1 | Name: libnccl 2 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} 3 | Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} 4 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 5 | 6 | Group: Development/Libraries 7 | License: BSD 8 | URL: http://developer.nvidia.com/nccl 9 | Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz 10 | Requires(pre,preun): /sbin/ldconfig 11 | 12 | %description 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | 
broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | %package devel 21 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 22 | Group: Development/Libraries 23 | %description devel 24 | NCCL development files 25 | 26 | %package static 27 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 28 | Group: Development/Libraries 29 | %description static 30 | NCCL static library 31 | 32 | %define debug_package %{nil} 33 | 34 | %prep 35 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q 36 | 37 | %build 38 | 39 | %install 40 | rm -rf $RPM_BUILD_ROOT 41 | install -m 755 -d $RPM_BUILD_ROOT 42 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} 43 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} 44 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} 45 | 46 | # devel 47 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} 48 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} 49 | install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} 50 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so 51 | 52 | # static 53 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} 54 | 55 | %post -p /sbin/ldconfig 56 | %postun -p /sbin/ldconfig 57 | 58 | %post devel -p /sbin/ldconfig 59 | %postun devel -p /sbin/ldconfig 60 | 61 | %clean 62 | rm -rf $RPM_BUILD_ROOT 63 | 64 | %files devel 65 | %doc LICENSE.txt 66 | %defattr(-,root,root,-) 67 | %{_includedir}/nccl.h 68 | %{_includedir}/nccl_net.h 69 | %{_libdir}/libnccl.so 70 | 71 | %files static 72 | %doc LICENSE.txt 73 | %defattr(-,root,root,-) 74 | %{_libdir}/libnccl_static.a 75 | 76 | %files 77 | %doc LICENSE.txt 78 | 
%defattr(-,root,root,-) 79 | %{_libdir}/libnccl.so.${nccl:Major} 80 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} 81 | 82 | %changelog 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Prepare the docker 2 | ``` 3 | docker pull nvcr.io/nvidia/pytorch:23.08-py3 4 | ``` 5 | 6 | ## Download source code 7 | ``` 8 | git clone --recursive https://github.com/gbxu/autoccl.git 9 | ``` 10 | 11 | ## To build the library : 12 | 13 | ```shell 14 | cd autoccl 15 | make -j src.build 16 | ``` 17 | 18 | If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with : 19 | 20 | ```shell 21 | make src.build CUDA_HOME= 22 | ``` 23 | 24 | AutoCCL will be compiled and installed in `build/` unless `BUILDDIR` is set. 25 | 26 | By default, AutoCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform : 27 | ```shell 28 | $ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" 29 | ``` 30 | 31 | ## Build AutoCCL Tuner 32 | ```shell 33 | $ cd ext-tuner/example && make clean && make 34 | ``` 35 | 36 | ## Use AutoCCL 37 | We assume that in a distributed scenario, each CPU process is responsible for managing a GPU. 
38 | 39 | * Preload runtime and tuner to bypass Tccl on the system 40 | ```sh 41 | # Setting environment variables on each process 42 | export LD_PRELOAD=path/to/autoccl/build/lib/libnccl.so 43 | export LD_LIBRARY_PATH=path/to/autoccl/build/lib:path/to/autoccl/ext-tuner/example/build/:$LD_LIBRARY_PATH 44 | export NCCL_TUNER_PLUGIN=path/to/autoccl/ext-tuner/example/build/libnccl-plugin.so 45 | ``` 46 | 47 | * Specify the ip and port of the monitoring process 48 | ```sh 49 | # Setting environment variables on each process 50 | export TUNER_COORDINATOR="coordinator_node_ip:port" 51 | export TUNER_WORLDSIZE="YOUR_COMM_GROUP_SIZE" 52 | ``` 53 | 54 | * Specify a process on the coordinator node to create an additional thread to act as a coordinator responsible for listening to the coordinator_node_ip:port. 55 | ```sh 56 | # Setting environment variables only on a certain process 57 | export TUNER_ROLE="COORDINATOR" 58 | ``` 59 | 60 | ## Example 61 | see `autoccl/ext-tuner/example/example/cuda/pytorch/run.sh` 62 | 63 | 64 | ## Citation 65 | If you use autoccl in a scientific publication, we encourage you to add the following reference to the related papers: 66 | ``` 67 | @inproceedings {xu2025autoccl, 68 | author = {Guanbin Xu and Zhihao Le and Yinhe Chen and Zhiqi Lin and Zewen Jin and Youshan Miao and Cheng Li}, 69 | title = {{AutoCCL}: Automated Collective Communication Tuning for Accelerating Distributed and Parallel {DNN} Training}, 70 | booktitle = {22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)}, 71 | year = {2025}, 72 | isbn = {978-1-939133-46-5}, 73 | address = {Philadelphia, PA}, 74 | pages = {667--683}, 75 | url = {https://www.usenix.org/conference/nsdi25/presentation/xu-guanbin}, 76 | publisher = {USENIX Association}, 77 | month = apr 78 | } 79 | ``` 80 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v5.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V5_H_ 6 | #define NCCL_NET_V5_H_ 7 | 8 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; 9 | typedef struct { 10 | // Name of the network (mainly for logs) 11 | const char* name; 12 | // Initialize the network. 13 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 14 | // Return the number of adapters. 15 | ncclResult_t (*devices)(int* ndev); 16 | // Get various device properties. 17 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); 18 | // Create a receiving object and provide a handle to connect to it. The 19 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 20 | // between ranks to create a connection. 21 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 22 | // Connect to a handle and return a sending comm object for that peer. 23 | // This call must not block for the connection to be established, and instead 24 | // should return successfully with sendComm == NULL with the expectation that 25 | // it will be called again until sendComm != NULL. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connect. 28 | // This call must not block for the connection to be established, and instead 29 | // should return successfully with recvComm == NULL with the expectation that 30 | // it will be called again until recvComm != NULL. 31 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 32 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 33 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 34 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 35 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 36 | // Asynchronous send to a peer. 
37 | // May return request == NULL if the call cannot be performed (or would block) 38 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 39 | // Asynchronous recv from a peer. 40 | // May return request == NULL if the call cannot be performed (or would block) 41 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 42 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 43 | // visible to the GPU 44 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 45 | // Test whether a request is complete. If size is not NULL, it returns the 46 | // number of bytes sent/received. 47 | ncclResult_t (*test)(void* request, int* done, int* sizes); 48 | // Close and free send/recv comm objects 49 | ncclResult_t (*closeSend)(void* sendComm); 50 | ncclResult_t (*closeRecv)(void* recvComm); 51 | ncclResult_t (*closeListen)(void* listenComm); 52 | } ncclNet_v5_t; 53 | 54 | #endif // end include guard 55 | -------------------------------------------------------------------------------- /ext-tuner/example/plugin/cuda/wrapper.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import sys 3 | import os 4 | 5 | class Pair(ctypes.Structure): 6 | _fields_ = [("key", ctypes.c_char_p), 7 | ("value", ctypes.c_int)] 8 | 9 | WORKLOAD_SIZE = 3 10 | CANDIDATE_SIZE = 10 11 | 12 | class GroupInfo(ctypes.Structure): 13 | _fields_ = [("groupID", ctypes.c_int64), 14 | ("root", ctypes.c_int32), 15 | ("rank", ctypes.c_int32), 16 | ("nrank", ctypes.c_int32), 17 | ("nnode", ctypes.c_int32)] 18 | 19 | class NCCLCandidateWrapper: 20 | def __init__(self, lib_path=None): 21 | try: 22 | # get current file directory 23 | current_dir = os.path.dirname(os.path.abspath(__file__)) 24 | # get dynamic library path 25 | lib_path = os.path.join(current_dir, 
"../../build/libwrapper.so") 26 | self.lib = ctypes.CDLL(lib_path) 27 | print("Library loaded successfully") 28 | except OSError as e: 29 | print(f"Failed to load library: {e}") 30 | sys.exit(1) 31 | 32 | self.lib.ncclGetValidCandidatesWrapper.argtypes = [ 33 | ctypes.POINTER(GroupInfo), # pointer 34 | ctypes.POINTER(Pair), # pointer 35 | ctypes.c_size_t, 36 | ctypes.POINTER(ctypes.c_uint64), # pointer 37 | ctypes.c_bool, 38 | ctypes.POINTER(ctypes.POINTER(ctypes.c_int32)), # pointer of pointer 39 | ctypes.POINTER(ctypes.c_size_t) # pointer 40 | ] 41 | self.lib.ncclGetValidCandidatesWrapper.restype = None 42 | 43 | self.lib.freeCandidates.argtypes = [ctypes.POINTER(ctypes.c_int32)] 44 | self.lib.freeCandidates.restype = None 45 | 46 | def nccl_get_valid_candidates(self, nrank, nnode, coll, size, tunerEnvs, scale2): 47 | pairs = [(key.encode('utf-8'), value) for key, value in tunerEnvs.items()] 48 | pairCount = len(pairs) 49 | pairPtr = (Pair * pairCount)(*pairs) 50 | 51 | group_info = GroupInfo(groupID=0, root=0, rank=0, nrank=nrank, nnode=nnode) 52 | 53 | workloadElemPtr = (ctypes.c_uint64 * WORKLOAD_SIZE)(*[0, coll, size]) 54 | 55 | candidateElemPtr = ctypes.POINTER(ctypes.c_int32)() # allocate the pointer 56 | candidateElemCount = ctypes.c_size_t() 57 | 58 | self.lib.ncclGetValidCandidatesWrapper( 59 | ctypes.byref(group_info), # get the pointer 60 | pairPtr, 61 | pairCount, 62 | workloadElemPtr, 63 | scale2, 64 | ctypes.byref(candidateElemPtr), # pointer of pointer 65 | ctypes.byref(candidateElemCount) # get the pointer 66 | ) 67 | 68 | candidates = [] 69 | for i in range(int(candidateElemCount.value/CANDIDATE_SIZE)): 70 | candidate = [] 71 | for j in range(CANDIDATE_SIZE): 72 | candidate.append(candidateElemPtr[CANDIDATE_SIZE*i+j]) 73 | candidates.append(candidate) 74 | 75 | self.lib.freeCandidates(candidateElemPtr) 76 | 77 | return candidates 78 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v4.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V4_H_ 6 | #define NCCL_NET_V4_H_ 7 | 8 | #define NCCL_NET_HANDLE_MAXSIZE_V4 64 9 | 10 | typedef struct { 11 | char* name; // Used mostly for logging. 12 | char* pciPath; // Path to the PCI device in /sys. 13 | uint64_t guid; // Unique identifier for the NIC chip. Important for 14 | // cards with multiple PCI functions (Physical or virtual). 15 | int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA 16 | int speed; // Port speed in Mbps. 17 | int port; // Port number. 18 | int maxComms; // Maximum number of comms we can create 19 | } ncclNetProperties_v4_t; 20 | 21 | // v4 struct for backwards compatibility 22 | typedef struct { 23 | // Name of the network (mainly for logs) 24 | const char* name; 25 | // Initialize the network. 26 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 27 | // Return the number of adapters. 28 | ncclResult_t (*devices)(int* ndev); 29 | // Get various device properties. 30 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); 31 | // Create a receiving object and provide a handle to connect to it. The 32 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 33 | // between ranks to create a connection. 34 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 35 | // Connect to a handle and return a sending comm object for that peer. 36 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 37 | // Finalize connection establishment after remote peer has called connectHandle 38 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 39 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 40 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
41 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 42 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 43 | // Asynchronous send to a peer. 44 | // May return request == NULL if the call cannot be performed (or would block) 45 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 46 | // Asynchronous recv from a peer. 47 | // May return request == NULL if the call cannot be performed (or would block) 48 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 49 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 50 | // visible to the GPU 51 | ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); 52 | // Test whether a request is complete. If size is not NULL, it returns the 53 | // number of bytes sent/received. 54 | ncclResult_t (*test)(void* request, int* done, int* size); 55 | // Close and free send/recv comm objects 56 | ncclResult_t (*closeSend)(void* sendComm); 57 | ncclResult_t (*closeRecv)(void* recvComm); 58 | ncclResult_t (*closeListen)(void* listenComm); 59 | } ncclNet_v4_t; 60 | 61 | #endif // end include guard 62 | -------------------------------------------------------------------------------- /src/misc/argcheck.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "argcheck.h" 8 | #include "comm.h" 9 | 10 | static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { 11 | cudaPointerAttributes attr; 12 | cudaError_t err = cudaPointerGetAttributes(&attr, pointer); 13 | if (err != cudaSuccess || attr.devicePointer == NULL) { 14 | WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); 15 | return ncclInvalidArgument; 16 | } 17 | #if CUDART_VERSION >= 10000 18 | if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 19 | #else 20 | if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 21 | #endif 22 | WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); 23 | return ncclInvalidArgument; 24 | } 25 | return ncclSuccess; 26 | } 27 | 28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { 29 | if (ptr == NULL) { 30 | WARN("%s : %s argument is NULL", opname, ptrname); 31 | return ncclInvalidArgument; 32 | } 33 | return ncclSuccess; 34 | } 35 | 36 | ncclResult_t ArgsCheck(struct ncclInfo* info) { 37 | // First, the easy ones 38 | if (info->root < 0 || info->root >= info->comm->nRanks) { 39 | WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); 40 | return ncclInvalidArgument; 41 | } 42 | if (info->datatype < 0 || info->datatype >= ncclNumTypes) { 43 | WARN("%s : invalid type %d", info->opName, info->datatype); 44 | return ncclInvalidArgument; 45 | } 46 | // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
47 | NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); 48 | 49 | if (info->op < 0 || ncclMaxRedOp < info->op) { 50 | WARN("%s : invalid reduction operation %d", info->opName, info->op); 51 | return ncclInvalidArgument; 52 | } 53 | int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); 54 | if (ncclNumOps <= info->op && 55 | (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { 56 | WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); 57 | return ncclInvalidArgument; 58 | } 59 | 60 | if (info->comm->checkPointers) { 61 | if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { 62 | if (info->count >0) 63 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); 64 | } else { 65 | // Check CUDA device pointers 66 | if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { 67 | NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); 68 | } 69 | if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { 70 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); 71 | } 72 | } 73 | } 74 | return ncclSuccess; 75 | } 76 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/internal/database.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "src/include/datatype.h" 8 | #include "src/include/internal/logging.h" 9 | #include "src/include/timer.h" 10 | 11 | class Database { 12 | public: 13 | // TODO: reload from previous database 14 | RecordValue fetct(const RecordKey &key) { 15 | std::lock_guard lock(this->mutex); 16 | // pthread_mutex_lock(&this->mutex); 17 | if (table.count(key) == 0) { 18 | // pthread_mutex_unlock(&this->mutex); 19 | return {-1, 0}; 20 | } else { 21 | auto item = 
table[key]; 22 | // pthread_mutex_unlock(&this->mutex); 23 | return item; 24 | } 25 | } 26 | 27 | void add(const Record &record) { 28 | std::lock_guard lock(this->mutex); 29 | // pthread_mutex_lock(&this->mutex); 30 | if (table.count(record.key) == 0) { 31 | table[record.key] = record.value; 32 | } else { 33 | table[record.key] = 34 | RecordValue((table[record.key].duration * table[record.key].repeat + 35 | record.value.duration * record.value.repeat) / 36 | (table[record.key].repeat + record.value.repeat), 37 | (table[record.key].repeat + record.value.repeat)); 38 | } 39 | log.push_back(record); 40 | // pthread_mutex_unlock(&this->mutex); 41 | } 42 | std::string dataDebugStr(); 43 | std::string logDebugStr(); 44 | 45 | private: 46 | std::map table; 47 | std::vector log; 48 | std::mutex mutex; 49 | // pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 50 | }; 51 | 52 | std::string Database::dataDebugStr() { 53 | std::lock_guard lock(this->mutex); 54 | // pthread_mutex_lock(&this->mutex); 55 | std::stringstream ss; // NOLINT 56 | // the sorted records 57 | std::vector> sortedRecords; 58 | 59 | // group records by map 60 | std::map>> groupedRecords; 61 | for (const auto& entry : this->table) { 62 | groupedRecords[entry.first.workload].emplace_back(entry); 63 | } 64 | // sort each group by duration value 65 | for (auto& group : groupedRecords) { 66 | auto& records = group.second; 67 | std::sort(records.begin(), records.end(), [](const auto& a, const auto& b) { 68 | return a.second.duration < b.second.duration; 69 | }); 70 | // update sortedRecords 71 | sortedRecords.insert(sortedRecords.end(), records.begin(), records.end()); 72 | } 73 | 74 | for (const auto &pair : sortedRecords) { 75 | ss << pair.first.debugStr(); 76 | ss << pair.second.debugStr() << "\n"; 77 | } 78 | // pthread_mutex_unlock(&this->mutex); 79 | return ss.str(); 80 | } 81 | 82 | std::string Database::logDebugStr() { 83 | std::lock_guard lock(this->mutex); 84 | // pthread_mutex_lock(&this->mutex); 85 
| std::stringstream ss; // NOLINT 86 | for (const auto &record : log) { 87 | ss << record.debugStr() << "\n"; 88 | } 89 | // pthread_mutex_unlock(&this->mutex); 90 | return ss.str(); 91 | 92 | // TODO: chrome timeline 93 | } 94 | -------------------------------------------------------------------------------- /src/misc/tuner.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "debug.h" 13 | #include "nccl_tuner.h" 14 | 15 | pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; 16 | static int tunerPluginRefCount = -1; 17 | static void* tunerPluginLib = nullptr; 18 | ncclTuner_t* tunerSymbol = nullptr; 19 | 20 | __attribute__((visibility("default"))) ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) { 21 | // Initialize to nullptr by default if plugin tuner cannot be loaded. 
22 | *tuner = nullptr; 23 | if (tunerPluginRefCount == -2) return ncclSuccess; 24 | 25 | pthread_mutex_lock(&tunerPluginLock); 26 | if (tunerPluginRefCount == -1) { 27 | tunerPluginRefCount = -2; // Default: no plugin, don't try again later 28 | 29 | const char* name = getenv("NCCL_TUNER_PLUGIN"); 30 | if (name) { 31 | INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name); 32 | tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL); 33 | } 34 | if (tunerPluginLib == nullptr) { 35 | // dlopen does not guarantee to set errno, but dlerror only gives us a 36 | // string, so checking errno doesn't hurt to try to provide a better 37 | // error message 38 | if (errno == ENOENT) { 39 | INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name); 40 | } else { 41 | INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror()); 42 | } 43 | } else { 44 | tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL); 45 | if (tunerSymbol == nullptr) { 46 | INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name); 47 | dlclose(tunerPluginLib); 48 | tunerPluginLib = nullptr; 49 | } else { 50 | INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name); 51 | tunerPluginRefCount = 0; 52 | } 53 | } 54 | } 55 | 56 | if (tunerPluginRefCount >= 0) { 57 | *tuner = tunerSymbol; 58 | INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name); 59 | tunerPluginRefCount++; 60 | } 61 | pthread_mutex_unlock(&tunerPluginLock); 62 | return ncclSuccess; 63 | } 64 | 65 | __attribute__((visibility("default"))) ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) { 66 | if (*tuner == nullptr) return ncclSuccess; 67 | pthread_mutex_lock(&tunerPluginLock); 68 | if (--tunerPluginRefCount == 0) { 69 | if (tunerPluginLib == nullptr) { 70 | WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n"); 71 | } else { 72 | 
INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name); 73 | dlclose(tunerPluginLib); 74 | } 75 | tunerPluginLib = nullptr; 76 | tunerSymbol = nullptr; 77 | *tuner = nullptr; 78 | tunerPluginRefCount = -1; 79 | } 80 | pthread_mutex_unlock(&tunerPluginLock); 81 | return ncclSuccess; 82 | } -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDART 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); 18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); 19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); 20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); 21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); 22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); 23 | 24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) 25 | { 26 | #ifndef NVTX_DISABLE 27 | nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; 28 | if(local!=0) 29 | (*local)(device, name); 30 | #endif 
/*NVTX_DISABLE*/ 31 | } 32 | 33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) 34 | { 35 | #ifndef NVTX_DISABLE 36 | nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; 37 | if(local!=0) 38 | (*local)(device, name); 39 | #endif /*NVTX_DISABLE*/ 40 | } 41 | 42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) 43 | { 44 | #ifndef NVTX_DISABLE 45 | nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; 46 | if(local!=0) 47 | (*local)(stream, name); 48 | #endif /*NVTX_DISABLE*/ 49 | } 50 | 51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) 52 | { 53 | #ifndef NVTX_DISABLE 54 | nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; 55 | if(local!=0) 56 | (*local)(stream, name); 57 | #endif /*NVTX_DISABLE*/ 58 | } 59 | 60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) 61 | { 62 | #ifndef NVTX_DISABLE 63 | nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; 64 | if(local!=0) 65 | (*local)(event, name); 66 | #endif /*NVTX_DISABLE*/ 67 | } 68 | 69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) 70 | { 71 | #ifndef NVTX_DISABLE 72 | nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; 73 | if(local!=0) 74 | (*local)(event, name); 75 | #endif /*NVTX_DISABLE*/ 76 | } 77 | 78 | #ifdef __cplusplus 79 | } /* extern "C" */ 80 | #endif /* __cplusplus */ 81 | 82 | -------------------------------------------------------------------------------- 
/src/include/coll_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COLL_NET_H_ 8 | #define COLL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | 13 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 14 | 15 | // Translation to external API 16 | static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } 17 | static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } 18 | static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } 19 | static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } 20 | static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } 21 | static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } 22 | static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } 23 | /* DMA-BUF support */ 24 | static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* 
comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } 25 | static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } 26 | static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { 27 | NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } 28 | static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } 29 | static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } 30 | static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } 31 | static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } 32 | 33 | static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/include/nvtx.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NVTX_H_ 8 | #define NCCL_NVTX_H_ 9 | 10 | #include "nvtx3/nvtx3.hpp" 11 | 12 | #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) 13 | #define NVTX3_CONSTEXPR_IF_CPP14 constexpr 14 | #else 15 | #define NVTX3_CONSTEXPR_IF_CPP14 16 | #endif 17 | 18 | // Define all NCCL-provided static schema IDs here (avoid duplicates). 19 | #define NVTX_SID_CommInitRank 0 20 | #define NVTX_SID_CommInitAll 1 21 | #define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank 22 | #define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank 23 | #define NVTX_SID_AllGather 4 24 | #define NVTX_SID_AllReduce 5 25 | #define NVTX_SID_Broadcast 6 26 | #define NVTX_SID_ReduceScatter 7 27 | #define NVTX_SID_Reduce 8 28 | #define NVTX_SID_Send 9 29 | #define NVTX_SID_Recv 10 30 | 31 | // Define static schema ID for the reduction operation. 
32 | #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START 33 | 34 | extern const nvtxDomainHandle_t ncclNvtxDomainHandle; 35 | 36 | struct nccl_domain{static constexpr char const* name{"NCCL"};}; 37 | 38 | class payload_schema { 39 | public: 40 | explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept 41 | { 42 | schema_attr.name = schemaName; 43 | schema_attr.entries = entries; 44 | schema_attr.numEntries = numEntries; 45 | schema_attr.schemaId = schemaId; 46 | nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); 47 | } 48 | 49 | payload_schema() = delete; 50 | ~payload_schema() = default; 51 | payload_schema(payload_schema const&) = default; 52 | payload_schema& operator=(payload_schema const&) = default; 53 | payload_schema(payload_schema&&) = default; 54 | payload_schema& operator=(payload_schema&&) = default; 55 | 56 | private: 57 | nvtxPayloadSchemaAttr_t schema_attr{ 58 | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | 59 | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | 60 | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | 61 | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | 62 | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, 63 | nullptr, 64 | NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, 65 | NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 66 | nullptr, 0, 0, 0}; 67 | }; 68 | 69 | // Create NVTX push/pop range with parameters 70 | // @param name of the operation (see `NVTX_SID_*`) 71 | // @param N schema name 72 | // @param S schema (entries) 73 | // @param P payload (struct) 74 | #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ 75 | static const payload_schema schema{S, std::extent::value, \ 76 | NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ 77 | static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ 78 | nvtxPayloadData_t nvtx3_bpl__[] = { \ 79 | {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ 80 | 
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ 81 | ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; 82 | 83 | extern void initNvtxRegisteredEnums(); 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /ext-tuner/example/utils/get_candidates.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import argparse 5 | 6 | current_dir = os.path.dirname(os.path.abspath(__file__)) 7 | wrapper_dir = os.path.join(current_dir, "../plugin/cuda/") 8 | sys.path.append(wrapper_dir) 9 | from wrapper import NCCLCandidateWrapper 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--nrank", type=int, default=8) 14 | parser.add_argument("--nnode", type=int, default=1) 15 | parser.add_argument("--coll", type=str, default="AllReduce") 16 | parser.add_argument("--size", type=int, default=1024, help="nbyte per rank") 17 | parser.add_argument("--expire", type=int, default=10, help="expire per candidate, -1 is forever") 18 | parser.add_argument("--scale2", action='store_true', default=False, help="") 19 | args = parser.parse_args() 20 | coll_name = f"nnode{args.nnode}_nrank{args.nrank}_coll{args.coll}_size{args.size}" 21 | coll_to_key = { 22 | "Broadcast" : 0, 23 | "Reduce" : 1, 24 | "AllGather" : 2, 25 | "ReduceScatter" : 3, 26 | "AllReduce" : 4, 27 | "SendRecv" : 5, 28 | "All2All" : 9 29 | } 30 | 31 | wrapper = NCCLCandidateWrapper() 32 | # p2plevel=7 by default in NCCL 33 | # gdrlevel=4 by default in NCCL 34 | map_dict = { 35 | "tuner_extraP2PCE": 0, 36 | "tuner_extraSHM": 1, 37 | "tuner_p2pLevel": -1, 38 | # set tuner_p2pChunkSize=512*1024 for nvlink-1-node 39 | "tuner_p2pChunkSize": 128*1024, 40 | "tuner_p2pnChannelsPerPeer": 128, 41 | "tuner_p2pnChannels": 128, 42 | "tuner_nChannels": 128, 43 | "tuner_treeupdown_allreduce_simple": 0 44 | } 45 | 
candidates = wrapper.nccl_get_valid_candidates( 46 | nrank=args.nrank, 47 | nnode=args.nnode, 48 | coll=coll_to_key[args.coll], 49 | size=args.size, 50 | tunerEnvs=map_dict, 51 | scale2=args.scale2) 52 | 53 | count_after_filter = 0 54 | file = open(f"{coll_name}.txt", 'w') 55 | print(f"writting {coll_name}.txt") 56 | last_output = None 57 | for candidate in candidates: 58 | # you can filter the candidate here 59 | algo, proto, copytype, p2plevel, nc, nt, chunksize, _, _, _ = candidate 60 | '''example 61 | if algo != 1: 62 | continue 63 | if proto != 2: 64 | continue 65 | if nc & (nc - 1) != 0: 66 | continue 67 | if nt & (nt - 1) != 0: 68 | continue 69 | if chunksize & (chunksize - 1) != 0: 70 | continue 71 | ''' 72 | count_after_filter += 1 73 | specific_candidate = f"{args.nrank}; -1 {coll_to_key[args.coll]} {args.size};{' '.join(map(str, candidate))}" 74 | output = f"{specific_candidate};{args.expire}" 75 | last_output = f"{specific_candidate};-1" 76 | file.write(output + '\n') 77 | file.write(last_output + '\n') 78 | file.close() 79 | 80 | print(f"The communication: nrank={args.nrank}, nnode={args.nnode}, coll={args.coll}, sizePerRank={args.size},\n" 81 | f"tunerEnvs={map_dict},\n" 82 | f"scale2={args.scale2}, has {len(candidates)} candidates, after filter: {count_after_filter} candidates.") 83 | 84 | print(f"set environment variable TUNER_PROFILE_MORE={count_after_filter*args.expire},\n" 85 | f"then call {count_after_filter*args.expire} times {coll_name},\n" 86 | f"add more repeats will be safer.") 87 | -------------------------------------------------------------------------------- /ext-tuner/example/src/include/optimizers/optimizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "src/include/jobs/job.h" 11 | #include "src/include/tuner.h" 12 | #include "src/include/internal/env.h" 13 | 
#include "src/misc/math.h" 14 | 15 | class Tuner; 16 | 17 | class Optimizer { 18 | public: 19 | explicit Optimizer(const CandidateFunc &getValidCandidatesFunc) 20 | : getValidCandidatesFunc(getValidCandidatesFunc) { 21 | initEnv(); 22 | } 23 | void initialize(Tuner *tuner) { this->tuner = tuner; } 24 | virtual void 25 | addJob(const Info &myInfo, 26 | const std::unordered_map &allGroupInfos, 27 | const GIDTYPE &groupID, const Workload &workload, 28 | Candidate &startCandidate) = 0; 29 | // TODO: multithread, workload-specific 30 | virtual void run(const std::map &status, 31 | const std::vector &records, 32 | ResultPack *resultPack) = 0; 33 | virtual void pick(const std::map &status, 34 | DecisionPack *decisionPack) = 0; 35 | 36 | std::string debugStr() const { 37 | std::stringstream ss; // NOLINT 38 | ss << "mode=" << Environment::get()->find("TUNER_MODE", "-1") << "\n"; 39 | ss << "warmupSteps=" << warmupSteps << "\n"; 40 | ss << "pretrainSteps=" << pretrainSteps << "\n"; 41 | ss << "trainSteps=" << trainSteps << "\n"; 42 | ss << "roundMaxSteps=" << roundMaxSteps << "\n"; 43 | ss << "optimumExpireSteps=" << optimumExpireSteps << "\n"; 44 | ss << "expireSteps=" << expireSteps << "\n"; 45 | std::lock_guard lock(this->jobsMutex); 46 | for (auto &pair : this->jobs) { 47 | const auto &workload = pair.first; 48 | const auto &job = pair.second; 49 | ss << job->debugStr() << "\n";; 50 | } 51 | return ss.str(); 52 | } 53 | 54 | protected: 55 | mutable std::mutex jobsMutex; 56 | // pthread_mutex_t jobsMutex = PTHREAD_MUTEX_INITIALIZER; 57 | std::map> jobs; 58 | Tuner *tuner; 59 | CandidateFunc getValidCandidatesFunc; 60 | int32_t warmupSteps; 61 | int32_t pretrainSteps; 62 | int32_t trainSteps; 63 | int32_t roundMaxSteps; 64 | int32_t optimumExpireSteps; 65 | int32_t expireSteps; 66 | 67 | private: 68 | void initEnv(); 69 | }; 70 | 71 | void Optimizer::initEnv() { 72 | warmupSteps = Environment::get()->find("TUNER_WARMUP_STEPS", 5); 73 | CHECK(warmupSteps >= 1); 74 | 
pretrainSteps = Environment::get()->find("TUNER_PRETRAIN_STEPS", 20); 75 | CHECK(pretrainSteps >= 1); 76 | trainSteps = Environment::get()->find("TUNER_TRAIN_STEPS", 50); 77 | CHECK(trainSteps >= 1); 78 | roundMaxSteps = Environment::get()->find("TUNER_ROUND_MAX_STEPS", 10); 79 | CHECK(roundMaxSteps >= 1); 80 | optimumExpireSteps = Environment::get()->find("TUNER_OPTIMUM_EXPIRE", 10); 81 | expireSteps = Environment::get()->find("TUNER_PROFILE_REPEAT", 1); 82 | CHECK(pretrainSteps + trainSteps >= expireSteps); 83 | CHECK(expireSteps > 0); 84 | INFO(Logger::LogSubSys::OPTIMIZER) << "set warmupSteps=" << warmupSteps 85 | << " pretrainSteps=" << pretrainSteps << " trainSteps=" << trainSteps 86 | << " roundMaxSteps=" << roundMaxSteps 87 | << " optimumExpireSteps=" << optimumExpireSteps 88 | << " expireSteps=" << expireSteps; 89 | } 90 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #define NVTX_EXT_IMPL_GUARD 14 | #include "nvtxExtImpl.h" 15 | #undef NVTX_EXT_IMPL_GUARD 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ 22 | NAME##_v##VERSION##_mem##COMPATID 23 | #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ 24 | NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) 25 | #define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ 26 | NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD) 27 | 28 | /* 29 | * Function slots for the binary payload extension. First entry is the module 30 | * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). 31 | */ 32 | NVTX_LINKONCE_DEFINE_GLOBAL intptr_t 33 | NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1] 34 | = {0}; 35 | 36 | NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() 37 | { 38 | intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; 39 | nvtxExtModuleSegment_t segment = { 40 | 0, // unused (only one segment) 41 | NVTX3EXT_CBID_PAYLOAD_FN_NUM, 42 | fnSlots 43 | }; 44 | 45 | nvtxExtModuleInfo_t module = { 46 | NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), 47 | NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD, 48 | 1, &segment, // number of segments, segments 49 | NULL, // no export function needed 50 | // bake type sizes and alignment information into program binary 51 | &nvtxExtPayloadTypeInfo 52 | }; 53 | 54 | NVTX_INFO( "%s\n", __FUNCTION__ ); 55 | 56 | NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, 57 | NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); 58 | } 59 | 60 | #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ 61 | typedef ret_val ( * fn_name##_impl_fntype )signature; \ 62 | NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \ 63 | intptr_t slot = 
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 64 | if (slot != NVTX_EXTENSION_DISABLED) { \ 65 | if (slot) { \ 66 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 67 | } else { \ 68 | NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ 69 | slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 70 | if (slot != NVTX_EXTENSION_DISABLED && slot) { \ 71 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 72 | } \ 73 | } \ 74 | } \ 75 | return ((ret_val)(intptr_t)-1); \ 76 | } 77 | 78 | NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr)) 79 | 80 | NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr)) 81 | 82 | #undef NVTX_EXT_FN_IMPL 83 | 84 | #ifdef __cplusplus 85 | } /* extern "C" */ 86 | #endif /* __cplusplus */ -------------------------------------------------------------------------------- /ext-tuner/example/plugin/cuda/plugin.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_TUNER_H_ 9 | #define NCCL_TUNER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "nccl.h" 15 | #include "src/cuda/nccl_params.h" 16 | 17 | // API to be implemented by external tuner 18 | struct ncclTuner_v1_t { 19 | // Name of the tuner 20 | const char *name; 21 | 22 | // Initializes tuner states. 23 | // nRanks: number of ranks in current communicator. Each communicator 24 | // initialize its own tuner. 
nNodes: number of nodes in current 25 | // communicator. logFunction: a logFunction can be useful to integrate 26 | // logging together with NCCL core. 27 | ncclResult_t (*init)(uint64_t commHash, size_t nRanks, size_t nNodes, 28 | size_t rank, size_t node, size_t device, std::map tunerEnvs, 29 | void *handler); 30 | 31 | // Gets info (algo, protocol, number of ctas and threads) for a given 32 | // collective. Inputs: 33 | // - collType: collective type , e.g., allreduce, allgather… 34 | // - nBytes: collective size in bytes 35 | // - collNetSupport: whether collnet supports this type 36 | // - nvlsSupport: whether nvlink sharp supports this time 37 | // - numPipeOps: number of operations in the group 38 | // 39 | // Outputs: 40 | // - algorithm: selected algorithm to be used for the given collective 41 | // - protocol: selected protocol to be used for the given collective 42 | // - nChannels: number of channels (hence SMs) to be used. 43 | // 44 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 45 | // default tuning for the given collective. 46 | // Also, the plugin is allowed to not set any output, or set only the 47 | // algorithm and protocol, but not only the algorithm or only the protocol. 48 | // Unset fields will be set automatically by NCCL. 49 | ncclResult_t (*getCandidate)(uint64_t commHash, ncclFunc_t collType, 50 | size_t nBytes, int *algorithm, int *protocol, 51 | int *isCopyEngineNotSmCopy, int *p2pLevel, int *nChannels, 52 | int *nThreads, int *chunkSize, int *iteration, 53 | int *lastIterEffectiveChunksize, int *native); 54 | 55 | // Terminates the plugin and cleans up any resources that the plugin 56 | // allocated. 
57 | ncclResult_t (*destroy)(uint64_t commHash); 58 | // Profiles the communication 59 | ncclResult_t (*startProfiling)(uint64_t commHash, cudaStream_t stream, 60 | ncclFunc_t collType, size_t nBytes, 61 | int algorithm, int protocol, 62 | int isCopyEngineNotSmCopy, int p2pLevel, int nChannels, 63 | int nThreads, int chunkSize, int iteration, 64 | int lastIterEffectiveChunksize, int native); 65 | ncclResult_t (*stopProfiling)(uint64_t commHash); 66 | ncclResult_t (*isNewWorkload)(uint64_t commHash, ncclFunc_t collType, size_t nBytes, bool* flag); 67 | // Workload workload; 68 | // Candidate candidate; 69 | }; 70 | 71 | using ncclTuner_t = ncclTuner_v1_t; 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_SYNC 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
#endif


#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* Function-pointer types mirroring the public NVTX sync API. The versioned
 * globals struct holds one slot per entry point; an attached tools backend
 * fills the slots at injection time. */
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);

/* Create a user-defined synchronization object. Forwards to the injected
 * implementation when one is attached; otherwise returns a null handle. */
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    if(local!=0)
        return (*local)(domain, attribs);
    else
#endif /*NVTX_DISABLE*/
        return (nvtxSyncUser_t)0;
}

/* Destroy a sync object. No-op when no tools backend is attached. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

/* Signal the start of an acquire attempt on the sync object. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

NVTX_DECLSPEC void NVTX_API
nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    /* Signal that the acquire attempt did not obtain the object. */
    nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

/* Signal that the acquire attempt succeeded. No-op without a backend. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

/* Signal that the sync object is about to be released. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if(local!=0)
        (*local)(handle);
#endif /*NVTX_DISABLE*/
}

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v6.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

#ifndef NCCL_NET_V6_H_
#define NCCL_NET_V6_H_

/* Per-device properties reported by a v6 network plugin. */
typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
  uint64_t guid;  // Unique identifier for the NIC chip. Important for
                  // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;      // Port speed in Mbps.
  int port;       // Port number.
  float latency;  // Network latency
  int maxComms;   // Maximum number of comms we can create
  int maxRecvs;   // Maximum number of grouped receives.
}ncclNetProperties_v6_t;

typedef ncclNetProperties_v6_t ncclNetProperties_t;

/* v6 network plugin interface: a table of function pointers NCCL calls to
 * drive an external transport. */
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  // NOTE(review): in the v6 ABI the registration size is a plain int;
  // the DMA-BUF variant below takes size_t.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;

#endif // end include guard
--------------------------------------------------------------------------------
/src/include/nccl_tuner.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_

// NOTE(review): the targets of the next two #include directives were stripped
// in this copy of the file (angle-bracket contents lost during extraction);
// restore them (likely <cstdint>/<map> or similar) before building.
#include
#include
#include "nccl.h"
#include "devcomm.h"

// API to be implemented by external tuner
typedef struct {
  // Name of the tuner
  const char* name;

  // Initializes tuner states.
  // nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
  // nNodes: number of nodes in current communicator.
  // logFunction: a logFunction can be useful to integrate logging together with NCCL core.
  // NOTE(review): std::map's template arguments appear stripped here —
  // presumably std::map<std::string, std::string> for tunerEnvs; confirm
  // against the plugin-side declaration before building.
  ncclResult_t (*init)(uint64_t commHash, size_t nRanks, size_t nNodes, size_t rank, size_t node, size_t device, std::map tunerEnvs, void* handler);

  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
  // Inputs:
  //   - collType: collective type , e.g., allreduce, allgather…
  //   - nBytes: collective size in bytes
  //   - collNetTypeSupport: whether collnet supports this type
  //   - nvlsTypeSupport: whether nvlink sharp supports this time
  //   - numPipeOps: number of operations in the group
  //
  // Outputs:
  //   - algorithm: selected algorithm to be used for the given collective
  //   - protocol: selected protocol to be used for the given collective
  //   - nChannels: number of channels (hence SMs) to be used.
  //
  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
  // default tuning for the given collective.
  // Also, the plugin is allowed to not set any output, or set only the
  // algorithm and protocol, but not only the algorithm or only the protocol.
  // Unset fields will be set automatically by NCCL.
45 | ncclResult_t (*getCandidate)(uint64_t commHash, ncclFunc_t collType, size_t nBytes, 46 | int *algorithm, int *protocol, int* isCopyEngineNotSmCopy, int* p2pLevel, 47 | int* nChannels, int* nThreads, 48 | int* chunkSize, int* iteration, int* lastIterEffectiveChunksize, int* native); 49 | 50 | // Terminates the plugin and cleans up any resources that the plugin allocated. 51 | ncclResult_t (*destroy)(uint64_t commHash); 52 | // Profiles the communication 53 | ncclResult_t (*startProfiling)(uint64_t commHash, cudaStream_t stream, ncclFunc_t collType, size_t nBytes, 54 | int algorithm, int protocol, int isCopyEngineNotSmCopy, int p2pLevel, 55 | int nChannels, int nThreads, 56 | int chunkSize, int iteration, int lastIterEffectiveChunksize, int native); 57 | ncclResult_t (*stopProfiling)(uint64_t commHash); 58 | ncclResult_t (*isNewWorkload)(uint64_t commHash, ncclFunc_t collType, size_t nBytes, bool* flag); 59 | } ncclTuner_v1_t; 60 | 61 | typedef ncclTuner_v1_t ncclTuner_t; 62 | 63 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" 64 | 65 | #define TUNER_P2PCHUNKSIZE 128 * 1024 66 | #define SIMPLE_P2PCHUNKSIZE_UPPER_BOUND 128*1024 67 | #define LL128_P2PCHUNKSIZE_UPPER_BOUND 128*1024 68 | #define LL_P2PCHUNKSIZE_UPPER_BOUND 80*1024 69 | static_assert(SIMPLE_P2PCHUNKSIZE_UPPER_BOUND <= TUNER_P2PCHUNKSIZE); 70 | static_assert(LL128_P2PCHUNKSIZE_UPPER_BOUND <= TUNER_P2PCHUNKSIZE); 71 | static_assert(LL_P2PCHUNKSIZE_UPPER_BOUND <= TUNER_P2PCHUNKSIZE); 72 | ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv); 73 | #endif 74 | -------------------------------------------------------------------------------- /ext-tuner/example/src/cuda/nccl_socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_SOCKET_H_
#define NCCL_SOCKET_H_

// NOTE(review): the targets of the next six #include directives were stripped
// in this copy of the file (angle-bracket contents lost during extraction);
// restore the system headers (sockets/netdb/stdint etc.) before building.
#include
#include
#include
#include
#include
#include
#include "nccl.h"

#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES \
  2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES \
  3 // connection timed out retry times (each one can take 20s)
#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL

/* Common socket address storage structure for IPv4/IPv6 */
union ncclSocketAddress {
  struct sockaddr sa;
  struct sockaddr_in sin;
  struct sockaddr_in6 sin6;
};

/* Lifecycle states a socket moves through from init to close/error. */
enum ncclSocketState {
  ncclSocketStateNone = 0,
  ncclSocketStateInitialized = 1,
  ncclSocketStateAccepting = 2,
  ncclSocketStateAccepted = 3,
  ncclSocketStateConnecting = 4,
  ncclSocketStateConnectPolling = 5,
  ncclSocketStateConnected = 6,
  ncclSocketStateReady = 7,
  ncclSocketStateClosed = 8,
  ncclSocketStateError = 9,
  ncclSocketStateNum = 10
};

/* Role of the socket, used to distinguish bootstrap/proxy/net traffic. */
enum ncclSocketType {
  ncclSocketTypeUnknown = 0,
  ncclSocketTypeBootstrap = 1,
  ncclSocketTypeProxy = 2,
  ncclSocketTypeNetSocket = 3,
  ncclSocketTypeNetIb = 4
};

/* A single socket plus its retry counters, peer address and state. */
struct ncclSocket {
  int fd;                       // connected/data fd
  int acceptFd;                 // listening fd used while accepting
  int timedOutRetries;          // retries spent on timeouts so far
  int refusedRetries;           // retries spent on ECONNREFUSED so far
  union ncclSocketAddress addr; // peer (or bind) address
  volatile uint32_t *abortFlag; // set externally to abort blocking calls
  int asyncFlag;                // non-blocking mode when non-zero
  enum ncclSocketState state;
  int salen;                    // valid length of addr
  uint64_t magic;               // handshake magic (NCCL_SOCKET_MAGIC)
  enum ncclSocketType type;
};

// Initialize a socket
ncclResult_t ncclSocketInit(struct ncclSocket *sock,
                            union ncclSocketAddress
*addr = NULL,
                            uint64_t magic = NCCL_SOCKET_MAGIC,
                            enum ncclSocketType type = ncclSocketTypeUnknown,
                            volatile uint32_t *abortFlag = NULL,
                            int asyncFlag = 0);
// Connect to sock->addr. sock->fd is set after a successful call.
ncclResult_t ncclSocketConnect(struct ncclSocket *sock);
// Accept an incoming connection from listenSock->fd and keep the file
// descriptor in sock->fd, with the remote side IP/port in sock->addr.
ncclResult_t ncclSocketAccept(struct ncclSocket *sock,
                              struct ncclSocket *ulistenSock);

/* Direction tags for socket progress calls. */
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1

ncclResult_t ncclSocketSend(struct ncclSocket *sock, void *ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket *sock, void *ptr, int size);
ncclResult_t ncclSocketTryRecv(struct ncclSocket *sock, void *ptr, int size,
                               int *closed, bool blocking);
ncclResult_t ncclSocketClose(struct ncclSocket *sock);

/* Linked-list node for connections that arrived before being expected. */
struct unexConn {
  int peer;
  int tag;
  struct ncclSocket sock;
  struct unexConn *next;
};

/* Bootstrap ring state: listen socket, ring neighbors, peer addresses. */
struct bootstrapState {
  struct ncclSocket listenSock;
  struct ncclSocket ringRecvSocket;
  struct ncclSocket ringSendSocket;
  union ncclSocketAddress *peerCommAddresses;
  union ncclSocketAddress *peerProxyAddresses;
  struct unexConn *unexpectedConnections;
  int cudaDev;
  int rank;
  int nranks;
  uint64_t magic;
  volatile uint32_t *abortFlag;
};

#endif
--------------------------------------------------------------------------------
/ext-tuner/example/example/cuda/pytorch/demo.py:
--------------------------------------------------------------------------------
import torch
import torch.distributed as dist
import argparse
import os
import datetime
import sys
import numpy as np

def profiling(args, nBytes, repeat):
    # Time `repeat` all_reduce calls of nBytes and print the bandwidth.
    data_type = torch.float32
    data_type_size =
torch.finfo(data_type).bits // 8

    # Nothing to measure for zero repeats or sub-element sizes.
    if repeat == 0 or nBytes//data_type_size == 0:
        return
    comm_begin_event = torch.cuda.Event(enable_timing=True)
    comm_end_event = torch.cuda.Event(enable_timing=True)

    tensor = torch.randn([1, nBytes//data_type_size], dtype=data_type).to(args.local_rank)
    # First pass: `repeat` iterations, during which the tuner may still be training.
    comm_begin_event.record()
    for i in range(repeat):
        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    comm_end_event.record()
    comm_end_event.synchronize()  # blocking CPU thread until the event completes.
    comm_gpu_time = comm_begin_event.elapsed_time(comm_end_event)/repeat
    GB_s = nBytes*1.0/1024/1024/1024/(comm_gpu_time/1000)
    if args.rank == 0:
        print(f"avged by repeat: {repeat:>5}, bytes={nBytes:>10}, nrank={args.world_size:>4}: {GB_s:>5.2f} GB/s")

    # Second pass: fixed 100 iterations, measured after pretuning.
    comm_begin_event.record()
    for i in range(100):
        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    comm_end_event.record()
    comm_end_event.synchronize()  # blocking CPU thread until the event completes.
34 | comm_gpu_time = comm_begin_event.elapsed_time(comm_end_event)/100 35 | GB_s = nBytes*1.0/1024/1024/1024/(comm_gpu_time/1000) 36 | if args.rank == 0: 37 | print(f"after pretuning, avged by repeat 100, bytes={nBytes:>10}, nrank={args.world_size:>4}: {GB_s:>5.2f} GB/s") 38 | 39 | torch.cuda.synchronize(args.local_rank) 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--rank", type=int, default=0, help="global rank") 44 | parser.add_argument("--local_rank", type=int, default=0, help="local rank") 45 | parser.add_argument("--world_size", type=int, default=1, help="world size") 46 | parser.add_argument("--master_ip", type=str, default="localhost", help="master ip") 47 | parser.add_argument("--master_port", type=str, default="6000", help="master port") 48 | 49 | args = parser.parse_args() 50 | if os.getenv('OMPI_COMM_WORLD_SIZE') is not None: # for mpirun 51 | args.rank = int(os.environ.get('OMPI_COMM_WORLD_RANK', args.rank)) 52 | args.local_rank = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', args.local_rank)) 53 | args.world_size = int(os.environ.get('OMPI_COMM_WORLD_SIZE', args.world_size)) 54 | os.environ['MASTER_ADDR'] = str(args.master_ip) 55 | os.environ['MASTER_PORT'] = str(args.master_port) 56 | elif os.getenv('WORLD_SIZE') is not None: # for torchrun 57 | args.rank = int(os.environ.get('RANK', args.rank)) 58 | args.local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank)) 59 | args.world_size = int(os.environ.get('WORLD_SIZE', args.world_size)) 60 | args.master_ip = str(os.environ.get('MASTER_ADDR', args.master_ip)) 61 | args.master_port = str(os.environ.get('MASTER_PORT', args.master_port)) 62 | 63 | torch.cuda.set_device(args.local_rank) 64 | 65 | tuning_steps = 5 # for warmup 66 | tuning_steps += 5 # for native 67 | tuning_steps += int(os.environ.get('TUNER_PRETRAIN_STEPS')) 68 | tuning_steps += int(os.environ.get('TUNER_TRAIN_STEPS')) 69 | # to avoid the remaing round 70 | tuning_steps += 5 
# in case 71 | 72 | if args.rank == 0: 73 | os.environ['TUNER_ROLE'] = "COORDINATOR" 74 | 75 | if args.world_size > 1: 76 | init_method = 'tcp://' 77 | init_method += args.master_ip + ':' + args.master_port 78 | torch.distributed.init_process_group(backend='nccl', 79 | rank=args.rank, 80 | world_size=args.world_size, 81 | init_method=init_method, 82 | timeout=datetime.timedelta(seconds=14400000)) 83 | 84 | for nBytes in [1024*(2**i) for i in range(19)]: 85 | profiling(args, nBytes, tuning_steps) 86 | -------------------------------------------------------------------------------- /src/collectives/device/broadcast.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 14 | const int tid = threadIdx.x; 15 | const int nthreads = args->nWarps*WARP_SIZE; 16 | const int bid = args->bid; 17 | const int nChannels = args->nChannels; 18 | ncclRing *ring = &ncclShmem.channel.ring; 19 | ssize_t chunkSize; 20 | if (args->native) { 21 | chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); 22 | } else { 23 | chunkSize = args->effectiveChunkSize; // effective chunksize 24 | } 25 | ssize_t minChunkSize; 26 | if (Proto::Id == NCCL_PROTO_LL) 27 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); 28 | if (Proto::Id == NCCL_PROTO_LL128) { 29 | // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 
30 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); // minChunkSizeLL128 31 | } 32 | 33 | const ssize_t loopSize = nChannels*chunkSize; 34 | const ssize_t size = args->count; 35 | const int rank = ring->userRanks[0]; 36 | const int nextRank = ring->userRanks[1]; 37 | const int root = args->root; 38 | 39 | T *inputBuf = (T*)args->sendbuff; 40 | T *outputBuf = (T*)args->recvbuff; 41 | Primitives, 0, Proto, 0> 42 | prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, 0, 0, args->transportIndex); 43 | 44 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 45 | ssize_t realChunkSize; 46 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 47 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 48 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 49 | } 50 | else if (Proto::Id == NCCL_PROTO_LL) { 51 | if (args->native) { 52 | realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; 53 | } else { 54 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize); 55 | } 56 | } 57 | else if (Proto::Id == NCCL_PROTO_LL128) { 58 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize); 59 | } 60 | realChunkSize = int(realChunkSize); 61 | 62 | ssize_t offset = gridOffset + int(bid*realChunkSize); 63 | int nelem = min(realChunkSize, size-offset); 64 | 65 | if (rank == root) { 66 | if (inputBuf == outputBuf) { 67 | prims.send(offset, nelem); 68 | } else { 69 | prims.copySend(offset, offset, nelem); 70 | } 71 | } else if (nextRank == root) { 72 | prims.recv(offset, nelem); 73 | } else { 74 | prims.recvCopySend(offset, nelem); 75 | } 76 | } 77 | } 78 | } 79 | 80 | template 81 | struct RunWorkElement { 82 | __device__ __forceinline__ void run(ncclWorkElem *args) { 83 | using Proto = ProtoSimple; 84 | runRing(args); 85 | } 86 | }; 87 | 88 | template 89 | struct RunWorkElement { 
90 | __device__ __forceinline__ void run(ncclWorkElem *args) { 91 | runRing(args); 92 | } 93 | }; 94 | 95 | template 96 | struct RunWorkElement { 97 | __device__ __forceinline__ void run(ncclWorkElem *args) { 98 | runRing(args); 99 | } 100 | }; 101 | -------------------------------------------------------------------------------- /src/graph/trees.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "nccl.h" 8 | 9 | #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank) 10 | 11 | /* Btree which alternates leaves and nodes. 12 | * Assumes root is 0, which conveniently builds a tree on powers of two, 13 | * (because we have pow2-1 ranks) which lets us manipulate bits. 14 | * Find first non-zero bit, then : 15 | * Find the parent : 16 | * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) 17 | * xx11[0] -> xx10[0] (3,7,11 below) 18 | * Find the children : 19 | * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) 20 | * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) 21 | * 22 | * Illustration : 23 | * 0---------------8 24 | * ______/ \______ 25 | * 4 12 26 | * / \ / \ 27 | * 2 6 10 \ 28 | * / \ / \ / \ \ 29 | * 1 3 5 7 9 11 13 30 | */ 31 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { 32 | int up, down0, down1; 33 | int bit; 34 | for (bit=1; bit 0 so it has to be our child 1, not 0. 42 | *d1 = nranks > 1 ? 
bit >> 1 : -1;
    return ncclSuccess;
  }

  // Parent: flip our lowest set bit and set the next one up...
  up = (rank ^ bit) | (bit << 1);
  // if smaller than the parent, we are his first child, otherwise we're his second
  if (up >= nranks) up = (rank ^ bit);
  *parentChildType = (rank < up) ? 0 : 1;
  *u = up;

  int lowbit = bit >> 1;
  // down0 is always within bounds
  down0 = lowbit == 0 ? -1 : rank-lowbit;

  down1 = lowbit == 0 ? -1 : rank+lowbit;
  // Make sure down1 is within bounds
  // (shrink the offset until rank+lowbit fits; first iteration recomputes with
  // the unchanged lowbit, matching upstream behavior).
  while (down1 >= nranks) {
    down1 = lowbit == 0 ? -1 : rank+lowbit;
    lowbit >>= 1;
  }
  *d0 = down0; *d1 = down1;

  return ncclSuccess;
}

/* Build a double binary tree. Take the previous tree for the first tree.
 * For the second tree, we use a mirror tree (if nranks is even)
 *
 * 0---------------8                   3----------------11
 *          ______/ \                 / \______
 *         4         \               /         7
 *       /   \        \             /        /   \
 *     2       6      10           1        5     9
 *    / \     / \    /  \         / \      / \   / \
 *   1   3   5   7  9   11       0   2    4   6 8   10
 *
 * or shift it by one rank (if nranks is odd).
 *
 * 0---------------8            1---------------9
 *          ______/ \______              ______/ \______
 *         4               12           5               0
 *       /   \            /           /   \            /
 *     2       6       10           3       7       11
 *    / \     / \     /  \         / \     / \     /  \
 *   1   3   5   7   9   11       2   4   6   8  10   12
 */
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
  // First tree ... use a btree
  ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
  // Second tree ... mirror or shift
  if (nranks % 2 == 1) {
    // shift: compute the btree for rank-1, then shift all results back by +1.
    int shiftrank = (rank-1+nranks) % nranks;
    int u, d0, d1;
    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
    *s1 = u == -1 ? -1 : (u+1) % nranks;
    *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
    *d1_1 = d1 == -1 ?
-1 : (d1+1) % nranks;
  } else {
    // mirror: compute the btree for the mirrored rank, then mirror results.
    int u, d0, d1;
    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
    *s1 = u == -1 ? -1 : nranks-1-u;
    *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
    *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
  }
  return ncclSuccess;
}
--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h:
--------------------------------------------------------------------------------
/*
 * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
 *
 * Licensed under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#ifndef NVTX_IMPL_GUARD_CUDA
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif


#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* Function-pointer types mirroring the CUDA resource-naming API; slots are
 * filled by an injected NVTX backend. */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);

/* Name a CUDA device (ASCII). No-op when no tools backend is attached. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
| nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; 31 | if(local!=0) 32 | (*local)(device, name); 33 | #endif /*NVTX_DISABLE*/ 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; 40 | if(local!=0) 41 | (*local)(device, name); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; 49 | if(local!=0) 50 | (*local)(context, name); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; 58 | if(local!=0) 59 | (*local)(context, name); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; 67 | if(local!=0) 68 | (*local)(stream, name); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; 76 | if(local!=0) 77 | (*local)(stream, name); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 
| 81 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) 82 | { 83 | #ifndef NVTX_DISABLE 84 | nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; 85 | if(local!=0) 86 | (*local)(event, name); 87 | #endif /*NVTX_DISABLE*/ 88 | } 89 | 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) 91 | { 92 | #ifndef NVTX_DISABLE 93 | nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; 94 | if(local!=0) 95 | (*local)(event, name); 96 | #endif /*NVTX_DISABLE*/ 97 | } 98 | 99 | #ifdef __cplusplus 100 | } /* extern "C" */ 101 | #endif /* __cplusplus */ 102 | 103 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtCudaRt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #include "nvToolsExt.h" 10 | 11 | #include "cuda.h" 12 | #include "driver_types.h" 13 | 14 | #ifndef NVTOOLSEXT_CUDART_V3 15 | #define NVTOOLSEXT_CUDART_V3 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif /* __cplusplus */ 20 | 21 | /* ========================================================================= */ 22 | /** \name Functions for CUDA Resource Naming 23 | */ 24 | /** \addtogroup RESOURCE_NAMING 25 | * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming 26 | * 27 | * This section covers the API functions that allow to annotate CUDA resources 28 | * with user-provided names. 
29 | * 30 | * @{ 31 | */ 32 | 33 | /* ------------------------------------------------------------------------- */ 34 | /* \cond SHOW_HIDDEN 35 | * \brief Used to build a non-colliding value for resource types separated class 36 | * \version \NVTX_VERSION_2 37 | */ 38 | #define NVTX_RESOURCE_CLASS_CUDART 5 39 | /** \endcond */ 40 | 41 | /* ------------------------------------------------------------------------- */ 42 | /** \brief Resource types for CUDART 43 | */ 44 | typedef enum nvtxResourceCUDARTType_t 45 | { 46 | NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ 47 | NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ 48 | NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ 49 | } nvtxResourceCUDARTType_t; 50 | 51 | 52 | /* ------------------------------------------------------------------------- */ 53 | /** \brief Annotates a CUDA device. 54 | * 55 | * Allows the user to associate a CUDA device with a user-provided name. 56 | * 57 | * \param device - The id of the CUDA device to name. 58 | * \param name - The name of the CUDA device. 59 | * 60 | * \version \NVTX_VERSION_1 61 | * @{ */ 62 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); 63 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); 64 | /** @} */ 65 | 66 | /* ------------------------------------------------------------------------- */ 67 | /** \brief Annotates a CUDA stream. 68 | * 69 | * Allows the user to associate a CUDA stream with a user-provided name. 70 | * 71 | * \param stream - The handle of the CUDA stream to name. 72 | * \param name - The name of the CUDA stream. 
73 | * 74 | * \version \NVTX_VERSION_1 75 | * @{ */ 76 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); 77 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); 78 | /** @} */ 79 | 80 | /* ------------------------------------------------------------------------- */ 81 | /** \brief Annotates a CUDA event. 82 | * 83 | * Allows the user to associate a CUDA event with a user-provided name. 84 | * 85 | * \param event - The handle of the CUDA event to name. 86 | * \param name - The name of the CUDA event. 87 | * 88 | * \version \NVTX_VERSION_1 89 | * @{ */ 90 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); 91 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); 92 | /** @} */ 93 | 94 | /** @} */ /* END RESOURCE_NAMING */ 95 | 96 | /* ========================================================================= */ 97 | #ifdef UNICODE 98 | #define nvtxNameCudaDevice nvtxNameCudaDeviceW 99 | #define nvtxNameCudaStream nvtxNameCudaStreamW 100 | #define nvtxNameCudaEvent nvtxNameCudaEventW 101 | #else 102 | #define nvtxNameCudaDevice nvtxNameCudaDeviceA 103 | #define nvtxNameCudaStream nvtxNameCudaStreamA 104 | #define nvtxNameCudaEvent nvtxNameCudaEventA 105 | #endif 106 | 107 | #ifdef __cplusplus 108 | } 109 | #endif /* __cplusplus */ 110 | 111 | #ifndef NVTX_NO_IMPL 112 | #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ 113 | #include "nvtxDetail/nvtxImplCudaRt_v3.h" 114 | #undef NVTX_IMPL_GUARD_CUDART 115 | #endif /*NVTX_NO_IMPL*/ 116 | 117 | #endif /* NVTOOLSEXT_CUDART_V3 */ 118 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef __NVTX_LINKONCE_H__ 10 | #define __NVTX_LINKONCE_H__ 11 | 12 | /* This header defines macros to permit making definitions of global variables 13 | * and functions in C/C++ header files which may be included multiple times in 14 | * a translation unit or linkage unit. It allows authoring header-only libraries 15 | * which can be used by multiple other header-only libraries (either as the same 16 | * copy or multiple copies), and does not require any build changes, such as 17 | * adding another .c file, linking a static library, or deploying a dynamic 18 | * library. Globals defined with these macros have the property that they have 19 | * the same address, pointing to a single instance, for the entire linkage unit. 20 | * It is expected but not guaranteed that each linkage unit will have a separate 21 | * instance. 22 | * 23 | * In some situations it is desirable to declare a variable without initializing 24 | * it, refer to it in code or other variables' initializers, and then initialize 25 | * it later. Similarly, functions can be prototyped, have their address taken, 26 | * and then have their body defined later. In such cases, use the FWDDECL macros 27 | * when forward-declaring LINKONCE global variables without initializers and 28 | * function prototypes, and then use the DEFINE macros when later defining them. 29 | * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, 30 | * following this pattern makes code maximally portable. 
31 | */ 32 | 33 | #if defined(__MINGW32__) /* MinGW */ 34 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 35 | #if defined(__cplusplus) 36 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 37 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK 38 | #else 39 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 40 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 41 | #endif 42 | #elif defined(_MSC_VER) /* MSVC */ 43 | #if defined(__cplusplus) 44 | #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) 45 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 46 | #else 47 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 48 | #define NVTX_LINKONCE_DEFINE_FUNCTION __inline 49 | #endif 50 | #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ 51 | #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) 52 | #if defined(__cplusplus) 53 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 54 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK 55 | #else 56 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 57 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 58 | #endif 59 | #elif defined(__CYGWIN__) /* Assume GCC or compatible */ 60 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 61 | #if defined(__cplusplus) 62 | #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) 63 | #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline 64 | #else 65 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK 66 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK 67 | #endif 68 | #else /* All others: Assume GCC, clang, or compatible */ 69 | #define NVTX_LINKONCE_WEAK __attribute__((weak)) 70 | #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) 71 | #if defined(__cplusplus) 72 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 73 | #define 
NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline 74 | #else 75 | #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 76 | #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK 77 | #endif 78 | #endif 79 | 80 | #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern 81 | #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION 82 | 83 | #endif /* __NVTX_LINKONCE_H__ */ 84 | -------------------------------------------------------------------------------- /src/include/socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_H_ 8 | #define NCCL_SOCKET_H_ 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define MAX_IFS 16 19 | #define MAX_IF_NAME_SIZE 16 20 | #define SLEEP_INT 1000 // connection retry sleep interval in usec 21 | #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) 22 | #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) 23 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) 24 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL 25 | 26 | /* Common socket address storage structure for IPv4/IPv6 */ 27 | union ncclSocketAddress { 28 | struct sockaddr sa; 29 | struct sockaddr_in sin; 30 | struct sockaddr_in6 sin6; 31 | }; 32 | 33 | enum ncclSocketState { 34 | ncclSocketStateNone = 0, 35 | ncclSocketStateInitialized = 1, 36 | ncclSocketStateAccepting = 2, 37 | ncclSocketStateAccepted = 3, 38 | ncclSocketStateConnecting = 4, 39 | ncclSocketStateConnectPolling = 5, 40 | 
ncclSocketStateConnected = 6, 41 | ncclSocketStateReady = 7, 42 | ncclSocketStateClosed = 8, 43 | ncclSocketStateError = 9, 44 | ncclSocketStateNum = 10 45 | }; 46 | 47 | enum ncclSocketType { 48 | ncclSocketTypeUnknown = 0, 49 | ncclSocketTypeBootstrap = 1, 50 | ncclSocketTypeProxy = 2, 51 | ncclSocketTypeNetSocket = 3, 52 | ncclSocketTypeNetIb = 4 53 | }; 54 | 55 | struct ncclSocket { 56 | int fd; 57 | int acceptFd; 58 | int timedOutRetries; 59 | int refusedRetries; 60 | union ncclSocketAddress addr; 61 | volatile uint32_t* abortFlag; 62 | int asyncFlag; 63 | enum ncclSocketState state; 64 | int salen; 65 | uint64_t magic; 66 | enum ncclSocketType type; 67 | }; 68 | 69 | const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); 70 | ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); 71 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); 72 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); 73 | 74 | // Initialize a socket 75 | __attribute__((visibility("default"))) ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); 76 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call 77 | ncclResult_t ncclSocketListen(struct ncclSocket* sock); 78 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); 79 | // Connect to sock->addr. sock->fd is set after a successful call. 80 | __attribute__((visibility("default"))) ncclResult_t ncclSocketConnect(struct ncclSocket* sock); 81 | // Return socket connection state. 
82 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); 83 | // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 84 | __attribute__((visibility("default"))) ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); 85 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); 86 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); 87 | 88 | #define NCCL_SOCKET_SEND 0 89 | #define NCCL_SOCKET_RECV 1 90 | 91 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 92 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 93 | __attribute__((visibility("default"))) ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); 94 | __attribute__((visibility("default"))) ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); 95 | __attribute__((visibility("default"))) ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); 96 | __attribute__((visibility("default"))) ncclResult_t ncclSocketClose(struct ncclSocket* sock); 97 | #endif 98 | -------------------------------------------------------------------------------- /makefiles/common.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | CUDA_HOME ?= /usr/local/cuda 8 | PREFIX ?= /usr/local 9 | VERBOSE ?= 0 10 | KEEP ?= 0 11 | DEBUG ?= 0 12 | TRACE ?= 0 13 | PROFAPI ?= 1 14 | NVTX ?= 1 15 | RDMA_CORE ?= 0 16 | TUNER_MAXCHANNELS ?= 128 17 | 18 | NVCC = $(CUDA_HOME)/bin/nvcc 19 | 20 | CUDA_LIB ?= $(CUDA_HOME)/lib64 21 | CUDA_INC ?= $(CUDA_HOME)/include 22 | CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) 23 | #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) 24 | CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) 25 | CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) 26 | #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) 27 | 28 | # You should define NVCC_GENCODE in your environment to the minimal set 29 | # of archs to reduce compile time. 30 | CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ 31 | -gencode=arch=compute_60,code=sm_60 \ 32 | -gencode=arch=compute_61,code=sm_61 33 | ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) 34 | # SM35 is deprecated from CUDA12.0 onwards 35 | CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 36 | endif 37 | CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 38 | CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 39 | CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 40 | 41 | CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 42 | CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 43 | CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 44 | CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 45 | 46 | 47 | ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) 48 | # Include Hopper support if we're using CUDA11.8 or above 49 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) 50 | else ifeq ($(shell test 
"0$(CUDA_MAJOR)" -ge 11; echo $$?),0) 51 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) 52 | # Include Volta support if we're using CUDA9 or above 53 | else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) 54 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) 55 | else 56 | NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) 57 | endif 58 | $(info NVCC_GENCODE is ${NVCC_GENCODE}) 59 | 60 | CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ 61 | -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ 62 | -I $(CUDA_INC) $(NPKIT_FLAGS) \ 63 | $(CXXFLAGS) 64 | # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) 65 | # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 66 | # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 67 | NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all --resource-usage $(NPKIT_FLAGS) 68 | # Use addprefix so that we can specify more than one path 69 | NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt 70 | 71 | ######### 72 | CXXFLAGS += -DTUNER_MAXCHANNELS=${TUNER_MAXCHANNELS} 73 | NVCUFLAGS += -DTUNER_MAXCHANNELS=${TUNER_MAXCHANNELS} 74 | ########## ########## 75 | 76 | ########## GCOV ########## 77 | GCOV ?= 0 # disable by default. 
GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # coverage is enabled only when both GCOV=1 and DEBUG=1 (a 0 in either disables it)
AllowShortLoopsOnASingleLine: false 22 | AlwaysBreakAfterDefinitionReturnType: None 23 | AlwaysBreakAfterReturnType: None 24 | AlwaysBreakBeforeMultilineStrings: false 25 | AlwaysBreakTemplateDeclarations: MultiLine 26 | BinPackArguments: true 27 | BinPackParameters: true 28 | BraceWrapping: 29 | AfterCaseLabel: false 30 | AfterClass: false 31 | AfterControlStatement: false 32 | AfterEnum: false 33 | AfterFunction: false 34 | AfterNamespace: false 35 | AfterObjCDeclaration: false 36 | AfterStruct: false 37 | AfterUnion: false 38 | AfterExternBlock: false 39 | BeforeCatch: false 40 | BeforeElse: false 41 | IndentBraces: false 42 | SplitEmptyFunction: true 43 | SplitEmptyRecord: true 44 | SplitEmptyNamespace: true 45 | BreakBeforeBinaryOperators: None 46 | BreakBeforeBraces: Attach 47 | BreakBeforeInheritanceComma: false 48 | BreakInheritanceList: BeforeColon 49 | BreakBeforeTernaryOperators: true 50 | BreakConstructorInitializersBeforeComma: false 51 | BreakConstructorInitializers: BeforeColon 52 | BreakAfterJavaFieldAnnotations: false 53 | BreakStringLiterals: true 54 | ColumnLimit: 80 55 | CommentPragmas: '^ IWYU pragma:' 56 | CompactNamespaces: false 57 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 58 | ConstructorInitializerIndentWidth: 4 59 | ContinuationIndentWidth: 4 60 | Cpp11BracedListStyle: true 61 | DeriveLineEnding: true 62 | DerivePointerAlignment: false 63 | DisableFormat: false 64 | ExperimentalAutoDetectBinPacking: false 65 | FixNamespaceComments: true 66 | ForEachMacros: 67 | - foreach 68 | - Q_FOREACH 69 | - BOOST_FOREACH 70 | IncludeBlocks: Preserve 71 | IncludeCategories: 72 | - Regex: '^<.*\.h>' 73 | Priority: 0 74 | SortPriority: 0 75 | CaseSensitive: false 76 | - Regex: '^<.*' 77 | Priority: 1 78 | SortPriority: 0 79 | CaseSensitive: false 80 | - Regex: '.*' 81 | Priority: 2 82 | SortPriority: 0 83 | CaseSensitive: false 84 | IncludeIsMainRegex: '(Test)?$' 85 | IncludeIsMainSourceRegex: '' 86 | IndentAccessModifiers: true 87 | 
IndentCaseLabels: false 88 | IndentGotoLabels: true 89 | IndentPPDirectives: None 90 | IndentWidth: 2 91 | IndentWrappedFunctionNames: false 92 | JavaScriptQuotes: Leave 93 | JavaScriptWrapImports: true 94 | KeepEmptyLinesAtTheStartOfBlocks: true 95 | MacroBlockBegin: '' 96 | MacroBlockEnd: '' 97 | MaxEmptyLinesToKeep: 1 98 | NamespaceIndentation: None 99 | ObjCBinPackProtocolList: Auto 100 | ObjCBlockIndentWidth: 2 101 | ObjCSpaceAfterProperty: false 102 | ObjCSpaceBeforeProtocolList: true 103 | PenaltyBreakAssignment: 2 104 | PenaltyBreakBeforeFirstCallParameter: 19 105 | PenaltyBreakComment: 300 106 | PenaltyBreakFirstLessLess: 120 107 | PenaltyBreakString: 1000 108 | PenaltyBreakTemplateDeclaration: 10 109 | PenaltyExcessCharacter: 1000000 110 | PenaltyReturnTypeOnItsOwnLine: 60 111 | PointerAlignment: Right 112 | ReflowComments: true 113 | SortIncludes: true 114 | SortUsingDeclarations: true 115 | SpaceAfterCStyleCast: false 116 | SpaceAfterLogicalNot: false 117 | SpaceAfterTemplateKeyword: true 118 | SpaceBeforeAssignmentOperators: true 119 | SpaceBeforeCpp11BracedList: false 120 | SpaceBeforeCtorInitializerColon: true 121 | SpaceBeforeInheritanceColon: true 122 | SpaceBeforeParens: ControlStatements 123 | SpaceBeforeRangeBasedForLoopColon: true 124 | SpaceInEmptyBlock: false 125 | SpaceInEmptyParentheses: false 126 | SpacesBeforeTrailingComments: 2 127 | SpacesInAngles: false 128 | SpacesInConditionalStatement: false 129 | SpacesInContainerLiterals: true 130 | SpacesInCStyleCastParentheses: false 131 | SpacesInParentheses: false 132 | SpacesInSquareBrackets: false 133 | SpaceBeforeSquareBrackets: false 134 | Standard: Latest 135 | StatementMacros: 136 | - Q_UNUSED 137 | - QT_REQUIRE_VERSION 138 | TabWidth: 8 139 | UseCRLF: false 140 | UseTab: Never 141 | ... 
142 | 143 | -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INFO_H_ 8 | #define NCCL_INFO_H_ 9 | 10 | #include "nccl.h" 11 | #include "devcomm.h" 12 | #include "collectives.h" 13 | #include "core.h" 14 | #include "utils.h" 15 | #include "strongstream.h" 16 | 17 | typedef enum : uint8_t { 18 | ncclPatternRing, 19 | ncclPatternRingTwice, 20 | ncclPatternPipelineFrom, 21 | ncclPatternPipelineTo, 22 | ncclPatternTreeUp, 23 | ncclPatternTreeDown, 24 | ncclPatternTreeUpDown, 25 | ncclPatternCollnetChain, 26 | ncclPatternCollnetDirect, 27 | ncclPatternNvls, 28 | ncclPatternNvlsTree, 29 | ncclPatternSend, 30 | ncclPatternRecv 31 | } ncclPattern_t; 32 | 33 | // Used to pass NCCL call information between functions 34 | struct ncclInfo { 35 | ncclFunc_t coll; 36 | const char* opName; 37 | // NCCL Coll Args 38 | const void* sendbuff; 39 | void* recvbuff; 40 | size_t count; 41 | ncclDataType_t datatype; 42 | ncclRedOp_t op; 43 | int root; // peer for p2p operations 44 | ncclComm_t comm; 45 | cudaStream_t stream; 46 | // Algorithm details 47 | int chunkSteps; 48 | int sliceSteps; 49 | // Computed later 50 | ncclDevRedOpFull opFull; 51 | int algorithm; 52 | int protocol; 53 | int isCopyEngineNotSmCopy; 54 | int p2pLevel; 55 | ncclPattern_t pattern; 56 | int nChannels; 57 | int nThreads; 58 | size_t nBytes; 59 | int nstepsPerLoop; 60 | int nchunksPerLoop; 61 | int chunkSize; 62 | int channelId; 63 | }; 64 | 65 | inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { 66 | info->nBytes = info->count * ncclTypeSize(info->datatype); 67 
| if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { 68 | info->count = info->nBytes; 69 | info->datatype = ncclInt8; 70 | } 71 | if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank 72 | return ncclSuccess; 73 | } 74 | 75 | struct ncclTaskColl { 76 | struct ncclTaskColl* next; 77 | ncclFunc_t func; 78 | void const* sendbuff; 79 | void* recvbuff; 80 | size_t count; 81 | int root; 82 | ncclDataType_t datatype; 83 | ncclDevRedOpFull op; 84 | int chunkSteps, sliceSteps; 85 | }; 86 | struct ncclTaskP2p { 87 | ncclTaskP2p *next; 88 | void *buff; 89 | size_t bytes; 90 | // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track 91 | // of where it left off. 92 | int chunk; 93 | int peer; 94 | }; 95 | 96 | struct ncclCudaStreamList { 97 | struct ncclCudaStreamList *next; 98 | cudaStream_t stream; 99 | }; 100 | struct ncclTasks { 101 | struct Peer { 102 | bool sendSeen, recvSeen; 103 | struct ncclIntruQueue sendQueue; 104 | struct ncclIntruQueue recvQueue; 105 | }; 106 | struct Backup { 107 | // backup for Tuner 108 | struct ncclIntruQueue collQueue; 109 | size_t collBytesTotal; 110 | struct Peer* peers/*[nRanks]*/; 111 | int nTasksColl, nTasksP2p; 112 | }; 113 | Backup backup; 114 | // for 1 plan: workload, candidate 115 | struct Workload { 116 | uint64_t commHash; 117 | ncclFunc_t collType; 118 | size_t nBytes; 119 | }; 120 | Workload workload; 121 | struct Candidate { 122 | int algorithm; 123 | int protocol; 124 | int isCopyEngineNotSmCopy; 125 | int p2pLevel; 126 | int nChannels; 127 | int nThreads; 128 | int wireChunksize; 129 | int iteration; 130 | int lastIterEffectiveChunksize; 131 | int native; 132 | bool initialized = false; 133 | int nThreadsTotal; 134 | }; 135 | Candidate candidate; // 0 for native; 1 for tuner 136 | 137 | struct ncclIntruQueue collQueue; 138 | size_t collBytesTotal; 139 | struct Peer* peers/*[nRanks]*/; 140 | int *p2pSendOrder, 
*p2pRecvOrder; 141 | int p2pOrderSteps; 142 | int nTasksColl, nTasksP2p; 143 | 144 | // The list of user streams aggregated over all tasks present. 145 | struct ncclCudaStreamList* streams; 146 | // The most recent user stream. Ignored if streams==nullptr 147 | cudaStream_t streamRecent; 148 | // The graph capturing all user streams or invalid if none. Thus we restrict the 149 | // user that all streams must be captured in the same graph or not captured 150 | // at all. Technically we could probably relax this, but that would mean 151 | // collecting a different `ncclTasks` per graph and one for non-graph. 152 | struct ncclCudaGraph capturingGraph; 153 | }; 154 | 155 | #endif 156 | -------------------------------------------------------------------------------- /src/collectives/device/reduce.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "devcomm.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 14 | const int tid = threadIdx.x; 15 | const int nthreads = args->nWarps*WARP_SIZE; 16 | const int bid = args->bid; 17 | const int nChannels = args->nChannels; 18 | ncclRing *ring = &ncclShmem.channel.ring; 19 | ssize_t chunkSize; 20 | if (args->native) { 21 | chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
REDUCE_CHUNKSTEPS : 1)); 22 | } else { 23 | chunkSize = args->effectiveChunkSize; 24 | } 25 | ssize_t minChunkSize; 26 | if (Proto::Id == NCCL_PROTO_LL) 27 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); 28 | if (Proto::Id == NCCL_PROTO_LL128) { 29 | // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 30 | minChunkSize = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); // minChunkSizeLL128 31 | } 32 | 33 | const int nranks = ncclShmem.comm.nRanks; 34 | const ssize_t loopSize = nChannels*chunkSize; 35 | const ssize_t size = args->count; 36 | const int rank = ncclShmem.comm.rank; 37 | const int prevRank = ring->userRanks[nranks-1]; 38 | const int root = args->root; 39 | 40 | Primitives, 0, Proto, 0> 41 | prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, 0, 0, args->transportIndex); 42 | 43 | auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { 44 | int realChunkSize; 45 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 46 | realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); 47 | realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); 48 | } else if (Proto::Id == NCCL_PROTO_LL) { 49 | if (args->native) { 50 | realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; 51 | } else { 52 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize); 53 | } 54 | } else if (Proto::Id == NCCL_PROTO_LL128) { 55 | realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize); 56 | } 57 | return realChunkSize; 58 | }; 59 | 60 | if (prevRank == root) { 61 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 62 | int realChunkSize; 63 | realChunkSize = calcChunkSize(gridOffset); 64 | ssize_t offset = gridOffset + bid*realChunkSize; 65 | int nelem = min(realChunkSize, size-offset); 66 | prims.send(offset, nelem); 67 | } 68 | } 69 | else if (rank == root) { 70 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 71 | int realChunkSize; 72 | realChunkSize = calcChunkSize(gridOffset); 73 | ssize_t offset = gridOffset + bid*realChunkSize; 74 | int nelem = min(realChunkSize, size-offset); 75 | prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); 76 | } 77 | } 78 | else { 79 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 80 | int realChunkSize; 81 | realChunkSize = calcChunkSize(gridOffset); 82 | ssize_t offset = gridOffset + bid*realChunkSize; 83 | int nelem = min(realChunkSize, size-offset); 84 | prims.recvReduceSend(offset, nelem); 85 | } 86 | } 87 | } 88 | } 89 | 90 | template 91 | struct RunWorkElement { 92 | __device__ __forceinline__ void run(ncclWorkElem *args) { 93 | using Proto = ProtoSimple; 94 | runRing(args); 95 | } 96 | }; 97 | 98 | template 99 | struct RunWorkElement { 100 | __device__ __forceinline__ void run(ncclWorkElem *args) { 101 | runRing(args); 102 | } 103 | }; 104 | 105 | template 106 | struct RunWorkElement { 107 | __device__ __forceinline__ void run(ncclWorkElem *args) { 108 | runRing(args); 109 | } 110 | }; 111 | --------------------------------------------------------------------------------