├── .ci ├── Jenkinsfile ├── README.md ├── build_cli.sh ├── build_nccl_rdma_sharp_plugins.sh ├── ci_functions.sh ├── config-header-check.yml ├── configure_sharp.sh ├── ibdev2netdev ├── nccl_tests ├── publish_artefacts.sh ├── pushd_functions.sh ├── run_nccl_tests.sh ├── settings.sh ├── sharp_coll_test_wrapper └── taskset ├── .clang-format ├── .github └── workflows │ └── nccl-sharp-plugin.yml ├── .gitignore ├── LICENSE ├── Makefile.am ├── README.md ├── autogen.sh ├── configure.ac ├── contrib └── buildrpm.sh ├── debian ├── changelog.in ├── compat ├── control.in ├── copyright ├── nccl-rdma-sharp-plugins.postinst.in ├── nccl-rdma-sharp-plugins.prem.in ├── rules.in └── source │ └── format ├── include ├── core.h ├── debug.h ├── ibvwrap.h ├── nccl.h ├── net.h ├── net_device.h ├── net_v10.h ├── net_v5.h ├── net_v6.h ├── net_v7.h ├── net_v8.h ├── net_v9.h ├── p2p_plugin.h ├── param.h ├── socket.h ├── timer.h ├── ucx_uct_lib.h ├── ucx_uct_ring.h └── utils.h ├── m4 ├── sharp.m4 └── ucx.m4 ├── nccl-rdma-sharp-plugins.pc.in ├── nccl-rdma-sharp-plugins.spec.in └── src ├── Makefile.am ├── ib_plugin.c ├── ibvwrap.c ├── p2p_plugin.c ├── param.c ├── sharp_plugin.c ├── socket.c ├── ucx_plugin.c ├── ucx_rma_plugin.c ├── ucx_uct_lib.c ├── ucx_uct_plugin.c ├── ucx_uct_rd_plugin.c └── utils.c /.ci/Jenkinsfile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env groovy 2 | 3 | // Verified with Jenkins v2.190.2 4 | 5 | // TODO: 6 | // 1. Calculate taskset/affinity for the scripts based on total number of jenkins executors 7 | // 2. NCCL/CUDA/SHARP dependencies should be parameterized 8 | // 3. HPC-X OS/MOFED support matrix should be covered (e.g. docker-based) 9 | // 4. Add signal handlers in the scripts (e.g. 
to correctly handle Jenkins abort by timeout situations) 10 | 11 | pipeline { 12 | agent {label "ml-test-node-gpu"} 13 | 14 | options { 15 | buildDiscarder(logRotator(numToKeepStr: '10')) 16 | timeout(time: 90, unit: 'MINUTES') 17 | disableConcurrentBuilds() 18 | } 19 | 20 | environment { 21 | NFS_WORKSPACE = "${NFS_WORKSPACE_ROOT}/ml-nccl-rdma-sharp-plugins-pr/${BUILD_NUMBER}" 22 | ARTEFACT_DIR = "${NFS_WORKSPACE}/artefacts" 23 | NCCL_RDMA_SHARP_PLUGINS_DIR = "${NFS_WORKSPACE}/nccl-rdma-sharp-plugins" 24 | NCCL_TESTS_DIR = "${NFS_WORKSPACE}/nccl-tests" 25 | } 26 | 27 | stages { 28 | stage('Preparations') { 29 | steps { 30 | echo 'Preparations...' 31 | sh 'mkdir -p ${ARTEFACT_DIR}' 32 | sh 'mkdir -p ${NFS_WORKSPACE}' 33 | } 34 | } 35 | stage('Build nccl-rdma-sharp-plugins') { 36 | steps { 37 | echo 'Building nccl-rdma-sharp-plugins...' 38 | sh """#!/bin/bash 39 | set -o pipefail 40 | ${WORKSPACE}/.ci/build_nccl_rdma_sharp_plugins.sh 2>&1 | tee ${ARTEFACT_DIR}/build_nccl_rdma_sharp_plugins.log 41 | """ 42 | } 43 | } 44 | stage('Configure SHARP: startup') { 45 | steps { 46 | echo 'Configure SHARP: startup...' 47 | sh """#!/bin/bash 48 | set -o pipefail 49 | ${WORKSPACE}/.ci/configure_sharp.sh 2>&1 | tee ${ARTEFACT_DIR}/configure_sharp_startup.log 50 | """ 51 | } 52 | } 53 | stage('Checkout NCCL tests') { 54 | steps { 55 | dir("${NCCL_TESTS_DIR}") { 56 | git branch: 'master', 57 | url: 'https://github.com/NVIDIA/nccl-tests.git' 58 | } 59 | } 60 | } 61 | stage('Test nccl-rdma-sharp-plugins') { 62 | steps { 63 | echo 'Testing nccl-rdma-sharp-plugins...' 64 | sh """#!/bin/bash 65 | set -o pipefail 66 | ${WORKSPACE}/.ci/run_nccl_tests.sh 2>&1 | tee ${ARTEFACT_DIR}/run_nccl_test.log 67 | """ 68 | } 69 | } 70 | stage('Configure SHARP: stop') { 71 | steps { 72 | echo 'Configure SHARP: stop...' 
73 | sh """#!/bin/bash 74 | set -o pipefail 75 | ${WORKSPACE}/.ci/configure_sharp.sh stop 2>&1 | tee ${ARTEFACT_DIR}/configure_sharp_stop.log 76 | """ 77 | } 78 | } 79 | } 80 | // Not needed, as there are no external contributors 81 | // post { 82 | // always { 83 | // echo 'Post-actions...' 84 | // sh '${WORKSPACE}/.ci/publish_artefacts.sh' 85 | // } 86 | // } 87 | } 88 | -------------------------------------------------------------------------------- /.ci/README.md: -------------------------------------------------------------------------------- 1 | # nccl-rdma-sharp-plugins Continuous Integration (CI) 2 | ## Overview 3 | nccl-rdma-sharp-plugins CI is intended to run sanity checks on every code change. CI is started for each Pull Request (PR) and can additionally be triggered by writing the **bot:mlx:test** (or **bot:mlx:retest**) keyword in the PR comments. For users on the project whitelist, CI starts automatically; for other users, a project maintainer must approve the CI start by replying with the '**ok to test**' keyword.
4 | CI status and artefacts (log files) are published within the PR comments. 5 | ## Description 6 | CI includes the following steps: 7 | * Build nccl-rdma-sharp-plugins 8 | * Test nccl-rdma-sharp-plugins with [NCCL tests](https://github.com/nvidia/nccl-tests). 9 | The tests are run with [NVIDIA's NCCL](https://github.com/NVIDIA/nccl) library built within CI from the internal repository. 10 | ### Test Environment 11 | CI is run in the Mellanox lab on a 2-node cluster with the following parameters: 12 | 13 | Hardware 14 | * IB: 1x ConnectX-6 HCA (connected to Mellanox Quantum™ HDR switch) 15 | * GPU: 1x Nvidia Tesla K40m 16 | 17 | Software 18 | * Ubuntu 18.04.4 19 | * Internal stable MLNX_OFED, HPC-X and SHARP versions -------------------------------------------------------------------------------- /.ci/build_cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | . ./pushd_functions.sh 3 | . ./ci_functions.sh 4 | pushd /GIT 5 | case $1 in 6 | build) 7 | configure 8 | echo "Building NCCL sharp plugin" 9 | build 10 | ;; 11 | sharp) 12 | echo "Checking and configure sharp" 13 | sharp 14 | ;; 15 | test) 16 | echo "Running tests for NCCL sharp plugin" 17 | test 18 | ;; 19 | *) 20 | echo "Do nothing" 21 | ;; 22 | esac 23 | -------------------------------------------------------------------------------- /.ci/build_nccl_rdma_sharp_plugins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -leE 2 | 3 | SCRIPT_DIR="$( 4 | cd "$(dirname "$0")" 5 | pwd -P 6 | )" 7 | cd "${SCRIPT_DIR}" 8 | # shellcheck source=settings.sh 9 | . "${SCRIPT_DIR}/settings.sh" 10 | 11 | cd "${WORKSPACE}" 12 | 13 | if ! "${WORKSPACE}/autogen.sh"; then 14 | echo "ERROR: ${WORKSPACE}/autogen.sh failed" 15 | echo "FAIL" 16 | exit 1 17 | fi 18 | 19 | if ! 
"${WORKSPACE}/configure" \ 20 | --prefix="${NCCL_RDMA_SHARP_PLUGINS_DIR}" \ 21 | --with-cuda="${CUDA_HOME}" \ 22 | --with-sharp="${HPCX_SHARP_DIR}"; then 23 | echo "ERROR: ${WORKSPACE}/configure failed" 24 | echo "FAIL" 25 | exit 1 26 | fi 27 | 28 | if ! make -j install; then 29 | echo "ERROR: 'make -j install' failed" 30 | echo "FAIL" 31 | exit 1 32 | fi 33 | 34 | if [ "$DEBUG" = "true" ]; then 35 | echo "INFO: ${NCCL_RDMA_SHARP_PLUGINS_DIR}:" 36 | # For debug purposes 37 | find "${NCCL_RDMA_SHARP_PLUGINS_DIR}" -type f 38 | fi 39 | 40 | echo "PASS" 41 | -------------------------------------------------------------------------------- /.ci/ci_functions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | # Prepare the workspace and configs for CI 3 | function configure() { 4 | rm -rf "${NFS_WORKSPACE}-pr" || true 5 | rm -rf "${NFS_WORKSPACE}" || true 6 | rm -rf ./nccl-rdma-sharp-plugins/.ci/cfg/* || true 7 | cd "${NFS_WORKSPACE_ROOT}" || exit 1 8 | mkdir -p ./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf 9 | mkdir -p ./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf 10 | 11 | printf "%s\n%s\n" "${HOST1}" "${HOST2}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/hostfile 12 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharp_am.cfg 13 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharpd.cfg 14 | printf "%s\n" "${SHARP_AM_HOST}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharp_am_node.txt 15 | 16 | printf "%s\n%s\n" "${HOST1}" "${HOST2}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/hostfile 17 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharp_am.cfg 18 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharpd.cfg 19 | printf "%s\n" "${SHARP_AM_HOST}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharp_am_node.txt 20 | } 
21 | 22 | # Building NCCL rdma sharp plugin 23 | function build() { 24 | echo "Running build_nccl_rdma_sharp_plugins.sh..." 25 | "${WORKSPACE}"/.ci/build_nccl_rdma_sharp_plugins.sh && echo "Build SUCCESFULL !!!" 26 | } 27 | 28 | # Checking and configuring Sharp 29 | function sharp() { 30 | echo "Running configure_sharp.sh..." 31 | "${WORKSPACE}"/.ci/configure_sharp.sh && echo "Step configure_sharp SUCCESFULL !!!" 32 | } 33 | 34 | # Running of tests 35 | function test() { 36 | git clone --depth=1 https://github.com/NVIDIA/nccl-tests.git "${NFS_WORKSPACE}"/nccl-tests 37 | echo "Running run_nccl_tests.sh..." 38 | "${WORKSPACE}"/.ci/run_nccl_tests.sh && echo "Tests SUCCESFULL !!!" 39 | } 40 | -------------------------------------------------------------------------------- /.ci/config-header-check.yml: -------------------------------------------------------------------------------- 1 | general: 2 | exclude: 3 | - "\\.git.*" 4 | - "\\.(yml|md|txt)" 5 | - "^\\.ci.*" 6 | - "\\.(m4|ac)" 7 | - "LICENSE" 8 | - "debian/copyright" 9 | - "debian/compat" 10 | - "debian/source/format" 11 | 12 | bsd: 13 | validate-spdx-license: true 14 | include: 15 | - ".*\\.(am|in|hpp|cpp|py|cc|h|c|sh)$" 16 | -------------------------------------------------------------------------------- /.ci/configure_sharp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | SCRIPT_DIR="$( 3 | cd "$(dirname "$0")" || exit 1 4 | pwd -P 5 | )" 6 | cd "${SCRIPT_DIR}" || exit 1 7 | # shellcheck source=settings.sh 8 | . 
"${SCRIPT_DIR}/settings.sh" 9 | 10 | if [ -z "${NCCL_RDMA_SHARP_PLUGINS_DIR}" ]; then 11 | echo "ERROR: NCCL_RDMA_SHARP_PLUGINS_DIR is not defined" 12 | echo "FAIL" 13 | exit 1 14 | fi 15 | 16 | export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${NCCL_RDMA_SHARP_PLUGINS_DIR}/lib:${LD_LIBRARY_PATH}" 17 | 18 | # 1 - run sanity tests, 0 - do not run 19 | VERIFY_SHARP_ENABLE=${VERIFY_SHARP_ENABLE:-1} 20 | 21 | if [ -z "${NCCL_DIR}" ]; then 22 | module load dev/nccl-nightly-stable 23 | else 24 | export LD_LIBRARY_PATH="${NCCL_DIR}/lib:${LD_LIBRARY_PATH}" 25 | fi 26 | 27 | # Available values: start|stop|restart 28 | SHARP_MANAGER_ACTION="${1:-restart}" 29 | echo "INFO: SHARP_MANAGER_ACTION = ${SHARP_MANAGER_ACTION}" 30 | 31 | echo "INFO: NFS_WORKSPACE = ${NFS_WORKSPACE}" 32 | 33 | if [ -z "${NFS_WORKSPACE}" ]; then 34 | echo "ERROR: NFS_WORKSPACE is not defined" 35 | echo "FAIL" 36 | exit 1 37 | fi 38 | 39 | if [ -z "${HPCX_SHARP_DIR}" ]; then 40 | echo "ERROR: HPCX_SHARP_DIR is not defined" 41 | echo "FAIL" 42 | exit 1 43 | fi 44 | 45 | HPCX_SHARP_DIR=/opt/mellanox/sharp 46 | CONFIGURE_SHARP_TMP_DIR="${NFS_WORKSPACE}/configure_sharp_$$" 47 | mkdir -p "${CONFIGURE_SHARP_TMP_DIR}" 48 | chmod o+w "${CONFIGURE_SHARP_TMP_DIR}" 49 | 50 | export SHARP_CONF="${CONFIGURE_SHARP_TMP_DIR}" 51 | export SHARP_INI_FILE="${SHARP_CONF}/sharp_manager.ini" 52 | 53 | cp -R "${CFG_DIR}/$HOSTNAME/sharp_conf/"* "${SHARP_CONF}" 54 | 55 | if [ -f "${SHARP_CONF}/sharp_am_node.txt" ]; then 56 | SHARP_AM_NODE=$(cat ${SHARP_CONF}/sharp_am_node.txt) 57 | echo "INFO: SHARP_AM_NODE = ${SHARP_AM_NODE}" 58 | else 59 | echo "ERROR: ${SHARP_CONF}/sharp_am_node.txt does not exist or not accessible" 60 | echo "FAIL" 61 | exit 1 62 | fi 63 | 64 | IB_DEV=$(/GIT/ibdev2netdev | awk '{ print $1 }'):1 65 | SM_GUID=$(sudo sminfo -C ${IB_DEV} -P1 | awk '{print $7}' | cut -d',' -f1) 66 | # SM/AM node 67 | # SM_HOSTNAME=`sudo ibnetdiscover -H -C mlx5_0 -P1 | grep ${SM_GUID} | awk -F'"' '{print $2 }' | awk '{print $1}'` 68 | 
HOSTS=$(cat $HOSTFILE | xargs | tr ' ' ',') 69 | 70 | echo "INFO: IB_DEV = ${IB_DEV}" 71 | echo "INFO: SM_GUID = ${SM_GUID}" 72 | # echo "INFO: SM_HOSTNAME = ${SM_HOSTNAME}" 73 | echo "INFO: HOSTS = ${HOSTS}" 74 | 75 | rm -f ${SHARP_INI_FILE} 76 | 77 | cat >${SHARP_INI_FILE} </dev/null" 113 | if [ $? -ne 0 ]; then 114 | echo "ERROR: wrong value of routing_engine parameter in ${OPENSM_CONFIG}" 115 | echo "Should be (example): routing_engine updn" 116 | echo "FAIL" 117 | exit 1 118 | fi 119 | 120 | ssh "${SHARP_AM_NODE}" "grep \"sharp_enabled.*2\" ${OPENSM_CONFIG} 2>/dev/null" 121 | if [ $? -ne 0 ]; then 122 | echo "ERROR: wrong value of sharp_enabled parameter in ${OPENSM_CONFIG}" 123 | echo "Should be (example): sharp_enabled 2" 124 | echo "FAIL" 125 | exit 1 126 | fi 127 | 128 | echo "INFO: check_opensm_conf on ${SHARP_AM_NODE}... DONE" 129 | } 130 | 131 | verify_sharp() { 132 | echo "INFO: verify_sharp..." 133 | 134 | cp ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/* ${CONFIGURE_SHARP_TMP_DIR} 135 | cd ${CONFIGURE_SHARP_TMP_DIR} 136 | make CUDA=1 CUDA_HOME=${CUDA_HOME} SHARP_HOME="${HPCX_SHARP_DIR}" 137 | if [ $? 
-ne 0 ]; then 138 | echo "ERROR: verify_sharp make failed" 139 | echo "FAIL" 140 | exit 1 141 | fi 142 | 143 | cp ${WORKSPACE}/.ci/sharp_coll_test_wrapper ./ 144 | ITERS=100 145 | SKIP=20 146 | NP=$(wc --lines "$HOSTFILE" | awk '{print $1}') 147 | 148 | # -mca coll_hcoll_enable 0 - disable HCOLL 149 | MPIRUN_COMMON_OPTIONS="\ 150 | -np $NP \ 151 | -H $HOSTS \ 152 | --map-by node \ 153 | -x LD_LIBRARY_PATH \ 154 | --allow-run-as-root \ 155 | -mca oob_tcp_if_exclude eth0 \ 156 | " 157 | 158 | # TODO change to SHARP_COLL_SAT_THRESHOLD=1 (32 - W/A for SHARP issue) 159 | MPIRUN_SHARP_OPTIONS="\ 160 | -x SHARP_COLL_LOG_LEVEL=3 \ 161 | -x ENABLE_SHARP_COLL=1 \ 162 | -x SHARP_COLL_SAT_THRESHOLD=32 \ 163 | -x SHARP_COLL_ENABLE_SAT=1 \ 164 | " 165 | 166 | echo "Environment for the reproducer:" 167 | echo "export PATH=$PATH" 168 | echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" 169 | echo "export OPAL_PREFIX=${OPAL_PREFIX}" 170 | 171 | # Test 1 (from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README): 172 | # Run allreduce barrier perf test on 2 hosts using port mlx5_0 173 | echo "${GH_FOLD}# Test 1..." 174 | CMD="mpirun \ 175 | ${MPIRUN_COMMON_OPTIONS} \ 176 | ${MPIRUN_SHARP_OPTIONS} \ 177 | ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \ 178 | --iters $ITERS \ 179 | --skip $SKIP \ 180 | --mode perf \ 181 | --collectives allreduce,barrier" 182 | echo "INFO: Test 1 command line:" 183 | trim_multiple_spaces "$CMD" 184 | $CMD 185 | if [ $? -ne 0 ]; then 186 | echo "ERROR: verify_sharp Test 1 failed" 187 | echo "FAIL" 188 | exit 1 189 | fi 190 | echo "${GH_UNFOLD}" 191 | echo "Test 1... DONE" 192 | 193 | # Test 2 (from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README): 194 | # Run allreduce perf test on 2 hosts using port mlx5_0 with CUDA buffers 195 | echo "${GH_FOLD}# Test 2..." 
196 | CMD="mpirun \ 197 | ${MPIRUN_COMMON_OPTIONS} \ 198 | ${MPIRUN_SHARP_OPTIONS} \ 199 | ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \ 200 | --iters $ITERS \ 201 | --skip $SKIP \ 202 | --mode perf \ 203 | --collectives allreduce \ 204 | -M cuda" 205 | echo "INFO: Test 2 command line:" 206 | trim_multiple_spaces "$CMD" 207 | $CMD 208 | if [ $? -ne 0 ]; then 209 | echo "ERROR: verify_sharp Test 2 failed" 210 | echo "FAIL" 211 | exit 1 212 | fi 213 | echo "${GH_UNFOLD}" 214 | echo "Test 2... DONE" 215 | 216 | # Test 3 (from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README): 217 | # Run allreduce perf test on 2 hosts using port mlx5_0 with Streaming aggregation from 4B to 512MB 218 | echo "${GH_FOLD}# Test 3..." 219 | CMD="mpirun \ 220 | ${MPIRUN_COMMON_OPTIONS} \ 221 | ${MPIRUN_SHARP_OPTIONS} \ 222 | ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \ 223 | --iters $ITERS \ 224 | --skip $SKIP \ 225 | --mode perf \ 226 | --collectives allreduce \ 227 | -s 4:536870912" 228 | echo "INFO: Test 3 command line:" 229 | trim_multiple_spaces "$CMD" 230 | $CMD 231 | if [ $? -ne 0 ]; then 232 | echo "ERROR: verify_sharp Test 3 failed" 233 | echo "FAIL" 234 | exit 1 235 | fi 236 | echo "${GH_UNFOLD}" 237 | echo "Test 3... DONE" 238 | 239 | # Test 4: 240 | # Run iallreduce perf test on 2 hosts using port mlx5_0 241 | echo "${GH_FOLD}# Test 4..." 242 | CMD="mpirun \ 243 | ${MPIRUN_COMMON_OPTIONS} \ 244 | ${MPIRUN_SHARP_OPTIONS} \ 245 | ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \ 246 | --iters $ITERS \ 247 | --skip $SKIP \ 248 | --mode perf \ 249 | --collectives iallreduce \ 250 | -N 128" 251 | echo "INFO: Test 4 command line:" 252 | trim_multiple_spaces "$CMD" 253 | $CMD 254 | if [ $? -ne 0 ]; then 255 | echo "ERROR: verify_sharp Test 4 failed" 256 | echo "FAIL" 257 | exit 1 258 | fi 259 | echo "${GH_UNFOLD}" 260 | echo "Test 4... 
DONE" 261 | 262 | # Test 5: 263 | # Run iallreduce perf test on 2 hosts using port mlx5_0 with CUDA buffers 264 | echo "${GH_FOLD}# Test 5..." 265 | CMD="mpirun \ 266 | ${MPIRUN_COMMON_OPTIONS} \ 267 | ${MPIRUN_SHARP_OPTIONS} \ 268 | ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \ 269 | --iters $ITERS \ 270 | --skip $SKIP \ 271 | --mode perf \ 272 | --collectives iallreduce \ 273 | -N 128 \ 274 | -M cuda" 275 | echo "INFO: Test 5 command line:" 276 | trim_multiple_spaces "$CMD" 277 | $CMD 278 | if [ $? -ne 0 ]; then 279 | echo "ERROR: verify_sharp Test 5 failed" 280 | echo "FAIL" 281 | exit 1 282 | fi 283 | echo "${GH_UNFOLD}" 284 | echo "Test 5... DONE" 285 | 286 | # Test 6: 287 | # Run iallreduce perf test on 2 hosts using port mlx5_0 with Streaming aggregation from 4B to 512MB 288 | echo "${GH_FOLD}# Test 6..." 289 | CMD="mpirun \ 290 | ${MPIRUN_COMMON_OPTIONS} \ 291 | ${MPIRUN_SHARP_OPTIONS} \ 292 | ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \ 293 | --iters $ITERS \ 294 | --skip $SKIP \ 295 | --mode perf \ 296 | --collectives iallreduce \ 297 | -N 128 \ 298 | -s 4:131072" 299 | echo "INFO: Test 6 command line:" 300 | trim_multiple_spaces "$CMD" 301 | $CMD 302 | if [ $? -ne 0 ]; then 303 | echo "ERROR: verify_sharp Test 6 failed" 304 | echo "FAIL" 305 | exit 1 306 | fi 307 | echo "${GH_UNFOLD}" 308 | echo "Test 6... DONE" 309 | 310 | # Test 7 (from the SHARP deployment guide): Without SAT 311 | echo "${GH_FOLD}# Test 7..." 
312 | CMD="$OMPI_HOME/bin/mpirun \ 313 | ${MPIRUN_COMMON_OPTIONS} \ 314 | --bind-to core \ 315 | -mca btl_openib_warn_default_gid_prefix 0 \ 316 | -mca rmaps_dist_device ${IB_DEV} \ 317 | -mca rmaps_base_mapping_policy dist:span \ 318 | -x MXM_LOG_LEVEL=ERROR \ 319 | -x HCOLL_ML_DISABLE_REDUCE=1 \ 320 | -x LD_LIBRARY_PATH \ 321 | -x HCOLL_ENABLE_SHARP=2 \ 322 | -x SHARP_COLL_LOG_LEVEL=3 \ 323 | -x SHARP_COLL_GROUP_RESOURCE_POLICY=1 \ 324 | -x SHARP_COLL_MAX_PAYLOAD_SIZE=256 \ 325 | -x HCOLL_SHARP_UPROGRESS_NUM_POLLS=999 \ 326 | -x HCOLL_BCOL_P2P_ALLREDUCE_SHARP_MAX=4096 \ 327 | -x SHARP_COLL_PIPELINE_DEPTH=32 \ 328 | -x SHARP_COLL_JOB_QUOTA_OSTS=32 \ 329 | -x SHARP_COLL_JOB_QUOTA_MAX_GROUPS=4 \ 330 | -x SHARP_COLL_JOB_QUOTA_PAYLOAD_PER_OST=256 \ 331 | ${WORKSPACE}/.ci/taskset -c 1 \ 332 | numactl --membind=0 \ 333 | $HPCX_OSU_DIR/osu_allreduce \ 334 | -i 100 \ 335 | -x 100 \ 336 | -f \ 337 | -m 4096:4096" 338 | echo "INFO: Test 7 command line:" 339 | trim_multiple_spaces "$CMD" 340 | $CMD 341 | if [ $? -ne 0 ]; then 342 | echo "ERROR: Test 7 (without SAT) failed, check the log file" 343 | echo "FAIL" 344 | exit 1 345 | fi 346 | echo "${GH_UNFOLD}" 347 | echo "Test 7... DONE" 348 | 349 | # Test 8 (from the SHARP deployment guide): With SAT 350 | echo "${GH_FOLD}# Test 8..." 
351 | CMD="$OMPI_HOME/bin/mpirun \ 352 | ${MPIRUN_COMMON_OPTIONS} \ 353 | -mca btl_openib_warn_default_gid_prefix 0 \ 354 | -mca rmaps_dist_device ${IB_DEV} \ 355 | -mca rmaps_base_mapping_policy dist:span \ 356 | -x MXM_ASYNC_INTERVAL=1800s \ 357 | -x MXM_LOG_LEVEL=ERROR \ 358 | -x HCOLL_ML_DISABLE_REDUCE=1 \ 359 | -x LD_LIBRARY_PATH \ 360 | -x HCOLL_ENABLE_SHARP=2 \ 361 | -x SHARP_COLL_LOG_LEVEL=3 \ 362 | -x SHARP_COLL_GROUP_RESOURCE_POLICY=1 \ 363 | -x SHARP_COLL_MAX_PAYLOAD_SIZE=256 \ 364 | -x HCOLL_SHARP_UPROGRESS_NUM_POLLS=999 \ 365 | -x HCOLL_BCOL_P2P_ALLREDUCE_SHARP_MAX=4096 \ 366 | -x SHARP_COLL_PIPELINE_DEPTH=32 \ 367 | -x SHARP_COLL_JOB_QUOTA_OSTS=32 \ 368 | -x SHARP_COLL_JOB_QUOTA_MAX_GROUPS=4 \ 369 | -x SHARP_COLL_JOB_QUOTA_PAYLOAD_PER_OST=256 \ 370 | -x SHARP_COLL_ENABLE_SAT=1 \ 371 | ${WORKSPACE}/.ci/taskset -c 1 \ 372 | numactl --membind=0 \ 373 | $HPCX_OSU_DIR/osu_allreduce \ 374 | -i 100 \ 375 | -x 100 \ 376 | -f \ 377 | -m 4096:4096" 378 | echo "INFO: Test 8 command line:" 379 | trim_multiple_spaces "$CMD" 380 | $CMD 381 | if [ $? -ne 0 ]; then 382 | echo "ERROR: Test 8 (with SAT) failed, check the log file" 383 | echo "FAIL" 384 | exit 1 385 | fi 386 | echo "${GH_UNFOLD}" 387 | echo "Test 8... DONE" 388 | 389 | echo "INFO: verify_sharp... DONE" 390 | } 391 | 392 | if [ "${SHARP_MANAGER_ACTION}" != "stop" ]; then 393 | check_opensm_status 394 | check_opensm_conf 395 | fi 396 | 397 | sudo PDSH_RCMD_TYPE=ssh SHARP_INI_FILE=${SHARP_INI_FILE} SHARP_CONF=${SHARP_CONF} ${HPCX_SHARP_DIR}/sbin/sharp_manager.sh "${SHARP_MANAGER_ACTION}" -l "$HOSTS" -s "${SHARP_AM_NODE}" 398 | if [ $? 
-ne 0 ]; then 399 | echo "ERROR: sharp_manager.sh failed, check the log file" 400 | echo "FAIL" 401 | exit 1 402 | fi 403 | 404 | if [ "${SHARP_MANAGER_ACTION}" != "stop" ] && [ "${VERIFY_SHARP_ENABLE}" -eq 1 ]; then 405 | verify_sharp 406 | fi 407 | 408 | sudo chmod -R 777 ${CONFIGURE_SHARP_TMP_DIR} 409 | rm -rf ${CONFIGURE_SHARP_TMP_DIR} 410 | 411 | echo "PASS" 412 | -------------------------------------------------------------------------------- /.ci/ibdev2netdev: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | # ibdev2netdev doesn't work correctly inside a container. This wrapper is a workaround 3 | DEV_IB=$(ls -1 /dev/infiniband/umad*) 4 | N=${DEV_IB: -1} 5 | if [ -e /dev/infiniband/umad${N} ]; then 6 | printf "mlx5_${N} port 1 ====> ib0\n" 7 | fi 8 | -------------------------------------------------------------------------------- /.ci/nccl_tests: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | # Wrapper to add correct parameters to the main app 3 | IB_DEV=$(ibdev2netdev | awk '{ print $1 }'):1 4 | ETH_DEV=$(ibdev2netdev | awk '{ print $5 }') 5 | 6 | export HCOLL_MAIN_IB=${IB_DEV} 7 | export NCCL_IB_HCA=${IB_DEV} 8 | export UCX_NET_DEVICES=${IB_DEV} 9 | export NCCL_SOCKET_IFNAME=${ETH_DEV} 10 | exec "${@}" 11 | -------------------------------------------------------------------------------- /.ci/publish_artefacts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -leE 2 | 3 | SCRIPT_DIR="$( 4 | cd "$(dirname "$0")" 5 | pwd -P 6 | )" 7 | cd "${SCRIPT_DIR}" 8 | # shellcheck source=settings.sh 9 | . "${SCRIPT_DIR}/settings.sh" 10 | 11 | echo 'Publish artefacts...' 
12 | 13 | export UPSTREAM_JOB_NAME=${UPSTREAM_JOB_NAME:-${JOB_NAME}} 14 | export UPSTREAM_BUILD_NUMBER=${UPSTREAM_BUILD_NUMBER:-${BUILD_NUMBER}} 15 | export UPSTREAM_ghprbGhRepository=${UPSTREAM_ghprbGhRepository:-${ghprbGhRepository}} 16 | export UPSTREAM_ghprbPullId=${UPSTREAM_ghprbPullId:-${ghprbPullId}} 17 | 18 | ls -al "${ARTEFACT_DIR}" 19 | 20 | publish_artefacts_to_gist.py 21 | 22 | echo 'Publish artefacts... DONE' 23 | -------------------------------------------------------------------------------- /.ci/pushd_functions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | pushd() { 3 | command pushd "$@" >/dev/null 4 | } 5 | 6 | popd() { 7 | command popd "$@" >/dev/null 8 | } 9 | -------------------------------------------------------------------------------- /.ci/run_nccl_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -leE 2 | SCRIPT_DIR="$( 3 | cd "$(dirname "$0")" 4 | pwd -P 5 | )" 6 | cd "${SCRIPT_DIR}" 7 | # shellcheck source=settings.sh 8 | . 
"${SCRIPT_DIR}"/settings.sh 9 | 10 | GLOBAL_TEST_STATUS=0 11 | 12 | if [ -z "${NCCL_DIR}" ]; then 13 | module load dev/nccl-nightly-stable 14 | else 15 | export LD_LIBRARY_PATH="${NCCL_DIR}/lib:${LD_LIBRARY_PATH}" 16 | fi 17 | 18 | if [ -z "${NCCL_RDMA_SHARP_PLUGINS_DIR}" ]; then 19 | echo "ERROR: NCCL_RDMA_SHARP_PLUGINS_DIR is not defined" 20 | echo "FAIL" 21 | exit 1 22 | fi 23 | 24 | if [ -z "${NCCL_TESTS_DIR}" ]; then 25 | echo "ERROR: NCCL_TESTS_DIR is not defined" 26 | echo "FAIL" 27 | exit 1 28 | fi 29 | 30 | NP=2 31 | IB_DEV=$(ibdev2netdev | awk '{ print $1 }'):1 32 | # UCX_MEMTYPE_CACHE=n - to avoid warnings "memtype_cache.c:83 UCX ERROR failed to insert region 0x1a1e890 [0x7f8d00000000..0x7f8d30000000]: Element already exists" 33 | MPIRUN_OPTIONS_COMMON="\ 34 | -x LD_LIBRARY_PATH \ 35 | -x NCCL_DEBUG=INFO \ 36 | -x NCCL_DEBUG_SUBSYS=INIT \ 37 | -x UCX_MEMTYPE_CACHE=n \ 38 | -x HCOLL_ENABLE_SHARP=0 \ 39 | -x HCOLL_ENABLE_MCAST_ALL=0 \ 40 | -mca pml ucx \ 41 | -mca coll_hcoll_enable 1 \ 42 | --map-by node \ 43 | --bind-to none \ 44 | --hostfile ${HOSTFILE} \ 45 | -np $NP \ 46 | --report-bindings \ 47 | --allow-run-as-root \ 48 | -mca oob_tcp_if_exclude eth0 \ 49 | " 50 | 51 | # Application options 52 | ITER=100 53 | WARMUP_ITER=100 54 | MSG_SIZE_MIN="8" 55 | MSG_SIZE_MAX="4M" 56 | NCCL_TEST_EXE=("all_reduce_perf" "all_gather_perf" "broadcast_perf" "reduce_perf" "reduce_scatter_perf" "alltoall_perf") 57 | NCCL_TEST_PARAMS=" -b ${MSG_SIZE_MIN} -e ${MSG_SIZE_MAX} -f 2 -g 1 -c 1 -z 1 -n $ITER -w $WARMUP_ITER -p 0 " 58 | ENABLE_SAT=${ENABLE_SAT:-1} 59 | echo "INFO: ENABLE_SAT = ${ENABLE_SAT}" 60 | 61 | echo_hash_line() { 62 | echo "###############################################################################" 63 | } 64 | 65 | echo "CUDA_HOME: ${CUDA_HOME}" 66 | echo "NCCL_DIR: ${NCCL_DIR}" 67 | echo "NCCL_RDMA_SHARP_PLUGINS_DIR: ${NCCL_RDMA_SHARP_PLUGINS_DIR}" 68 | echo "MPI_HOME: ${MPI_HOME}" 69 | 70 | # Build NCCL-TESTS 71 | cd "${NCCL_TESTS_DIR}" 72 | make 
-j clean 73 | 74 | make -j CUDA_HOME="${CUDA_HOME}" NCCL_HOME="${NCCL_DIR}" MPI=1 MPI_HOME="${MPI_HOME}" 75 | 76 | export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${NCCL_RDMA_SHARP_PLUGINS_DIR}/lib:${LD_LIBRARY_PATH}" 77 | 78 | trim_multiple_spaces() { 79 | echo "$1" | sed -s "s|\ \ *| |g" 80 | } 81 | 82 | # USAGE: all_reduce_perf 83 | # [-t,--nthreads ] 84 | # [-g,--ngpus ] 85 | # [-b,--minbytes ] 86 | # [-e,--maxbytes ] 87 | # [-i,--stepbytes ] 88 | # [-f,--stepfactor ] 89 | # [-n,--iters ] 90 | # [-m,--agg_iters ] 91 | # [-w,--warmup_iters ] 92 | # [-p,--parallel_init <0/1>] 93 | # [-c,--check <0/1>] 94 | # [-o,--op ] 95 | # [-d,--datatype ] 96 | # [-r,--root ] 97 | # [-z,--blocking <0/1>] 98 | # [-h,--help] 99 | 100 | ############################################################################### 101 | # Run NCCL-TESTS (MPI) 102 | ############################################################################### 103 | 104 | i=1 105 | 106 | for TEST_EXE in ${NCCL_TEST_EXE[@]}; do 107 | #=================== 108 | # NCCL_PLUGIN_P2P 109 | #=================== 110 | # Enable ucx_rma tests once this is resolved: https://redmine.mellanox.com/issues/3037941 111 | # for P2P_LAYER in ucx ucx_rma ib 112 | for P2P_LAYER in ib ucx ucx_uct ucx_uct_read; do 113 | MPIRUN_OPTIONS_PLUGIN_P2P_LAYER="-x NCCL_PLUGIN_P2P=${P2P_LAYER}" 114 | 115 | #=================== 116 | # NCCL_PROTO 117 | #=================== 118 | for NCCL_PROTO in Simple LL DEFAULT; do 119 | if [ "${NCCL_PROTO}" = "DEFAULT" ]; then 120 | MPIRUN_OPTIONS_NCCL_PROTO="" 121 | else 122 | MPIRUN_OPTIONS_NCCL_PROTO="-x NCCL_PROTO=${NCCL_PROTO}" 123 | fi 124 | 125 | #=================== 126 | # NCCL_ALGO 127 | #=================== 128 | for NCCL_ALGO in CollNet Tree Ring DEFAULT; do 129 | if [ "${NCCL_ALGO}" = "CollNet" ] && [ "${TEST_EXE}" != "all_reduce_perf" ]; then 130 | # test sharp plugin only with all_reduce_perf 131 | continue 132 | fi 133 | 134 | if [ "${NCCL_ALGO}" = "DEFAULT" ]; then 135 | MPIRUN_OPTIONS_NCCL_ALGO="" 
136 | else 137 | MPIRUN_OPTIONS_NCCL_ALGO="-x NCCL_ALGO=${NCCL_ALGO}" 138 | fi 139 | 140 | if [ "${NCCL_ALGO}" = "CollNet" ]; then 141 | MPIRUN_OPTIONS_NCCL_ALGO="-x NCCL_COLLNET_ENABLE=1" 142 | fi 143 | 144 | #=================== 145 | # SHARP_ENABLE 146 | #=================== 147 | for SHARP_ENABLE in 0 1; do 148 | if { [ "${NCCL_ALGO}" = "Tree" ] || [ "${NCCL_ALGO}" = "Ring" ]; } && [ "$SHARP_ENABLE" = "1" ]; then 149 | # skip sharp enable 1 for tree and ring algorithms 150 | continue 151 | fi 152 | if [ "${SHARP_ENABLE}" = "0" ]; then 153 | MPIRUN_OPTIONS_SHARP="" 154 | else 155 | MPIRUN_OPTIONS_SHARP="\ 156 | -x SHARP_COLL_LOG_LEVEL=3 \ 157 | -x SHARP_COLL_ENABLE_SAT=${ENABLE_SAT} \ 158 | " 159 | fi 160 | 161 | #=================== 162 | # NCCL_NET_GDR_LEVEL 163 | #=================== 164 | # for NCCL_NET_GDR_LEVEL in 0 1 2 3 4 5 DEFAULT 165 | for NCCL_NET_GDR_LEVEL in DEFAULT; do 166 | if [ "${NCCL_NET_GDR_LEVEL}" = "DEFAULT" ]; then 167 | MPIRUN_OPTIONS_GDR_LEVEL="" 168 | else 169 | MPIRUN_OPTIONS_GDR_LEVEL="-x NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL}" 170 | fi 171 | 172 | #=================== 173 | # NCCL_NET_GDR_READ 174 | #=================== 175 | # for NCCL_NET_GDR_READ in 0 1 DEFAULT 176 | for NCCL_NET_GDR_READ in DEFAULT; do 177 | if [ "${NCCL_NET_GDR_READ}" = "DEFAULT" ]; then 178 | MPIRUN_OPTIONS_GDR_READ="" 179 | else 180 | MPIRUN_OPTIONS_GDR_READ="-x NCCL_NET_GDR_READ=${NCCL_NET_GDR_READ}" 181 | fi 182 | 183 | echo_hash_line 184 | echo "${GH_FOLD}{# Test $i...}" 185 | echo_hash_line 186 | 187 | echo "INFO: TEST = ${TEST_EXE}" 188 | echo "INFO: P2P_LAYER = ${P2P_LAYER}" 189 | echo "INFO: NCCL_PROTO = ${NCCL_PROTO}" 190 | echo "INFO: NCCL_ALGO = ${NCCL_ALGO}" 191 | echo "INFO: SHARP_ENABLE = ${SHARP_ENABLE}" 192 | echo "INFO: NCCL_NET_GDR_LEVEL = ${NCCL_NET_GDR_LEVEL}" 193 | echo "INFO: NCCL_NET_GDR_READ = ${NCCL_NET_GDR_READ}" 194 | 195 | CMD="mpirun \ 196 | ${MPIRUN_OPTIONS_COMMON} \ 197 | ${MPIRUN_OPTIONS_NCCL_PROTO} \ 198 | 
${MPIRUN_OPTIONS_NCCL_ALGO} \ 199 | ${MPIRUN_OPTIONS_SHARP} \ 200 | ${MPIRUN_OPTIONS_GDR_LEVEL} \ 201 | ${MPIRUN_OPTIONS_GDR_READ} \ 202 | ${MPIRUN_OPTIONS_PLUGIN_P2P_LAYER} \ 203 | ${WORKSPACE}/.ci/nccl_tests ${NCCL_TESTS_DIR}/build/${TEST_EXE} ${NCCL_TEST_PARAMS}" 204 | echo "# Test $i reproducer:" 205 | echo "export PATH=${PATH}" 206 | echo "" 207 | echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" 208 | echo "" 209 | echo "export OPAL_PREFIX=${OPAL_PREFIX}" 210 | echo "" 211 | trim_multiple_spaces "$CMD" 212 | if ! $CMD; then 213 | echo "${GH_UNFOLD}" 214 | echo "# Test $i... failed" 215 | GLOBAL_TEST_STATUS=1 216 | else 217 | echo "${GH_UNFOLD}" 218 | echo "# Test $i... passed" 219 | fi 220 | 221 | i=$((i + 1)) 222 | done 223 | done 224 | done 225 | done 226 | done 227 | done 228 | done 229 | 230 | ############################################################################### 231 | if [ ${GLOBAL_TEST_STATUS} -ne 0 ]; then 232 | echo "ERROR: some tests failed, check the log file" 233 | echo "FAIL" 234 | exit 1 235 | else 236 | echo "All tests PASSED" 237 | fi 238 | 239 | echo "PASS" 240 | -------------------------------------------------------------------------------- /.ci/settings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -leE 2 | # Formatting for GitHub Actions fold/unfold 3 | GH=${GH:-0} 4 | if [ "${GH}" -eq 1 ]; then 5 | GH_FOLD="::group::" 6 | GH_UNFOLD="::endgroup::" 7 | fi 8 | # PLUGINS 9 | echo "INFO: DEBUG = $DEBUG" 10 | DEBUG=false 11 | if [ "$DEBUG" = "true" ]; then 12 | set -x 13 | fi 14 | 15 | # W/A for SHARP 16 | # CUDA 10.2 is the latest available version we would like to test, CUDA 10.1 is needed for SHARP 17 | # (because HPC-X is built with CUDA 10.1). 18 | # CUDA 10.2 has priority in the env PATH/LD_LIBRARY_PATH. 
19 | 20 | # TODO remove use HPC-X which is already inside the image 21 | 22 | #module load /hpc/local/etc/modulefiles/dev/cuda-latest 23 | HPCX_UBUNTU_INSTALL_DIR=${HPCX_UBUNTU_INSTALL_DIR:-/hpc/noarch/HPCX/unpacked/hpcx-v2.17-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64/} 24 | module load "${HPCX_UBUNTU_INSTALL_DIR}"/modulefiles/hpcx-ompi 25 | # . "${HPCX_UBUNTU_INSTALL_DIR}/hpcx-init.sh" 26 | # hpcx_load 27 | 28 | # It is needed to disable nccl_rdma_sharp_plugin libs from HPC-X 29 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH//nccl_rdma_sharp_plugin/nccl_rdma_sharp_pluginX}" 30 | export LD_LIBRARY_PATH 31 | CUDA_HOME=/usr/local/cuda 32 | #export UCX_NET_DEVICES=$(ibdev2netdev | awk '{print $1}'):1 33 | export NCCL_RDMA_SHARP_PLUGINS_DIR="${NCCL_RDMA_SHARP_PLUGINS_DIR:-${WORKSPACE}/_install}" 34 | echo "INFO: NCCL_RDMA_SHARP_PLUGINS_DIR = ${NCCL_RDMA_SHARP_PLUGINS_DIR}" 35 | 36 | TOP_DIR="$(git rev-parse --show-toplevel)" 37 | echo "INFO: TOP_DIR = ${TOP_DIR}" 38 | 39 | echo "INFO: CUDA_VER = ${CUDA_VER}" 40 | echo "INFO: CUDA_HOME = ${CUDA_HOME}" 41 | echo "INFO: HPCX_SHARP_DIR = ${HPCX_SHARP_DIR}" 42 | echo "INFO: HPCX_DIR = ${HPCX_DIR}" 43 | echo "INFO: WORKSPACE = ${WORKSPACE}" 44 | 45 | HOSTNAME=$(hostname -s) 46 | echo "INFO: HOSTNAME = $HOSTNAME" 47 | 48 | WORKSPACE="${WORKSPACE:-${TOP_DIR}}" 49 | CFG_DIR="${WORKSPACE}/.ci/cfg" 50 | HOSTFILE=${CFG_DIR}/$HOSTNAME/hostfile 51 | 52 | if [ ! -f "${HOSTFILE}" ]; then 53 | echo "ERROR: ${HOSTFILE} doesn't exist or not accessible" 54 | echo "FAIL" 55 | exit 1 56 | fi 57 | 58 | if [ ! 
-d "${HPCX_DIR}" ]; then 59 | echo "ERROR: ${HPCX_DIR} does not exist or not accessible" 60 | echo "FAIL" 61 | exit 1 62 | fi 63 | -------------------------------------------------------------------------------- /.ci/sharp_coll_test_wrapper: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | # Wrapper to add correct parameter to the main scripts without refactoring 3 | TEST_DEV=$(ibdev2netdev | awk '{ print $1 }' ):1 4 | echo ${UCX_NET_DEVICES} 5 | ./sharp_coll_test -d "${TEST_DEV}" "${@}" 6 | -------------------------------------------------------------------------------- /.ci/taskset: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | # Wrapper to add correct parameters to the main app 3 | TEST_DEV=$(ibdev2netdev | awk '{ print $1 }' ):1 4 | export MXM_RDMA_PORTS="${TEST_DEV}" 5 | export HCOLL_MAIN_IB="${TEST_DEV}" 6 | /usr/bin/taskset "${@}" 7 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | IndentWidth: 2 2 | AlignEscapedNewlines: Indent 3 | AlignConsecutiveAssignments: true 4 | AlignConsecutiveDeclarations: false 5 | AlignConsecutiveStructMembers: true 6 | AlignConsecutiveMacros: true 7 | AlignDeclarationByPointer: true 8 | AlignAfterOpenBracket: true 9 | AlignOperands: true 10 | -------------------------------------------------------------------------------- /.github/workflows/nccl-sharp-plugin.yml: -------------------------------------------------------------------------------- 1 | name: NCCL Sharp plugin CI 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | mainhost: 6 | description: 'Choose one of hosts to run:' 7 | required: true 8 | type: choice 9 | default: 'host01' 10 | options: 11 | - host01 12 | - host02 13 | push: 14 | branches: ['*'] 15 | pull_request: 16 | branches: ['*'] 17 | jobs: 18 | deployment: 19 | runs-on: 
[self-hosted, linux, x64] 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Deployment infrastructure 23 | run: /start deploy 24 | build: 25 | needs: [deployment] 26 | runs-on: [self-hosted, linux, x64] 27 | steps: 28 | - name: Building NCCL RDMA sharp plugin 29 | run: /start build 30 | sharp_config: 31 | needs: [deployment, build] 32 | runs-on: [self-hosted, linux, x64] 33 | steps: 34 | - name: Configuring and checking Sharp 35 | run: /start sharp 36 | testing: 37 | needs: [sharp_config] 38 | runs-on: [self-hosted, linux, x64] 39 | steps: 40 | - name: Running tests 41 | run: /start test 42 | clean: 43 | if: ${{ always() }} 44 | needs: [testing] 45 | runs-on: [self-hosted, linux, x64] 46 | steps: 47 | - name: Cleaning 48 | run: /start clean 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | .project 3 | .cproject 4 | .settings 5 | test/test 6 | Makefile 7 | Makefile.in 8 | aclocal.m4 9 | compile 10 | config.guess 11 | config.h.in 12 | config.h.in~ 13 | config.sub 14 | m4/libtool.m4 15 | m4/ltoptions.m4 16 | m4/ltsugar.m4 17 | m4/ltversion.m4 18 | m4/lt~obsolete.m4 19 | config/aux 20 | configure 21 | install-sh 22 | ltmain.sh 23 | missing 24 | config.h 25 | config.log 26 | config.status 27 | libtool 28 | stamp-h1 29 | src/sharp/api/version.h 30 | autom4te.cache 31 | depcomp 32 | .libs 33 | *.la 34 | .deps 35 | .dirstamp 36 | *.lo 37 | *.o 38 | build-* 39 | sharp*tar.gz 40 | rpm-dist 41 | cov_build* 42 | debian/changelog 43 | debian/control 44 | debian/rules 45 | debian/sharp.postinst 46 | debian/nccl-rdma-sharp-plugins.postinst 47 | debian/nccl-rdma-sharp-plugins.prem 48 | sharp.spec 49 | sharp.pc 50 | doc/doxygen-doc 51 | doc/uml/uct.pdf 52 | test-driver 53 | install 54 | src/api/version.h 55 | tags 56 | valgrind*xml 57 | *.tap 58 | jenkins/* 59 | sharp-* 60 | config.cache 61 | nccl-rdma-sharp-plugins.pc 62 | 
nccl-rdma-sharp-plugins.spec 63 | 64 | # Prerequisites 65 | *.d 66 | 67 | # Compiled Object files 68 | *.slo 69 | *.lo 70 | *.o 71 | *.obj 72 | 73 | # Precompiled Headers 74 | *.gch 75 | *.pch 76 | 77 | # Compiled Dynamic libraries 78 | *.so 79 | *.dylib 80 | *.dll 81 | 82 | # Fortran module files 83 | *.mod 84 | *.smod 85 | 86 | # Compiled Static libraries 87 | *.lai 88 | *.la 89 | *.a 90 | *.lib 91 | 92 | # Executables 93 | *.exe 94 | *.out 95 | *.app 96 | 97 | .idea 98 | 99 | *.orig 100 | *.bak 101 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 3. Neither the name of the copyright holder nor the names of its 13 | contributors may be used to endorse or promote products derived from 14 | this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 22 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # See file LICENSE for terms. 7 | # 8 | 9 | SUBDIRS = src 10 | 11 | EXTRA_DIST = 12 | EXTRA_DIST += autogen.sh 13 | EXTRA_DIST += include 14 | EXTRA_DIST += debian 15 | EXTRA_DIST += nccl-rdma-sharp-plugins.spec 16 | 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nccl-rdma-sharp-plugins 2 | 3 | nccl-rdma-sharp plugin enables RDMA and Switch based collectives(SHARP) 4 | with [NVIDIA's NCCL](https://github.com/NVIDIA/nccl) library 5 | 6 | ## Overview 7 | 8 | ## Requirements 9 | 10 | * MOFED 11 | * CUDA 12 | * SHARP 13 | * NCCL 14 | * GPUDirectRDMA plugin 15 | 16 | ## Build Instructions 17 | 18 | ### build system requirements 19 | 20 | * CUDA 21 | * SHARP 22 | * MOFED 23 | 24 | Plugin uses GNU autotools for its build system. 
You can build it as follows: 25 | 26 | 27 | ``` 28 | $ ./autogen.sh 29 | $ ./configure 30 | $ make 31 | $ make install 32 | ``` 33 | 34 | The following flags enable building with custom dependencies: 35 | 36 | 37 | ``` 38 | --with-verbs=PATH Path to non-standard libibverbs installation 39 | --with-sharp=PATH Path to non-standard SHARP installation 40 | --with-cuda=PATH Path to non-standard CUDA installation 41 | ``` 42 | 43 | 44 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: BSD-3-Clause 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # 1. Redistributions of source code must retain the above copyright notice, this 11 | # list of conditions and the following disclaimer. 12 | # 13 | # 2. Redistributions in binary form must reproduce the above copyright notice, 14 | # this list of conditions and the following disclaimer in the documentation 15 | # and/or other materials provided with the distribution. 16 | # 17 | # 3. Neither the name of the copyright holder nor the names of its 18 | # contributors may be used to endorse or promote products derived from 19 | # this software without specific prior written permission. 20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | # DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | # 32 | 33 | rm -rf autom4te.cache 34 | autoreconf -ivf || exit 1 35 | rm -rf autom4te.cache 36 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # See file LICENSE for terms. 5 | # 6 | AC_PREREQ([2.63]) 7 | 8 | AC_COPYRIGHT([Copyright (c) 2019, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.]) 9 | 10 | define([nccl_rdma_sharp_plugins_ver_major], 2) 11 | define([nccl_rdma_sharp_plugins_ver_minor], 7) 12 | 13 | AC_INIT([nccl-rdma-sharp-plugins], [nccl_rdma_sharp_plugins_ver_major.nccl_rdma_sharp_plugins_ver_minor], [support@mellanox.com], [],[http://github.com/Mellanox/nccl-rdma-sharp-plugins]) 14 | 15 | AM_INIT_AUTOMAKE([1.10 foreign tar-ustar subdir-objects]) 16 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) 17 | AM_MAINTAINER_MODE 18 | AC_CONFIG_MACRO_DIR([m4]) 19 | 20 | AC_USE_SYSTEM_EXTENSIONS 21 | AC_GNU_SOURCE 22 | AC_CONFIG_HEADERS([config.h]) 23 | 24 | RPM_RELEASE=1 25 | MAJOR_VERSION=nccl_rdma_sharp_plugins_ver_major 26 | MINOR_VERSION=nccl_rdma_sharp_plugins_ver_minor 27 | VERSION=$MAJOR_VERSION.$MINOR_VERSION 28 | 29 | AC_SUBST(RPM_RELEASE) 30 | AC_SUBST(VERSION) 31 | AC_SUBST(MAJOR_VERSION) 32 | AC_SUBST(MINOR_VERSION) 33 | AC_SUBST([BUILD_DATE], [$(date +'%b/%d/%Y')]) 34 | AC_SUBST([BUILD_TIME], [$(date +'%H:%M:%S')]) 35 | 36 | # Checks for programs. 
37 | AC_GNU_SOURCE 38 | AC_PROG_CC 39 | AC_PROG_CC_STDC 40 | AC_PROG_CXX 41 | AM_PROG_AS 42 | AC_PROG_SED 43 | AC_PROG_INSTALL 44 | AC_PROG_LIBTOOL 45 | AC_HEADER_STDC 46 | LT_LIB_M 47 | 48 | AC_ARG_ENABLE([debug],AS_HELP_STRING([--enable-debug], [Enable extra debugging code (default is NO).]), 49 | [], [enable_debug=no]) 50 | 51 | if test $enable_debug = yes; then 52 | AC_DEFINE([ENABLE_DEBUG], [1], [Enable debugging code]) 53 | CFLAGS="$CFLAGS -O0 -g3 -Wall -Werror" 54 | else 55 | CFLAGS="$CFLAGS -O3 -DNDEBUG -Wall -Werror" 56 | fi 57 | 58 | #check for cuda (without --with-cuda, /usr/local/cuda is used and both lib64 and lib are searched, since cuda_libdir is only set when a path is given) 59 | AC_ARG_WITH([cuda], 60 | [AC_HELP_STRING([--with-cuda=PATH], 61 | [Path to non-standard CUDA installation])], 62 | [AS_IF([test -d $withval/lib64], [cuda_libdir="lib64"], [cuda_libdir="lib"]) 63 | CFLAGS="-I$withval/include $CFLAGS" 64 | LDFLAGS="-L$withval/$cuda_libdir $LDFLAGS"], 65 | [CFLAGS="-I/usr/local/cuda/include $CFLAGS" 66 | LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib $LDFLAGS"]) 67 | 68 | AC_CHECK_HEADER( [cuda_runtime.h], [], [AC_MSG_FAILURE([CUDA runtime header files not found])]) 69 | AC_CHECK_LIB([cudart], [cudaMalloc], [], [AC_MSG_FAILURE([CUDA runtime libs not found])]) 70 | 71 | #check for verbs 72 | AC_ARG_WITH([verbs], 73 | [AC_HELP_STRING([--with-verbs(=DIR)], 74 | [Build Infiniband support, adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])], 75 | [CFLAGS="-I$with_verbs/include $CFLAGS" 76 | LDFLAGS="-L$with_verbs/lib64 -L$with_verbs/lib -libverbs $LDFLAGS"], 77 | [CFLAGS="-I/usr/include $CFLAGS" 78 | LDFLAGS="-L/usr/lib64 -L/usr/lib -libverbs $LDFLAGS"]) 79 | 80 | AC_CHECK_HEADER( [infiniband/verbs.h], [],[AC_MSG_FAILURE([ibverbs header files not found])]) 81 | AC_CHECK_LIB([ibverbs], [ibv_get_device_list], [],[AC_MSG_FAILURE([libibverbs not found]);]) 82 | 83 | AC_CHECK_DECLS([IBV_ACCESS_RELAXED_ORDERING, IBV_QPF_GRH_REQUIRED, ibv_reg_dmabuf_mr, ibv_query_ece, ibv_set_ece], [], [], 84 | [[#include <infiniband/verbs.h>]]) 85 | 86 | # check for ucx 87 | 
AM_CONDITIONAL([HAVE_UCX_PLUGIN], [false]) 88 | m4_include([m4/ucx.m4]) 89 | CHECK_UCX 90 | AC_MSG_RESULT([UCX support: $ucx_happy]) 91 | 92 | # check for sharp 93 | AM_CONDITIONAL([HAVE_SHARP_PLUGIN], [false]) 94 | m4_include([m4/sharp.m4]) 95 | CHECK_SHARP 96 | AC_MSG_RESULT([SHARP support: $sharp_happy]) 97 | 98 | #check for required headers 99 | AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h], [], 100 | [AC_MSG_FAILURE([unable to find required headers])]) 101 | 102 | AC_CONFIG_FILES([Makefile 103 | src/Makefile 104 | nccl-rdma-sharp-plugins.spec 105 | nccl-rdma-sharp-plugins.pc 106 | debian/changelog 107 | debian/control 108 | debian/nccl-rdma-sharp-plugins.postinst 109 | debian/nccl-rdma-sharp-plugins.prem 110 | debian/rules 111 | ]) 112 | AC_OUTPUT 113 | 114 | echo "NCCL-RDMA-SHARP Plugin has been configured." 115 | -------------------------------------------------------------------------------- /contrib/buildrpm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eE 2 | # 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 4 | # Copyright (c) 2001-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: BSD-3-Clause 6 | # See file LICENSE for terms. 7 | # 8 | 9 | PACKAGE=nccl-rdma-sharp-plugins 10 | WS=$PWD 11 | rpmspec=${PACKAGE}.spec 12 | rpmmacros="--define='_rpmdir ${WS}/rpm-dist' --define='_srcrpmdir ${WS}/rpm-dist' --define='_sourcedir ${WS}' --define='_specdir ${WS}' --define='_builddir ${WS}'" 13 | rpmopts="--nodeps --buildroot='${WS}/_rpm'" 14 | 15 | 16 | 17 | opt_tarball=0 18 | opt_srcrpm=0 19 | opt_binrpm=0 20 | 21 | while test "$1" != ""; do 22 | case $1 in 23 | --tarball|-t) opt_tarball=1 ;; 24 | --srcrpm|-s) opt_srcrpm=1 ;; 25 | --binrpm|-b) opt_binrpm=1 ;; 26 | *) 27 | cat < 35 | 36 | -- Mellanox Ltd. 
Wed, 11 Sep 2013 15:24:22 +0300 37 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 8 2 | -------------------------------------------------------------------------------- /debian/control.in: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # 2. Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # 3. Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | # 31 | 32 | Source: @PACKAGE@ 33 | Section: libs 34 | Priority: extra 35 | Maintainer: support@mellanox.com 36 | Build-Depends: libibverbs-dev 37 | Standards-Version: @MAJOR_VERSION@.@MINOR_VERSION@ 38 | Homepage: http://www.mellanox.com 39 | 40 | Package: @PACKAGE@ 41 | Section: libs 42 | Depends: ${shlibs:Depends}, ${misc:Depends} 43 | Architecture: any 44 | Description: RDMA and SHARP plugin for NCCL 45 | Plugin enabled RDMA and switch collectives(SHARP) in NCCL 46 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: NCCL-RDMA-SHARP plugins 3 | Source: http://www.mellanox.com 4 | 5 | Files: * 6 | Copyright (c) 2015-2019, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 | License: BSD 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions 10 | are met: 11 | 12 | 1. Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | 2. 
Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | 3. Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /debian/nccl-rdma-sharp-plugins.postinst.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: BSD-3-Clause 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # 1. Redistributions of source code must retain the above copyright notice, this 11 | # list of conditions and the following disclaimer. 12 | # 13 | # 2. 
Redistributions in binary form must reproduce the above copyright notice, 14 | # this list of conditions and the following disclaimer in the documentation 15 | # and/or other materials provided with the distribution. 16 | # 17 | # 3. Neither the name of the copyright holder nor the names of its 18 | # contributors may be used to endorse or promote products derived from 19 | # this software without specific prior written permission. 20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | # 32 | 33 | set -e 34 | if [ @prefix@ != /usr/lib/pkgconfig ];then 35 | install -m 755 @prefix@/lib/pkgconfig/nccl-rdma-sharp-plugins.pc /usr/lib/pkgconfig/nccl-rdma-sharp-plugins.pc 36 | fi 37 | 38 | -------------------------------------------------------------------------------- /debian/nccl-rdma-sharp-plugins.prem.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
5 | # SPDX-License-Identifier: BSD-3-Clause 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # 1. Redistributions of source code must retain the above copyright notice, this 11 | # list of conditions and the following disclaimer. 12 | # 13 | # 2. Redistributions in binary form must reproduce the above copyright notice, 14 | # this list of conditions and the following disclaimer in the documentation 15 | # and/or other materials provided with the distribution. 16 | # 17 | # 3. Neither the name of the copyright holder nor the names of its 18 | # contributors may be used to endorse or promote products derived from 19 | # this software without specific prior written permission. 20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | # 32 | 33 | PCF=/usr/lib/pkgconfig/nccl-rdma-sharp-plugins.pc 34 | 35 | if [ -f $PCF ];then 36 | rm -f $PCF 37 | fi 38 | -------------------------------------------------------------------------------- /debian/rules.in: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | # 4 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 5 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 6 | # SPDX-License-Identifier: BSD-3-Clause 7 | # 8 | # Redistribution and use in source and binary forms, with or without 9 | # modification, are permitted provided that the following conditions are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright notice, this 12 | # list of conditions and the following disclaimer. 13 | # 14 | # 2. Redistributions in binary form must reproduce the above copyright notice, 15 | # this list of conditions and the following disclaimer in the documentation 16 | # and/or other materials provided with the distribution. 17 | # 18 | # 3. Neither the name of the copyright holder nor the names of its 19 | # contributors may be used to endorse or promote products derived from 20 | # this software without specific prior written permission. 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 25 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 28 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | # 33 | 34 | # Sample debian/rules that uses debhelper. 35 | # This file was originally written by Joey Hess and Craig Small. 36 | # As a special exception, when this file is copied by dh-make into a 37 | # dh-make output file, you may use that output file without restriction. 38 | # This special exception was added by Craig Small in version 0.37 of dh-make. 39 | 40 | # Uncomment this to turn on verbose mode. 41 | #export DH_VERBOSE=1 42 | 43 | %: 44 | dh $@ 45 | 46 | override_dh_auto_configure: 47 | contrib/configure-release 48 | chmod +x debian/rules 49 | 50 | override_dh_shlibdeps: 51 | dh_shlibdeps --dpkg-shlibdeps-params=--ignore-missing-info 52 | 53 | override_dh_auto_clean: 54 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_CORE_H_ 10 | #define NCCL_CORE_H_ 11 | 12 | #include "nccl.h" 13 | #include "debug.h" 14 | 15 | #include 16 | #include 17 | 18 | #define MIN(a, b) ((a)<(b)?(a):(b)) 19 | #define MAX(a, b) ((a)>(b)?(a):(b)) 20 | 21 | #define DIVUP(x, y) \ 22 | (((x)+(y)-1)/(y)) 23 | #define ROUNDUP(x, y) \ 24 | (DIVUP((x), (y))*(y)) 25 | 26 | // Check CUDA calls 27 | #define CUDACHECK(cmd) do { \ 28 | cudaError_t err = cmd; \ 29 | if( err != cudaSuccess ) { \ 30 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ 31 | return ncclUnhandledCudaError; \ 32 | } \ 33 | } while(false) 34 | 35 | #define CUDACHECKGOTO(cmd, RES, label) do { \ 36 | cudaError_t err = cmd; \ 37 | if( err != cudaSuccess ) { \ 38 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ 39 | RES = ncclUnhandledCudaError; \ 40 | goto label; \ 41 | } \ 42 | } while(false) 43 | 44 | // Report failure but clear error and continue 45 | #define CUDACHECKIGNORE(cmd) do { \ 46 | cudaError_t err = cmd; \ 47 | if( err != cudaSuccess ) { \ 48 | INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ 49 | (void) cudaGetLastError(); \ 50 | } \ 51 | } while(false) 52 | 53 | #include 54 | // Check system calls 55 | #define SYSCHECK(statement, name) do { \ 56 | int retval; \ 57 | SYSCHECKSYNC((statement), name, retval); \ 58 | if (retval == -1) { \ 59 | WARN("Call to " name " failed: %s", strerror(errno)); \ 60 | return ncclSystemError; \ 61 | } \ 62 | } while (false) 63 | 64 | #define SYSCHECKSYNC(statement, name, retval) do { \ 65 | retval = (statement); \ 66 | if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ 67 | INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ 68 | } else { \ 69 | break; \ 70 | } \ 71 | } while(true) 72 | 73 | #define 
SYSCHECKGOTO(statement, name, RES, label) do { \ 74 | int retval; \ 75 | SYSCHECKSYNC((statement), name, retval); \ 76 | if (retval == -1) { \ 77 | WARN("Call to " name " failed: %s", strerror(errno)); \ 78 | RES = ncclSystemError; \ 79 | goto label; \ 80 | } \ 81 | } while (0) 82 | 83 | // Pthread calls don't set errno and never return EINTR. 84 | #define PTHREADCHECK(statement, name) do { \ 85 | int retval = (statement); \ 86 | if (retval != 0) { \ 87 | WARN("Call to " name " failed: %s", strerror(retval)); \ 88 | return ncclSystemError; \ 89 | } \ 90 | } while (0) 91 | 92 | #define PTHREADCHECKGOTO(statement, name, RES, label) do { \ 93 | int retval = (statement); \ 94 | if (retval != 0) { \ 95 | WARN("Call to " name " failed: %s", strerror(retval)); \ 96 | RES = ncclSystemError; \ 97 | goto label; \ 98 | } \ 99 | } while (0) 100 | 101 | 102 | #define NEQCHECK(statement, value) do { \ 103 | if ((statement) != value) { \ 104 | /* Print the back trace*/ \ 105 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ 106 | return ncclSystemError; \ 107 | } \ 108 | } while (0) 109 | 110 | #define NEQCHECKGOTO(statement, value, RES, label) do { \ 111 | if ((statement) != value) { \ 112 | /* Print the back trace*/ \ 113 | RES = ncclSystemError; \ 114 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ 115 | goto label; \ 116 | } \ 117 | } while (0) 118 | 119 | #define EQCHECK(statement, value) do { \ 120 | if ((statement) == value) { \ 121 | /* Print the back trace*/ \ 122 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ 123 | return ncclSystemError; \ 124 | } \ 125 | } while (0) 126 | 127 | #define EQCHECKGOTO(statement, value, RES, label) do { \ 128 | if ((statement) == value) { \ 129 | /* Print the back trace*/ \ 130 | RES = ncclSystemError; \ 131 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ 132 | goto label; \ 133 | } \ 
134 | } while (0) 135 | 136 | // Propagate errors up 137 | #define NCCLCHECK(call) do { \ 138 | ncclResult_t RES = call; \ 139 | if (RES != ncclSuccess && RES != ncclInProgress) { \ 140 | /* Print the back trace*/ \ 141 | return RES; \ 142 | } \ 143 | } while (0) 144 | 145 | #define NCCLCHECKGOTO(call, RES, label) do { \ 146 | RES = call; \ 147 | if (RES != ncclSuccess && RES != ncclInProgress) { \ 148 | /* Print the back trace*/ \ 149 | goto label; \ 150 | } \ 151 | } while (0) 152 | 153 | #define NCCLWAIT(call, cond, abortFlagPtr) do { \ 154 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ 155 | ncclResult_t RES = call; \ 156 | if (RES != ncclSuccess && RES != ncclInProgress) { \ 157 | return ncclInternalError; \ 158 | } \ 159 | if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ 160 | } while (!(cond)) 161 | 162 | #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ 163 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ 164 | RES = call; \ 165 | if (RES != ncclSuccess && RES != ncclInProgress) { \ 166 | goto label; \ 167 | } \ 168 | if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ 169 | } while (!(cond)) 170 | 171 | #define NCCLCHECKTHREAD(a, args) do { \ 172 | if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ 173 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ 174 | return args; \ 175 | } \ 176 | } while(0) 177 | 178 | #define CUDACHECKTHREAD(a) do { \ 179 | if ((a) != cudaSuccess) { \ 180 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ 181 | args->ret = ncclUnhandledCudaError; \ 182 | return args; \ 183 | } \ 184 | } while(0) 185 | 186 | #endif 187 | -------------------------------------------------------------------------------- /include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: 
NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_DEBUG_H_ 10 | #define NCCL_DEBUG_H_ 11 | 12 | #include "core.h" 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include "net.h" 21 | 22 | // Conform to pthread and NVTX standard 23 | #define NCCL_THREAD_NAMELEN 16 24 | 25 | extern pthread_mutex_t ncclDebugLock; 26 | 27 | extern ncclDebugLogger_t pluginLogFunction; 28 | /* Logging macros: each forwards to pluginLogFunction with the call-site line number. WARN tags the call with __FILE__, INFO and TRACE with __func__. TRACE expands to nothing unless ENABLE_TRACE is defined. */ 29 | #define WARN(...) pluginLogFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 30 | #define INFO(FLAGS, ...) pluginLogFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 31 | 32 | #ifdef ENABLE_TRACE 33 | #define TRACE(FLAGS, ...) pluginLogFunction(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 34 | #else 35 | #define TRACE(...) 36 | #endif 37 | 38 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 39 | /* Declared with (void) so this is a real prototype: in C an empty parameter list leaves the arguments unchecked. */ 40 | void ncclResetDebugInit(void); 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /include/ibvwrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 3 | * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. 4 | * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. 5 | * Copyright (c) 2005 PathScale, Inc. All rights reserved. 6 | * 7 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 8 | * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
9 | * SPDX-License-Identifier: BSD-3-Clause 10 | * 11 | * See LICENSE.txt for license information 12 | ************************************************************************/ 13 | 14 | #ifndef NCCL_IBVWRAP_H_ 15 | #define NCCL_IBVWRAP_H_ 16 | #include "config.h" 17 | #include "core.h" 18 | #include "utils.h" 19 | #include 20 | #include 21 | #include 22 | 23 | #if !HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING 24 | # define IBV_ACCESS_RELAXED_ORDERING 0 25 | #endif 26 | #if !HAVE_DECL_IBV_QPF_GRH_REQUIRED 27 | # define IBV_QPF_GRH_REQUIRED 0 28 | #endif 29 | 30 | #if !HAVE_DECL_IBV_SET_ECE /* struct ibv_ece below is compiled only when HAVE_DECL_IBV_SET_ECE is 0 or undefined (presumably when the system verbs headers lack ECE support; confirm against configure.ac) */ 31 | struct ibv_ece { 32 | /* 33 | * Unique identifier of the provider vendor on the network. 34 | * Providers set their IEEE OUI here to distinguish 35 | * themselves in a non-homogeneous network. 36 | */ 37 | uint32_t vendor_id; 38 | /* 39 | * Provider-specific attributes that are supported or 40 | * need to be enabled by ECE users. 41 | */ 42 | uint32_t options; 43 | uint32_t comp_mask; 44 | }; 45 | #endif 46 | 47 | ncclResult_t wrap_ibv_fork_init(void); 48 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); 49 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); 50 | const char *wrap_ibv_get_device_name(struct ibv_device *device); 51 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); 52 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context); 53 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); 54 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); 55 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); 56 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); 57 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); 58 | ncclResult_t
wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); 59 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); 60 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); 61 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); 62 | struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); 63 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); 64 | /* DMA-BUF support */ 65 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 66 | struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 67 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); 68 | ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); 69 | ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); 70 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); 71 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); 72 | static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { 73 | int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ 74 | if (done < 0) { 75 | WARN("Call to ibv_poll_cq() returned %d", done); 76 | return ncclSystemError; 77 | } 78 | *num_done = done; 79 | return ncclSuccess; 80 | } 81 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); 82 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, 
struct ibv_qp_attr *attr, int attr_mask); 83 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); 84 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); 85 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); 86 | ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); 87 | ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); 88 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); 89 | 90 | // converts a GID into a readable string. On success, returns a non-null pointer to gidStr. 91 | // NULL is returned if there was an error, with errno set to indicate the error. 92 | // errno = ENOSPC if the converted string would exceed strLen. 93 | static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) { 94 | // GID is a 16B handle, to convert it to a readable form, we use inet_ntop 95 | // sizeof(ibv_gid) == sizeof(struct in6_addr), so using AF_INET6 96 | NCCL_STATIC_ASSERT(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr"); 97 | return inet_ntop(AF_INET6, gid->raw, gidStr, strLen); 98 | } 99 | 100 | #endif //End include guard 101 | -------------------------------------------------------------------------------- /include/net.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef NCCL_NET_H_ 8 | #define NCCL_NET_H_ 9 | 10 | #include 11 | #include 12 | 13 | #define NCCL_NET_HANDLE_MAXSIZE 128 14 | //Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties 15 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) 16 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 17 | 18 | 19 | #define NCCL_PTR_HOST 0x1 20 | #define NCCL_PTR_CUDA 0x2 21 | #define NCCL_PTR_DMABUF 0x4 22 | 23 | // Maximum number of requests per comm object 24 | #define NCCL_NET_MAX_REQUESTS 8 25 | 26 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 27 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; 28 | 29 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 30 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); 31 | 32 | #include "net_v10.h" 33 | #include "net_v9.h" 34 | #include "net_v8.h" 35 | #include "net_v7.h" 36 | #include "net_v6.h" 37 | #include "net_v5.h" 38 | 39 | #endif // end include guard 40 | -------------------------------------------------------------------------------- /include/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NET_DEVICE_H_ 10 | #define NET_DEVICE_H_ 11 | 12 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 13 | #define NCCL_NET_MTU_SIZE 4096 14 | 15 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 16 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 17 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 18 | 19 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 20 | 21 | typedef struct { 22 | ncclNetDeviceType netDeviceType; // Network offload type 23 | int netDeviceVersion; // Version number for network offload 24 | void* handle; 25 | size_t size; 26 | int needsProxyProgress; 27 | } ncclNetDeviceHandle_v7_t; 28 | 29 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 30 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; 31 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; 32 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/net_v10.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | #ifndef NCCL_NET_V10_H_ 7 | #define NCCL_NET_V10_H_ 8 | 9 | #include "net_device.h" 10 | 11 | #define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 12 | #define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 13 | typedef struct { 14 | int ndevs; 15 | int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; 16 | } ncclNetVDeviceProps_v10_t; 17 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; 18 | 19 | #define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 20 | typedef struct { 21 | // Plugin-specific TC value 22 | int trafficClass; 23 | } ncclNetCommConfig_v10_t; 24 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; 25 | 26 | typedef struct { 27 | char* name; // Used mostly for logging. 28 | char* pciPath; // Path to the PCI device in /sys. 29 | uint64_t guid; // Unique identifier for the NIC chip. Important for 30 | // cards with multiple PCI functions (Physical or virtual). 31 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] 32 | int regIsGlobal; // regMr is not tied to a particular comm 33 | int forceFlush; // Force a flush on receives 34 | int speed; // Port speed in Mbps. 35 | int port; // Port number. 36 | float latency; // Network latency 37 | int maxComms; // Maximum number of comms we can create 38 | int maxRecvs; // Maximum number of grouped receives. 39 | ncclNetDeviceType netDeviceType; // Network offload type 40 | int netDeviceVersion; // Version number for network offload 41 | ncclNetVDeviceProps_v10_t vProps; 42 | size_t maxP2pBytes; // Max transfer size for point-to-point operations 43 | size_t maxCollBytes; // Max transfer size for collective operations 44 | } ncclNetProperties_v10_t; 45 | 46 | typedef ncclNetProperties_v10_t ncclNetProperties_t; 47 | 48 | typedef struct { 49 | // Name of the network (mainly for logs) 50 | const char* name; 51 | // Initialize the network. 
52 | ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); 53 | // Return the number of adapters. 54 | ncclResult_t (*devices)(int* ndev); 55 | // Get various device properties. 56 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); 57 | // Create a receiving object and provide a handle to connect to it. The 58 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 59 | // between ranks to create a connection. 60 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 61 | // Connect to a handle and return a sending comm object for that peer. 62 | // This call must not block for the connection to be established, and instead 63 | // should return successfully with sendComm == NULL with the expectation that 64 | // it will be called again until sendComm != NULL. 65 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection 66 | ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); 67 | // Finalize connection establishment after remote peer has called connect. 68 | // This call must not block for the connection to be established, and instead 69 | // should return successfully with recvComm == NULL with the expectation that 70 | // it will be called again until recvComm != NULL. 71 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection 72 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); 73 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 74 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
75 | ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); 76 | /* DMA-BUF support */ 77 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 78 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 79 | // Asynchronous send to a peer. 80 | // May return request == NULL if the call cannot be performed (or would block) 81 | ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); 82 | // Asynchronous recv from a peer. 83 | // May return request == NULL if the call cannot be performed (or would block) 84 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); 85 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 86 | // visible to the GPU 87 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 88 | // Test whether a request is complete. If size is not NULL, it returns the 89 | // number of bytes sent/received. 90 | ncclResult_t (*test)(void* request, int* done, int* sizes); 91 | // Close and free send/recv comm objects 92 | ncclResult_t (*closeSend)(void* sendComm); 93 | ncclResult_t (*closeRecv)(void* recvComm); 94 | ncclResult_t (*closeListen)(void* listenComm); 95 | 96 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code 97 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); 98 | 99 | // Notify the plugin that a recv has completed by the device 100 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); 101 | 102 | // Virtual NIC APIs. 
makeVDevice will create a virtual NIC given the specified properties, and tell the caller 103 | // what index this new vNIC exists at 104 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); 105 | } ncclNet_v10_t; 106 | 107 | typedef struct { 108 | void* mhandle; 109 | void* address; 110 | size_t size; 111 | } ncclNetSGE_v10_t; 112 | 113 | typedef struct { 114 | // Name of the collective network (mainly for logs) 115 | const char* name; 116 | // Initialize the collective network. 117 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 118 | // Return the number of adapters capable of doing collective operations. 119 | // If ndev returns 0, all other functions might be set to NULL. 120 | ncclResult_t (*devices)(int* ndev); 121 | // Get various device properties. 122 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); 123 | // Create a receiving object and provide a handle to connect to it. The 124 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 125 | // between ranks to create connections. 126 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 127 | // Create a group for collective operations. handles have been created 128 | // using listen() above. rank indicates caller's rank in the collective network. 129 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); 130 | // Returns whether a reduction operation on a data type is supported. 131 | // 1 for supported, 0 otherwise. 132 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); 133 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
134 | ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); 135 | /* DMA-BUF support */ 136 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 137 | ncclResult_t (*deregMr)(void* collComm, void* mhandle); 138 | // Performs an asynchronous allreduce operation on the collective group. 139 | // May return request == NULL if the call cannot be performed (or would block). 140 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, 141 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); 142 | ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, 143 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes, 144 | void* sendMhandle, void** request); 145 | ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, 146 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes, 147 | ncclDataType_t dataType, ncclRedOp_t redOp, 148 | void* recvMhandle, void** request); 149 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 150 | // visible to the GPU 151 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); 152 | // Test whether a request is complete. If size is not NULL, it returns the 153 | // number of bytes sent/received. 
154 | ncclResult_t (*test)(void* request, int* done, int* size); 155 | // Close and free collective comm objects 156 | ncclResult_t (*closeColl)(void* collComm); 157 | ncclResult_t (*closeListen)(void* listenComm); 158 | // Create a virtual NIC given the specified properties, which can be accessed at device index d 159 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); 160 | } ncclCollNet_v10_t; 161 | 162 | typedef ncclCollNet_v10_t ncclCollNet_t; 163 | 164 | #endif // end include guard 165 | -------------------------------------------------------------------------------- /include/net_v5.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef NCCL_NET_V5_H_ 8 | #define NCCL_NET_V5_H_ 9 | 10 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; 11 | typedef struct { 12 | // Name of the network (mainly for logs) 13 | const char* name; 14 | // Initialize the network. 15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 16 | // Return the number of adapters. 17 | ncclResult_t (*devices)(int* ndev); 18 | // Get various device properties. 19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); 20 | // Create a receiving object and provide a handle to connect to it. The 21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 22 | // between ranks to create a connection. 23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 24 | // Connect to a handle and return a sending comm object for that peer. 25 | // This call must not block for the connection to be established, and instead 26 | // should return successfully with sendComm == NULL with the expectation that 27 | // it will be called again until sendComm != NULL. 
28 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 29 | // Finalize connection establishment after remote peer has called connect. 30 | // This call must not block for the connection to be established, and instead 31 | // should return successfully with recvComm == NULL with the expectation that 32 | // it will be called again until recvComm != NULL. 33 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 34 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 35 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 36 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 37 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 38 | // Asynchronous send to a peer. 39 | // May return request == NULL if the call cannot be performed (or would block) 40 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 41 | // Asynchronous recv from a peer. 42 | // May return request == NULL if the call cannot be performed (or would block) 43 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 44 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 45 | // visible to the GPU 46 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 47 | // Test whether a request is complete. If size is not NULL, it returns the 48 | // number of bytes sent/received. 49 | ncclResult_t (*test)(void* request, int* done, int* sizes); 50 | // Close and free send/recv comm objects 51 | ncclResult_t (*closeSend)(void* sendComm); 52 | ncclResult_t (*closeRecv)(void* recvComm); 53 | ncclResult_t (*closeListen)(void* listenComm); 54 | } ncclNet_v5_t; 55 | 56 | 57 | typedef struct { 58 | // Name of the collective network (mainly for logs) 59 | const char* name; 60 | // Initialize the collective network. 
61 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 62 | // Return the number of adapters capable of doing collective operations. 63 | // If ndev returns 0, all other functions might be set to NULL. 64 | ncclResult_t (*devices)(int* ndev); 65 | // Get various device properties. 66 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); 67 | // Create a receiving object and provide a handle to connect to it. The 68 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 69 | // between ranks to create connections. 70 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 71 | // Create a group for collective operations. handles have been created 72 | // using listen() above. rank indicates caller's rank in the collective network. 73 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); 74 | // Returns whether a reduction operation on a data type is supported. 75 | // 1 for supported, 0 otherwise. 76 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); 77 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 78 | ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); 79 | ncclResult_t (*deregMr)(void* collComm, void* mhandle); 80 | // Performs an asynchronous allreduce operation on the collective group. 81 | // May return request == NULL if the call cannot be performed (or would block). 82 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, 83 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); 84 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 85 | // visible to the GPU 86 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); 87 | // Test whether a request is complete. 
If size is not NULL, it returns the 88 | // number of bytes sent/received. 89 | ncclResult_t (*test)(void* request, int* done, int* size); 90 | // Close and free collective comm objects 91 | ncclResult_t (*closeColl)(void* collComm); 92 | ncclResult_t (*closeListen)(void* listenComm); 93 | } ncclCollNet_v5_t; 94 | 95 | #endif 96 | -------------------------------------------------------------------------------- /include/net_v6.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef NCCL_NET_V6_H_ 8 | #define NCCL_NET_V6_H_ 9 | 10 | typedef struct { 11 | char* name; // Used mostly for logging. 12 | char* pciPath; // Path to the PCI device in /sys. 13 | uint64_t guid; // Unique identifier for the NIC chip. Important for 14 | // cards with multiple PCI functions (Physical or virtual). 15 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] 16 | int speed; // Port speed in Mbps. 17 | int port; // Port number. 18 | float latency; // Network latency 19 | int maxComms; // Maximum number of comms we can create 20 | int maxRecvs; // Maximum number of grouped receives. 21 | }ncclNetProperties_v6_t; 22 | 23 | typedef struct { 24 | // Name of the network (mainly for logs) 25 | const char* name; 26 | // Initialize the network. 27 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 28 | // Return the number of adapters. 29 | ncclResult_t (*devices)(int* ndev); 30 | // Get various device properties. 31 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); 32 | // Create a receiving object and provide a handle to connect to it. The 33 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 34 | // between ranks to create a connection. 
35 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 36 | // Connect to a handle and return a sending comm object for that peer. 37 | // This call must not block for the connection to be established, and instead 38 | // should return successfully with sendComm == NULL with the expectation that 39 | // it will be called again until sendComm != NULL. 40 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 41 | // Finalize connection establishment after remote peer has called connect. 42 | // This call must not block for the connection to be established, and instead 43 | // should return successfully with recvComm == NULL with the expectation that 44 | // it will be called again until recvComm != NULL. 45 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 46 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 47 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 48 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 49 | /* DMA-BUF support */ 50 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 51 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 52 | // Asynchronous send to a peer. 53 | // May return request == NULL if the call cannot be performed (or would block) 54 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 55 | // Asynchronous recv from a peer. 56 | // May return request == NULL if the call cannot be performed (or would block) 57 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 58 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 59 | // visible to the GPU 60 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 61 | // Test whether a request is complete. 
If size is not NULL, it returns the 62 | // number of bytes sent/received. 63 | ncclResult_t (*test)(void* request, int* done, int* sizes); 64 | // Close and free send/recv comm objects 65 | ncclResult_t (*closeSend)(void* sendComm); 66 | ncclResult_t (*closeRecv)(void* recvComm); 67 | ncclResult_t (*closeListen)(void* listenComm); 68 | } ncclNet_v6_t; 69 | 70 | typedef struct { 71 | // Name of the collective network (mainly for logs) 72 | const char* name; 73 | // Initialize the collective network. 74 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 75 | // Return the number of adapters capable of doing collective operations. 76 | // If ndev returns 0, all other functions might be set to NULL. 77 | ncclResult_t (*devices)(int* ndev); 78 | // Get various device properties. 79 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); 80 | // Create a receiving object and provide a handle to connect to it. The 81 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 82 | // between ranks to create connections. 83 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 84 | // Create a group for collective operations. handles have been created 85 | // using listen() above. rank indicates caller's rank in the collective network. 86 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); 87 | // Returns whether a reduction operation on a data type is supported. 88 | // 1 for supported, 0 otherwise. 89 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); 90 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
91 | ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); 92 | /* DMA-BUF support */ 93 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 94 | ncclResult_t (*deregMr)(void* collComm, void* mhandle); 95 | // Performs an asynchronous allreduce operation on the collective group. 96 | // May return request == NULL if the call cannot be performed (or would block). 97 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, 98 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); 99 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 100 | // visible to the GPU 101 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); 102 | // Test whether a request is complete. If size is not NULL, it returns the 103 | // number of bytes sent/received. 104 | ncclResult_t (*test)(void* request, int* done, int* size); 105 | // Close and free collective comm objects 106 | ncclResult_t (*closeColl)(void* collComm); 107 | ncclResult_t (*closeListen)(void* listenComm); 108 | } ncclCollNet_v6_t; 109 | 110 | #endif // end include guard 111 | -------------------------------------------------------------------------------- /include/net_v7.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef NCCL_NET_V7_H_ 8 | #define NCCL_NET_V7_H_ 9 | 10 | #include "net_device.h" 11 | 12 | typedef struct { 13 | char* name; // Used mostly for logging. 14 | char* pciPath; // Path to the PCI device in /sys. 15 | uint64_t guid; // Unique identifier for the NIC chip. 
Important for 16 | // cards with multiple PCI functions (Physical or virtual). 17 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] 18 | int speed; // Port speed in Mbps. 19 | int port; // Port number. 20 | float latency; // Network latency 21 | int maxComms; // Maximum number of comms we can create 22 | int maxRecvs; // Maximum number of grouped receives. 23 | ncclNetDeviceType netDeviceType; // Network offload type 24 | int netDeviceVersion; // Version number for network offload 25 | } ncclNetProperties_v7_t; 26 | 27 | typedef struct { 28 | // Name of the network (mainly for logs) 29 | const char* name; 30 | // Initialize the network. 31 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 32 | // Return the number of adapters. 33 | ncclResult_t (*devices)(int* ndev); 34 | // Get various device properties. 35 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); 36 | // Create a receiving object and provide a handle to connect to it. The 37 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 38 | // between ranks to create a connection. 39 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 40 | // Connect to a handle and return a sending comm object for that peer. 41 | // This call must not block for the connection to be established, and instead 42 | // should return successfully with sendComm == NULL with the expectation that 43 | // it will be called again until sendComm != NULL. 44 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection 45 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); 46 | // Finalize connection establishment after remote peer has called connect. 
47 | // This call must not block for the connection to be established, and instead 48 | // should return successfully with recvComm == NULL with the expectation that 49 | // it will be called again until recvComm != NULL. 50 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection 51 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); 52 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 53 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 54 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 55 | /* DMA-BUF support */ 56 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 57 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 58 | // Asynchronous send to a peer. 59 | // May return request == NULL if the call cannot be performed (or would block) 60 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 61 | // Asynchronous recv from a peer. 62 | // May return request == NULL if the call cannot be performed (or would block) 63 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 64 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 65 | // visible to the GPU 66 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 67 | // Test whether a request is complete. If size is not NULL, it returns the 68 | // number of bytes sent/received. 
69 | ncclResult_t (*test)(void* request, int* done, int* sizes); 70 | // Close and free send/recv comm objects 71 | ncclResult_t (*closeSend)(void* sendComm); 72 | ncclResult_t (*closeRecv)(void* recvComm); 73 | ncclResult_t (*closeListen)(void* listenComm); 74 | 75 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code 76 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); 77 | 78 | // Notify the plugin that a recv has completed by the device 79 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); 80 | } ncclNet_v7_t; 81 | 82 | // v7 struct for backwards compatibility 83 | typedef struct { 84 | // Name of the collective network (mainly for logs) 85 | const char* name; 86 | // Initialize the collective network. 87 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 88 | // Return the number of adapters capable of doing collective operations. 89 | // If ndev returns 0, all other functions might be set to NULL. 90 | ncclResult_t (*devices)(int* ndev); 91 | // Get various device properties. 92 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); 93 | // Create a receiving object and provide a handle to connect to it. The 94 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 95 | // between ranks to create connections. 96 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 97 | // Create a group for collective operations. handles have been created 98 | // using listen() above. rank indicates caller's rank in the collective network. 99 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); 100 | // Returns whether a reduction operation on a data type is supported. 101 | // 1 for supported, 0 otherwise. 102 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); 103 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
104 | ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); 105 | /* DMA-BUF support */ 106 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 107 | ncclResult_t (*deregMr)(void* collComm, void* mhandle); 108 | // Performs an asynchronous allreduce operation on the collective group. 109 | // May return request == NULL if the call cannot be performed (or would block). 110 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, 111 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); 112 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 113 | // visible to the GPU 114 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); 115 | // Test whether a request is complete. If size is not NULL, it returns the 116 | // number of bytes sent/received. 117 | ncclResult_t (*test)(void* request, int* done, int* size); 118 | // Close and free collective comm objects 119 | ncclResult_t (*closeColl)(void* collComm); 120 | ncclResult_t (*closeListen)(void* listenComm); 121 | } ncclCollNet_v7_t; 122 | 123 | #endif // end include guard 124 | -------------------------------------------------------------------------------- /include/net_v8.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef NCCL_NET_V8_H_ 8 | #define NCCL_NET_V8_H_ 9 | #include "net_device.h" 10 | 11 | typedef struct { 12 | char* name; // Used mostly for logging. 13 | char* pciPath; // Path to the PCI device in /sys. 14 | uint64_t guid; // Unique identifier for the NIC chip. 
Important for 15 | // cards with multiple PCI functions (Physical or virtual). 16 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] 17 | int regIsGlobal; // regMr is not tied to a particular comm 18 | int speed; // Port speed in Mbps. 19 | int port; // Port number. 20 | float latency; // Network latency 21 | int maxComms; // Maximum number of comms we can create 22 | int maxRecvs; // Maximum number of grouped receives. 23 | ncclNetDeviceType netDeviceType; // Network offload type 24 | int netDeviceVersion; // Version number for network offload 25 | } ncclNetProperties_v8_t; 26 | 27 | typedef struct { 28 | // Name of the network (mainly for logs) 29 | const char* name; 30 | // Initialize the network. 31 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 32 | // Return the number of adapters. 33 | ncclResult_t (*devices)(int* ndev); 34 | // Get various device properties. 35 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); 36 | // Create a receiving object and provide a handle to connect to it. The 37 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 38 | // between ranks to create a connection. 39 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 40 | // Connect to a handle and return a sending comm object for that peer. 41 | // This call must not block for the connection to be established, and instead 42 | // should return successfully with sendComm == NULL with the expectation that 43 | // it will be called again until sendComm != NULL. 44 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection 45 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); 46 | // Finalize connection establishment after remote peer has called connect. 
47 | // This call must not block for the connection to be established, and instead 48 | // should return successfully with recvComm == NULL with the expectation that 49 | // it will be called again until recvComm != NULL. 50 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection 51 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); 52 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 53 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 54 | ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); 55 | /* DMA-BUF support */ 56 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 57 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 58 | // Asynchronous send to a peer. 59 | // May return request == NULL if the call cannot be performed (or would block) 60 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 61 | // Asynchronous recv from a peer. 62 | // May return request == NULL if the call cannot be performed (or would block) 63 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 64 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 65 | // visible to the GPU 66 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 67 | // Test whether a request is complete. If size is not NULL, it returns the 68 | // number of bytes sent/received. 
69 | ncclResult_t (*test)(void* request, int* done, int* sizes); 70 | // Close and free send/recv comm objects 71 | ncclResult_t (*closeSend)(void* sendComm); 72 | ncclResult_t (*closeRecv)(void* recvComm); 73 | ncclResult_t (*closeListen)(void* listenComm); 74 | 75 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code 76 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); 77 | 78 | // Notify the plugin that a recv has completed by the device 79 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); 80 | } ncclNet_v8_t; 81 | 82 | 83 | typedef struct { 84 | void* mhandle; 85 | void* address; 86 | uint32_t size; 87 | } ncclNetSGE_v8_t; 88 | 89 | typedef struct { 90 | // Name of the collective network (mainly for logs) 91 | const char* name; 92 | // Initialize the collective network. 93 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 94 | // Return the number of adapters capable of doing collective operations. 95 | // If ndev returns 0, all other functions might be set to NULL. 96 | ncclResult_t (*devices)(int* ndev); 97 | // Get various device properties. 98 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); 99 | // Create a receiving object and provide a handle to connect to it. The 100 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 101 | // between ranks to create connections. 102 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 103 | // Create a group for collective operations. handles have been created 104 | // using listen() above. rank indicates caller's rank in the collective network. 105 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); 106 | // Returns whether a reduction operation on a data type is supported. 107 | // 1 for supported, 0 otherwise. 
108 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); 109 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 110 | ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); 111 | /* DMA-BUF support */ 112 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 113 | ncclResult_t (*deregMr)(void* collComm, void* mhandle); 114 | // Performs an asynchronous allreduce operation on the collective group. 115 | // May return request == NULL if the call cannot be performed (or would block). 116 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, 117 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); 118 | ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, 119 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes, 120 | void* sendMhandle, void** request); 121 | ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, 122 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes, 123 | ncclDataType_t dataType, ncclRedOp_t redOp, 124 | void* recvMhandle, void** request); 125 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 126 | // visible to the GPU 127 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); 128 | // Test whether a request is complete. If size is not NULL, it returns the 129 | // number of bytes sent/received. 
130 | ncclResult_t (*test)(void* request, int* done, int* size); 131 | // Close and free collective comm objects 132 | ncclResult_t (*closeColl)(void* collComm); 133 | ncclResult_t (*closeListen)(void* listenComm); 134 | } ncclCollNet_v8_t; 135 | 136 | 137 | #endif // end include guard 138 | -------------------------------------------------------------------------------- /include/net_v9.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef NCCL_NET_V9_H_ 8 | #define NCCL_NET_V9_H_ 9 | #include "net_device.h" 10 | 11 | // Max number of ncclNet objects which can live in the same process 12 | #define NCCL_NET_MAX_PLUGINS 3 13 | 14 | #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 15 | 16 | typedef struct { 17 | int ndevs; 18 | int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; 19 | } ncclNetVDeviceProps_v9_t; 20 | 21 | typedef struct { 22 | char* name; // Used mostly for logging. 23 | char* pciPath; // Path to the PCI device in /sys. 24 | uint64_t guid; // Unique identifier for the NIC chip. Important for 25 | // cards with multiple PCI functions (Physical or virtual). 26 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] 27 | int regIsGlobal; // regMr is not tied to a particular comm 28 | int forceFlush; // Force a flush on receives 29 | int speed; // Port speed in Mbps. 30 | int port; // Port number. 31 | float latency; // Network latency 32 | int maxComms; // Maximum number of comms we can create 33 | int maxRecvs; // Maximum number of grouped receives. 
34 | ncclNetDeviceType netDeviceType; // Network offload type 35 | int netDeviceVersion; // Version number for network offload 36 | ncclNetVDeviceProps_v9_t vProps; 37 | size_t maxP2pBytes; // Max transfer size for point-to-point operations 38 | size_t maxCollBytes; // Max transfer size for collective operations 39 | } ncclNetProperties_v9_t; 40 | 41 | typedef struct { 42 | // Name of the network (mainly for logs) 43 | const char* name; 44 | // Initialize the network. 45 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 46 | // Return the number of adapters. 47 | ncclResult_t (*devices)(int* ndev); 48 | // Get various device properties. 49 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); 50 | // Create a receiving object and provide a handle to connect to it. The 51 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 52 | // between ranks to create a connection. 53 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 54 | // Connect to a handle and return a sending comm object for that peer. 55 | // This call must not block for the connection to be established, and instead 56 | // should return successfully with sendComm == NULL with the expectation that 57 | // it will be called again until sendComm != NULL. 58 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection 59 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); 60 | // Finalize connection establishment after remote peer has called connect. 61 | // This call must not block for the connection to be established, and instead 62 | // should return successfully with recvComm == NULL with the expectation that 63 | // it will be called again until recvComm != NULL. 
64 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection 65 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); 66 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 67 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 68 | ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); 69 | /* DMA-BUF support */ 70 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 71 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 72 | // Asynchronous send to a peer. 73 | // May return request == NULL if the call cannot be performed (or would block) 74 | ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); 75 | // Asynchronous recv from a peer. 76 | // May return request == NULL if the call cannot be performed (or would block) 77 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); 78 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 79 | // visible to the GPU 80 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 81 | // Test whether a request is complete. If size is not NULL, it returns the 82 | // number of bytes sent/received. 
83 | ncclResult_t (*test)(void* request, int* done, int* sizes); 84 | // Close and free send/recv comm objects 85 | ncclResult_t (*closeSend)(void* sendComm); 86 | ncclResult_t (*closeRecv)(void* recvComm); 87 | ncclResult_t (*closeListen)(void* listenComm); 88 | 89 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code 90 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); 91 | 92 | // Notify the plugin that a recv has completed by the device 93 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); 94 | 95 | // Create a virtual NIC given the specified properties, which can be accessed at device index d 96 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); 97 | } ncclNet_v9_t; 98 | 99 | typedef struct { 100 | // Name of the collective network (mainly for logs) 101 | const char* name; 102 | // Initialize the collective network. 103 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 104 | // Return the number of adapters capable of doing collective operations. 105 | // If ndev returns 0, all other functions might be set to NULL. 106 | ncclResult_t (*devices)(int* ndev); 107 | // Get various device properties. 108 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); 109 | // Create a receiving object and provide a handle to connect to it. The 110 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 111 | // between ranks to create connections. 112 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 113 | // Create a group for collective operations. handles have been created 114 | // using listen() above. rank indicates caller's rank in the collective network. 115 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); 116 | // Returns whether a reduction operation on a data type is supported. 117 | // 1 for supported, 0 otherwise. 
118 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); 119 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 120 | ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); 121 | /* DMA-BUF support */ 122 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 123 | ncclResult_t (*deregMr)(void* collComm, void* mhandle); 124 | // Performs an asynchronous allreduce operation on the collective group. 125 | // May return request == NULL if the call cannot be performed (or would block). 126 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, 127 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); 128 | ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, 129 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes, 130 | void* sendMhandle, void** request); 131 | ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, 132 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes, 133 | ncclDataType_t dataType, ncclRedOp_t redOp, 134 | void* recvMhandle, void** request); 135 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 136 | // visible to the GPU 137 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); 138 | // Test whether a request is complete. If size is not NULL, it returns the 139 | // number of bytes sent/received. 
140 | ncclResult_t (*test)(void* request, int* done, int* size); 141 | // Close and free collective comm objects 142 | ncclResult_t (*closeColl)(void* collComm); 143 | ncclResult_t (*closeListen)(void* listenComm); 144 | 145 | // Create a virtual NIC given the specified properties, which can be accessed at device index d 146 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); 147 | } ncclCollNet_v9_t; 148 | 149 | #endif // end include guard 150 | -------------------------------------------------------------------------------- /include/p2p_plugin.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_P2P_PLUGIN_H_ 10 | #define NCCL_P2P_PLUGIN_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "nccl.h" 17 | #include "net.h" 18 | #include "ibvwrap.h" 19 | #include "param.h" 20 | #include "socket.h" 21 | #include "utils.h" 22 | 23 | #define MAXNAMESIZE 64 24 | #define NCCL_NET_IB_MAX_RECVS 8 25 | // We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive 26 | #define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS) 27 | //static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion"); 28 | #define IB_DEVICE_SYSFS_FMT "/sys/class/infiniband/%s/device/%s" 29 | 30 | #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? 
"RoCE" : "UNSPECIFIED")) 31 | 32 | typedef enum nccl_p2p_plugin { 33 | NCCL_P2P_IB, 34 | NCCL_P2P_UCX, 35 | NCCL_P2P_UCX_RMA, 36 | NCCL_P2P_UCX_UCT, 37 | NCCL_P2P_UCX_UCT_RD, 38 | NCCL_P2P_LAST 39 | } nccl_p2p_plugin_t; 40 | 41 | struct ncclIbMr { 42 | uintptr_t addr; 43 | size_t pages; 44 | int refs; 45 | struct ibv_mr *mr; 46 | }; 47 | 48 | struct ncclIbMrCache { 49 | struct ncclIbMr *slots; 50 | int capacity, population; 51 | }; 52 | 53 | #define NCCL_IB_MAX_DEVS_PER_NIC 4 54 | #define MAX_MERGED_DEV_NAME (MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC 55 | typedef struct ncclIbMergedDev { 56 | ncclNetVDeviceProps_t vProps; 57 | int speed; 58 | char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+' 59 | } __attribute__((aligned(64))) ncclIbMergedDev; 60 | 61 | struct ncclIbStats { 62 | int fatalErrorCount; 63 | }; 64 | 65 | struct ncclIbRequest { 66 | struct ncclIbNetCommBase* base; 67 | int type; 68 | struct ncclSocket* sock; 69 | int events[NCCL_IB_MAX_DEVS_PER_NIC]; 70 | struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; 71 | int nreqs; 72 | union { 73 | struct { 74 | int size; 75 | void* data; 76 | uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC]; 77 | int offset; 78 | } send; 79 | struct { 80 | int* sizes; 81 | } recv; 82 | }; 83 | }; 84 | 85 | // Retain local RoCE address for error logging 86 | struct ncclIbGidInfo { 87 | uint8_t link_layer; 88 | union ibv_gid localGid; 89 | int32_t localGidIndex; 90 | }; 91 | 92 | typedef struct ncclIbNetCommDevBase { 93 | int ibDevN; 94 | struct ibv_pd* pd; 95 | struct ibv_cq* cq; 96 | uint64_t pad[2]; 97 | struct ncclIbGidInfo gidInfo; 98 | } ncclIbNetCommDevBase; 99 | 100 | typedef struct ncclIbDev { 101 | pthread_mutex_t lock; 102 | int device; 103 | uint64_t guid; 104 | uint8_t portNum; 105 | uint8_t link; 106 | uint8_t isSharpDev; 107 | int speed; 108 | struct ibv_context* context; 109 | int pdRefs; 110 | struct ibv_pd* pd; 111 | char 
devName[MAXNAMESIZE]; 112 | char *pciPath; 113 | char* virtualPciPath; 114 | int realPort; 115 | int maxQp; 116 | float latency; 117 | struct ncclIbMrCache mrCache; 118 | int ar; // ADAPTIVE_ROUTING 119 | struct ibv_port_attr portAttr; 120 | struct ncclIbStats stats; 121 | int dmaBufSupported; 122 | } __attribute__((aligned(64))) ncclIbDev; 123 | 124 | 125 | #define MAX_IB_DEVS 32 126 | #define MAX_IB_VDEVS MAX_IB_DEVS*8 127 | extern struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS]; 128 | extern struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; 129 | /* Detect whether GDR can work on a given NIC with the current CUDA device 130 | * Returns : 131 | * ncclSuccess : GDR works 132 | * ncclSystemError : no module or module loaded but not supported by GPU */ 133 | ncclResult_t nccl_p2p_gdr_support(); 134 | 135 | ncclResult_t nccl_p2p_dmabuf_support(int dev); 136 | 137 | ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port); 138 | 139 | ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int ncclNMergedIbDevs, int dev, ncclNetProperties_t* props); 140 | 141 | ncclResult_t nccl_p2p_ib_init(int *nDevs, int *nmDevs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, 142 | pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction); 143 | 144 | /* Convert value returned by ibv_query_port to actual link width */ 145 | int nccl_p2p_ib_width(int width); 146 | 147 | /* Convert value returned by ibv_query_port to actual link speed */ 148 | int nccl_p2p_ib_speed(int speed); 149 | 150 | int64_t ncclParamSharpMaxComms(); 151 | 152 | int64_t ncclParamIbMergeVfs(); 153 | 154 | int64_t ncclParamIbMergeNics(); 155 | 156 | int ncclIbRelaxedOrderingCapable(void); 157 | 158 | nccl_p2p_plugin_t nccl_p2p_get_plugin_type(); 159 | 160 | ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat); 161 | 162 | ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props, int nDevs, int *nmDevs); 
163 | 164 | #endif 165 | -------------------------------------------------------------------------------- /include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_PARAM_H_ 10 | #define NCCL_PARAM_H_ 11 | 12 | #include 13 | 14 | const char* userHomeDir(); 15 | void setEnvFile(const char* fileName); 16 | void initEnv(); 17 | const char *ncclGetEnv(const char *name); 18 | 19 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); 20 | 21 | #define NCCL_PARAM(name, env, deftVal) \ 22 | int64_t ncclParam##name() { \ 23 | NCCL_STATIC_ASSERT(deftVal != INT64_MIN, "default value cannot be the uninitialized value."); \ 24 | static int64_t cache = INT64_MIN; \ 25 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == INT64_MIN, false)) { \ 26 | ncclLoadParam("NCCL_" env, deftVal, INT64_MIN, &cache); \ 27 | } \ 28 | return cache; \ 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /include/socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_SOCKET_H_ 10 | #define NCCL_SOCKET_H_ 11 | 12 | #include "nccl.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "stdbool.h" 20 | #include "utils.h" 21 | 22 | #define MAX_IFS 16 23 | #define MAX_IF_NAME_SIZE 16 24 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) 25 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL 26 | 27 | /* Common socket address storage structure for IPv4/IPv6 */ 28 | union ncclSocketAddress { 29 | struct sockaddr sa; 30 | struct sockaddr_in sin; 31 | struct sockaddr_in6 sin6; 32 | }; 33 | 34 | enum ncclSocketState { 35 | ncclSocketStateNone = 0, 36 | ncclSocketStateInitialized = 1, 37 | ncclSocketStateAccepting = 2, 38 | ncclSocketStateAccepted = 3, 39 | ncclSocketStateConnecting = 4, 40 | ncclSocketStateConnectPolling = 5, 41 | ncclSocketStateConnected = 6, 42 | ncclSocketStateReady = 7, 43 | ncclSocketStateTerminating = 8, 44 | ncclSocketStateClosed = 9, 45 | ncclSocketStateError = 10, 46 | ncclSocketStateNum = 11 47 | 48 | }; 49 | 50 | enum ncclSocketType { 51 | ncclSocketTypeUnknown = 0, 52 | ncclSocketTypeBootstrap = 1, 53 | ncclSocketTypeProxy = 2, 54 | ncclSocketTypeNetIb = 4, 55 | ncclSocketTypeRasNetwork = 5 56 | }; 57 | 58 | struct ncclSocket { 59 | int fd; 60 | int acceptFd; 61 | int errorRetries; 62 | union ncclSocketAddress addr; 63 | volatile uint32_t* abortFlag; 64 | int asyncFlag; 65 | enum ncclSocketState state; 66 | int salen; 67 | uint64_t magic; 68 | enum ncclSocketType type; 69 | int customRetry; 70 | int finalizeCounter; // Used to keep track of initial handshake for async sockets. 71 | char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets. 
72 | }; 73 | 74 | const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm); 75 | ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); 76 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); 77 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); 78 | 79 | // Initialize a socket 80 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry); 81 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call 82 | ncclResult_t ncclSocketListen(struct ncclSocket* sock); 83 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); 84 | // Connect to sock->addr. sock->fd is set after a successful call. 85 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock); 86 | // Return socket connection state. 87 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); 88 | // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
89 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); 90 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); 91 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); 92 | 93 | #define NCCL_SOCKET_SEND 0 94 | #define NCCL_SOCKET_RECV 1 95 | 96 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed); 97 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); 98 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); 99 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); 100 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); 101 | ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait); 102 | #endif 103 | -------------------------------------------------------------------------------- /include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_TIMER_H_ 10 | #define NCCL_TIMER_H_ 11 | #if ENABLE_TIMER 12 | #include <unistd.h> 13 | #include <sys/time.h> 14 | #include <x86intrin.h> 15 | static double freq = -1; 16 | static void calibrate() { 17 | struct timeval tv; 18 | gettimeofday(&tv, NULL); 19 | uint64_t timeCycles = __rdtsc(); 20 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 21 | uint64_t total = 0ULL; 22 | for (int i=0; i<10000; i++) total += __rdtsc(); 23 | gettimeofday(&tv, NULL); 24 | timeCycles = __rdtsc() - timeCycles; 25 | time += tv.tv_sec*1E6 + tv.tv_usec; 26 | freq = timeCycles/time; 27 | } 28 | static inline double gettime() { 29 | if (freq == -1) calibrate(); 30 | return __rdtsc()/freq; 31 | } 32 | static uint64_t counts[8]; 33 | static double times[8]; 34 | static double startTimes[8]; 35 | #define TIME_START(index) do { \ 36 | counts[index]++; \ 37 | startTimes[index] = gettime(); \ 38 | } while (0) 39 | /* Add the time elapsed since the matching TIME_START to times[index] */ 40 | #define TIME_STOP(index) do { \ 41 | times[index] += gettime() - startTimes[index]; \ 42 | } while (0) 43 | /* Undo the count increment made by TIME_START */ 44 | #define TIME_CANCEL(index) do { \ 45 | counts[index]--; \ 46 | } while (0) 47 | /* Print total/count/average for every used index, then reset counts[] (times[] is not reset) */ 48 | #define TIME_PRINT(name) do { \ 49 | printf("%s stats", name); \ 50 | for (int i=0; i<8; i++) { \ 51 | if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); \ 52 | counts[i] = 0; \ 53 | } \ 54 | printf("\n"); \ 55 | } while (0) 56 | #else 57 | #define TIME_START(index) do {} while(0) 58 | #define TIME_STOP(index) do {} while(0) 59 | #define TIME_CANCEL(index) do {} while(0) 60 | #define TIME_PRINT(name) 61 | #endif 62 | #endif 63 | -------------------------------------------------------------------------------- /include/ucx_uct_lib.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_UCX_UCT_LIB_H_ 10 | #define NCCL_UCX_UCT_LIB_H_ 11 | 12 | #include <stdint.h> 13 | #include <stddef.h> 14 | #include <pthread.h> 15 | 16 | #include "p2p_plugin.h" 17 | #include "socket.h" 18 | 19 | #include <uct/api/uct.h> 20 | 21 | #define NCCL_UCX_UCT_MAX_RECVS NCCL_NET_IB_MAX_RECVS 22 | #define NCCL_UCT_LISTEN_HANDLE_MAGIC 0x43cf19ed91abdb85 23 | #define NCCL_UCT_REG_ALIGN 4096 24 | 25 | typedef enum { 26 | NCCL_UCT_AM_RTR = 14, /* Use particular values */ 27 | NCCL_UCT_AM_ATP = 15, 28 | NCCL_UCT_AM_RTS = 16, 29 | NCCL_UCT_AM_ATS = 17 30 | } nccl_uct_am_type_t; 31 | 32 | typedef enum { 33 | NCCL_UCT_START = 0, 34 | NCCL_UCT_CONNECT, 35 | NCCL_UCT_ACCEPT, 36 | NCCL_UCT_RECEIVE_REMOTE, /* Acceptor receives ep addr/remote communicator */ 37 | NCCL_UCT_RECEIVE_ADDR, 38 | NCCL_UCT_RX_READY, 39 | NCCL_UCT_DONE 40 | } nccl_uct_state_t; 41 | 42 | /* UCT EP address to exchange and connect to */ 43 | typedef struct { 44 | uint8_t dev_addr_size; 45 | uint8_t ep_addr_size; 46 | uint8_t data[64]; 47 | } nccl_uct_ep_addr_t; 48 | 49 | typedef struct { 50 | uct_iface_h iface; 51 | uct_md_h md; 52 | uct_component_h comp; 53 | void *addr; 54 | size_t addr_size; 55 | void *dev_addr; 56 | size_t dev_addr_size; 57 | size_t ep_addr_size; 58 | size_t rkey_packed_size; 59 | 60 | size_t am_max_short; 61 | size_t min_get_zcopy; 62 | } nccl_uct_iface_t; 63 | 64 | struct nccl_uct_context; 65 | 66 | typedef struct nccl_uct_worker { 67 | struct nccl_uct_worker *next; 68 | struct { 69 | pthread_t thread; 70 | int dev; 71 | } id; 72 | 73 | int count; 74 | ucs_async_context_t *async; 75 | uct_worker_h worker; 76 | nccl_uct_iface_t *uct_iface; 77 | struct nccl_uct_context *context; 78 | } nccl_uct_worker_t; 79 | 80 
| typedef struct { 81 | uct_ep_h ep; 82 | uct_ep_addr_t *addr; 83 | size_t addr_size; 84 | nccl_uct_iface_t *uct_iface; 85 | uint8_t data[]; 86 | } nccl_uct_ep_t; 87 | 88 | /* All the remote addresses for the communicator */ 89 | typedef struct nccl_uct_comm_addr { 90 | nccl_uct_ep_addr_t rma; 91 | /* TODO: Add multi-QP here */ 92 | } nccl_uct_comm_addr_t; 93 | 94 | /* Either Receiver or Sender communicator, connected to one peer */ 95 | typedef struct nccl_uct_comm { 96 | struct ncclSocket sock; 97 | struct nccl_uct_context *context; 98 | int dev; 99 | 100 | nccl_uct_worker_t *uct_worker; 101 | nccl_uct_iface_t *uct_iface; 102 | nccl_uct_ep_t *uct_ep; 103 | 104 | struct nccl_uct_comm_remote { 105 | nccl_uct_comm_addr_t addr; /* Remote addresses */ 106 | const struct nccl_uct_comm *comm; /* Cookie received in connect */ 107 | } remote; 108 | 109 | /* Local GET on current device */ 110 | struct { 111 | int enabled; 112 | nccl_uct_ep_t *uct_ep; /* Locally read from HCA */ 113 | nccl_uct_ep_addr_t addr; 114 | 115 | uint8_t *mem; /* Dummy memory to read into */ 116 | uct_mem_h memh; 117 | } gpu_flush; 118 | } nccl_uct_comm_t; 119 | 120 | /* State tracking used while connecting/accepting only */ 121 | typedef struct { 122 | nccl_uct_state_t state; 123 | nccl_uct_comm_t *comm; /* current communicator being created */ 124 | int offset; /* for Socket reading */ 125 | int ready; /* accept must complete after connect */ 126 | } nccl_uct_stage_t; 127 | 128 | /* Memory registration handle in NCCL UCT plugin returned by ->regMR() */ 129 | typedef struct { 130 | uct_mem_h memh; 131 | nccl_uct_comm_t *comm; 132 | uct_rkey_bundle_t bundle; 133 | uint8_t rkey[]; 134 | } nccl_uct_memh_t; 135 | 136 | /* On-the-wire handle passed OOB by NCCL from listener to connector */ 137 | typedef struct { 138 | uint64_t magic; 139 | struct { 140 | union ncclSocketAddress addr; 141 | uint32_t id; 142 | } listener; 143 | nccl_uct_comm_t *comm; /* Created communicator in accept */ 144 | 
nccl_uct_stage_t stage; /* Used by connector */ 145 | } nccl_uct_listen_handle_t; 146 | 147 | /* Communicator while listening to remote ranks */ 148 | typedef struct { 149 | struct ncclSocket sock; 150 | struct nccl_uct_context *context; 151 | int dev; 152 | uint32_t id; 153 | nccl_uct_worker_t *uct_worker; 154 | nccl_uct_comm_t *comm; 155 | 156 | /* Used by acceptor */ 157 | nccl_uct_stage_t stage; 158 | } nccl_uct_listen_comm_t; 159 | 160 | /* Global state of the plugin */ 161 | typedef struct nccl_uct_context { 162 | /* Transport to use */ 163 | const char *tl_name; 164 | 165 | /* IB devices available */ 166 | int dev_count; 167 | int merge_dev_count; 168 | 169 | /* Use by common code to setup communicators */ 170 | struct nccl_uct_ops { 171 | ncclResult_t (*comm_alloc)(nccl_uct_comm_t **comm); 172 | ncclResult_t (*comm_init)(nccl_uct_comm_t *comm, 173 | struct nccl_uct_context *context, 174 | nccl_uct_worker_t *worker, int dev, 175 | const nccl_uct_comm_t *remote_comm); 176 | ncclResult_t (*iface_set)(nccl_uct_iface_t *uct_iface); 177 | } ops; 178 | 179 | /* Max sizes needed */ 180 | size_t am_short_size; 181 | size_t rkey_size; 182 | 183 | /* OOB socket for accepting/connecting */ 184 | char if_name[MAX_IF_NAME_SIZE]; 185 | union ncclSocketAddress if_addr; 186 | 187 | /* Number of listener created */ 188 | uint32_t listener_count; 189 | 190 | /* List of created workers */ 191 | nccl_uct_worker_t *worker_list; 192 | } nccl_uct_context_t; 193 | 194 | #define UCXCHECK(statement, failure_action, message, ...) 
\ 195 | do { \ 196 | ucs_status_t _status = statement; \ 197 | if (_status != UCS_OK) { \ 198 | WARN("Failed: " message ": %s", ##__VA_ARGS__, \ 199 | ucs_status_string(_status)); \ 200 | failure_action; \ 201 | } \ 202 | } while (0) 203 | 204 | extern nccl_uct_context_t context; 205 | 206 | /* Library functions */ 207 | ncclResult_t nccl_uct_iface_set_handler(nccl_uct_iface_t *uct_iface, int id, 208 | uct_am_callback_t callback); 209 | ncclResult_t nccl_uct_devices(int *ndev); 210 | ncclResult_t nccl_uct_comm_init(nccl_uct_comm_t *comm, 211 | nccl_uct_context_t *context, 212 | nccl_uct_worker_t *worker, int dev, 213 | const nccl_uct_comm_t *remote_comm); 214 | void nccl_uct_comm_deinit(nccl_uct_comm_t *comm); 215 | int nccl_uct_flush_index(nccl_uct_comm_t *base, int *sizes, int n); 216 | ncclResult_t nccl_uct_flush(nccl_uct_comm_t *base_comm, void *data, int size, 217 | nccl_uct_memh_t *uct_memh, 218 | uct_completion_t *completion, void **request); 219 | void nccl_uct_empty_callback(uct_completion_t *comp); 220 | 221 | /* NCCL common plugin callbacks */ 222 | ncclResult_t nccl_uct_listen(int dev, void *listen_handle, void **listen_comm); 223 | ncclResult_t nccl_uct_accept(void *listen_comm, void **recv_comm, 224 | ncclNetDeviceHandle_v7_t **recvDevComm); 225 | ncclResult_t nccl_uct_connect(int dev, ncclNetCommConfig_t* config, void *listen_handle, void **send_comm, 226 | ncclNetDeviceHandle_t **sendDevComm); 227 | ncclResult_t nccl_uct_close_listen(void *listen_comm); 228 | ncclResult_t nccl_uct_reg_mr_dmabuf(void *reg_comm, void *data, size_t size, 229 | int type, uint64_t offset, int fd, 230 | void **mhandle); 231 | ncclResult_t nccl_uct_reg_mr(void *reg_comm, void *data, size_t size, int type, 232 | void **mhandle); 233 | ncclResult_t nccl_uct_dereg_mr(void *dereg_comm, void *mhandle); 234 | 235 | /* Compatibility callback */ 236 | ncclResult_t nccl_uct_get_properties_v9(int dev, 237 | ncclNetProperties_v9_t *props_v9); 238 | ncclResult_t 
nccl_uct_get_properties_v8(int dev, 239 | ncclNetProperties_v8_t *props_v8); 240 | ncclResult_t nccl_uct_get_properties_v7(int dev, 241 | ncclNetProperties_v7_t *props_v7); 242 | ncclResult_t nccl_uct_reg_mr_v7(void *comm, void *data, int size, int type, 243 | void **mhandle); 244 | ncclResult_t nccl_uct_get_properties_v6(int dev, 245 | ncclNetProperties_v6_t *props_v6); 246 | ncclResult_t nccl_uct_connect_v9(int dev, void *listen_handle, void **send_comm, 247 | ncclNetDeviceHandle_t **sendDevComm); 248 | ncclResult_t nccl_uct_connect_v6(int dev, void *handle, void **send_comm); 249 | ncclResult_t nccl_uct_accept_v6(void *listen_comm, void **recv_comm); 250 | ncclResult_t nccl_uct_get_properties(int dev, ncclNetProperties_t *props); 251 | 252 | 253 | #define NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_func, get_properties_func, \ 254 | connect_func, accept_func, reg_mr_func, \ 255 | isend_func, irecv_func) \ 256 | { \ 257 | .name = plugin_name, \ 258 | .init = prefix##_##init_func, \ 259 | .devices = nccl_uct_devices, \ 260 | .getProperties = get_properties_func, \ 261 | .listen = nccl_uct_listen, \ 262 | .connect = connect_func, \ 263 | .accept = accept_func, \ 264 | .regMr = reg_mr_func, \ 265 | .regMrDmaBuf = nccl_uct_reg_mr_dmabuf, \ 266 | .deregMr = nccl_uct_dereg_mr, \ 267 | .isend = prefix##_##isend_func, \ 268 | .irecv = prefix##_##irecv_func, \ 269 | .iflush = prefix##_iflush, \ 270 | .test = prefix##_test, \ 271 | .closeSend = prefix##_close, \ 272 | .closeRecv = prefix##_close, \ 273 | .closeListen = nccl_uct_close_listen \ 274 | } 275 | 276 | #define NCCL_UCT_PLUGIN_V10(plugin_name, prefix) \ 277 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init, nccl_uct_get_properties, \ 278 | nccl_uct_connect, nccl_uct_accept, nccl_uct_reg_mr, \ 279 | isend, irecv) 280 | 281 | #define NCCL_UCT_PLUGIN_V9(plugin_name, prefix) \ 282 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v9, \ 283 | nccl_uct_connect_v9, nccl_uct_accept, 
nccl_uct_reg_mr, \ 284 | isend_v9, irecv_v9) 285 | 286 | #define NCCL_UCT_PLUGIN_V8(plugin_name, prefix) \ 287 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v8, \ 288 | nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr, \ 289 | isend_v8, irecv_v8) 290 | 291 | #define NCCL_UCT_PLUGIN_V7(plugin_name, prefix) \ 292 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v7, \ 293 | nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr_v7, \ 294 | isend_v8, irecv_v8) 295 | 296 | #define NCCL_UCT_PLUGIN_V6(plugin_name, prefix) \ 297 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v6, \ 298 | nccl_uct_connect_v6, nccl_uct_accept_v6, \ 299 | nccl_uct_reg_mr_v7, isend_v8, irecv_v8) 300 | 301 | #define NCCL_UCT_PLUGIN_V5(plugin_name, prefix) \ 302 | { \ 303 | .name = plugin_name, \ 304 | .init = prefix##_init_v9, \ 305 | .devices = nccl_uct_devices, \ 306 | .getProperties = nccl_uct_get_properties_v6, \ 307 | .listen = nccl_uct_listen, \ 308 | .connect = nccl_uct_connect_v6, \ 309 | .accept = nccl_uct_accept_v6, \ 310 | .regMr = nccl_uct_reg_mr_v7, \ 311 | .deregMr = nccl_uct_dereg_mr, \ 312 | .isend = prefix##_isend_v8, \ 313 | .irecv = prefix##_irecv_v8, \ 314 | .iflush = prefix##_iflush, \ 315 | .test = prefix##_test, \ 316 | .closeSend = prefix##_close, \ 317 | .closeRecv = prefix##_close, \ 318 | .closeListen = nccl_uct_close_listen \ 319 | } 320 | 321 | #endif /* NCCL_UCX_UCT_LIB_H_ */ 322 | -------------------------------------------------------------------------------- /include/ucx_uct_ring.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_UCX_UCT_RING_H_ 10 | #define NCCL_UCX_UCT_RING_H_ 11 | 12 | #include "nccl.h" 13 | #include <assert.h> 14 | 15 | #define NCCL_UCT_RING_SIZE (1 << 7) 16 | #define NCCL_UCT_RING_MASK (NCCL_UCT_RING_SIZE - 1) 17 | /* Fixed-capacity ring of tagged entries; a slot whose tag is INT_MAX is free */ 18 | typedef struct nccl_uct_ring { 19 | unsigned first; 20 | unsigned last; 21 | unsigned size; 22 | unsigned entry_size; 23 | int tag[NCCL_UCT_RING_SIZE]; 24 | void *entry; 25 | } nccl_uct_ring_t; 26 | /* Allocate storage for NCCL_UCT_RING_SIZE entries and mark every slot free; returns ncclSystemError if malloc fails */ 27 | static inline ncclResult_t nccl_uct_ring_init(nccl_uct_ring_t *ring, 28 | unsigned entry_size) { 29 | int i; 30 | 31 | ring->first = 0; 32 | ring->last = 0; 33 | ring->entry_size = entry_size; 34 | ring->entry = malloc((size_t)entry_size * NCCL_UCT_RING_SIZE); 35 | if (ring->entry == NULL) { 36 | /* Allocation failed, so there is nothing to release */ 37 | return ncclSystemError; 38 | } 39 | 40 | for (i = 0; i < NCCL_UCT_RING_SIZE; i++) { 41 | ring->tag[i] = INT_MAX; 42 | } 43 | return ncclSuccess; 44 | } 45 | /* Release the entry storage allocated by nccl_uct_ring_init */ 46 | static inline void nccl_uct_ring_deinit(nccl_uct_ring_t *ring) { 47 | free(ring->entry); 48 | } 49 | /* Address of the entry for index i; i is reduced modulo the ring size */ 50 | static inline void *nccl_uct_ring_get_entry(nccl_uct_ring_t *ring, unsigned i) { 51 | return (uint8_t*)ring->entry + (ring->entry_size * (i & NCCL_UCT_RING_MASK)); 52 | } 53 | /* Store tag and a copy of data in the slot at last and advance last; len must equal entry_size */ 54 | static inline void nccl_uct_ring_append(nccl_uct_ring_t *ring, int tag, 55 | void *data, size_t len) { 56 | int j = ring->last & NCCL_UCT_RING_MASK; 57 | 58 | ring->last++; 59 | 60 | assert((ring->last & NCCL_UCT_RING_MASK) != 61 | (ring->first & NCCL_UCT_RING_MASK)); 62 | assert(ring->tag[j] == INT_MAX); 63 | assert(len == ring->entry_size); 64 | 65 | ring->tag[j] = tag; 66 | memcpy(nccl_uct_ring_get_entry(ring, j), data, len); 67 | } 68 | /* Non-zero when first == last */ 69 | static inline int nccl_uct_ring_is_empty(const nccl_uct_ring_t *ring) { 70 | return ring->first == ring->last; 71 | } 72 | 73 | static inline void 
nccl_uct_ring_consume(nccl_uct_ring_t *ring, unsigned i) { 74 | unsigned j = i & NCCL_UCT_RING_MASK; 75 | /* Release slot j by resetting its tag to the free marker */ 76 | assert(ring->tag[j] != INT_MAX); 77 | ring->tag[j] = INT_MAX; 78 | 79 | /* If the oldest slot was just released, advance first past every consecutive free slot */ 80 | if (i == ring->first) { 81 | for (; i != ring->last; i++) { 82 | j = i & NCCL_UCT_RING_MASK; 83 | if (ring->tag[j] != INT_MAX) { 84 | break; 85 | } 86 | ring->first = i + 1; 87 | } 88 | } 89 | } 90 | /* Return the index of the first entry in [first, last) whose tag equals tag, or ring->last when none matches */ 91 | static inline unsigned nccl_uct_ring_find(nccl_uct_ring_t *ring, int tag) { 92 | unsigned i; 93 | /* INT_MAX is the free-slot marker and cannot be searched for */ 94 | assert(tag != INT_MAX); 95 | 96 | for (i = ring->first; i != ring->last; i++) { 97 | if (ring->tag[i & NCCL_UCT_RING_MASK] == tag) { 98 | return i; 99 | } 100 | } 101 | 102 | return ring->last; 103 | } 104 | 105 | #endif /* NCCL_UCX_UCT_RING_H_ */ 106 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #ifndef NCCL_UTILS_H_ 10 | #define NCCL_UTILS_H_ 11 | 12 | #include "nccl.h" 13 | #include <stdint.h> 14 | /* Compile-time check usable inside a function body: a false (0) _cond duplicates the `case 0` label and fails to compile; _msg is not used */ 15 | #define NCCL_STATIC_ASSERT(_cond, _msg) \ 16 | switch(0) {case 0:case (_cond):;} 17 | 18 | ncclResult_t ncclIbMalloc(void** ptr, size_t size); 19 | ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size); 20 | ncclResult_t getHostName(char* hostname, int maxlen); 21 | uint64_t getHostHash(); 22 | uint64_t getPidHash(); 23 | 24 | struct netIf { 25 | char prefix[64]; 26 | int port; 27 | }; 28 | 29 | int parseStringList(const char* string, struct netIf* ifList, int maxList); 30 | int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact); 31 | const char *get_plugin_lib_path(); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /m4/sharp.m4: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2001-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # See file LICENSE for terms. 4 | # 5 | 6 | AC_DEFUN([CHECK_SHARP],[ 7 | 8 | AS_IF([test "x$sharp_checked" != "xyes"],[ 9 | 10 | sharp_happy="no" 11 | 12 | AC_ARG_WITH([sharp], 13 | [AS_HELP_STRING([--with-sharp=(DIR)], [Enable the use of SHARP (default is guess).])], 14 | [], [with_sharp=guess]) 15 | 16 | AS_IF([test "x$with_sharp" != "xno"], 17 | [ 18 | save_CPPFLAGS="$CPPFLAGS" 19 | save_CFLAGS="$CFLAGS" 20 | save_LDFLAGS="$LDFLAGS" 21 | 22 | AS_IF([test ! 
-z "$with_sharp" -a "x$with_sharp" != "xyes" -a "x$with_sharp" != "xguess"], 23 | [ 24 | check_sharp_dir="$with_sharp" 25 | check_sharp_libdir="$with_sharp/lib" 26 | CPPFLAGS="-I$with_sharp/include $save_CPPFLAGS" 27 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS" 28 | ]) 29 | 30 | AS_IF([test "x$check_sharp_dir" = "x" -a "x$HPCX_SHARP_DIR" != "x"], 31 | [ 32 | check_sharp_dir="$HPCX_SHARP_DIR" 33 | check_sharp_libdir="$HPCX_SHARP_DIR/lib" 34 | CPPFLAGS="-I$check_sharp_dir/include $save_CPPFLAGS" 35 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS" 36 | ]) 37 | 38 | AS_IF([test "x$check_sharp_dir" = "x" -a -d "/opt/mellanox/sharp/"], 39 | [ 40 | check_sharp_dir="/opt/mellanox/sharp/" 41 | check_sharp_libdir="/opt/mellanox/sharp/lib" 42 | CPPFLAGS="-I$check_sharp_dir/include $save_CPPFLAGS" 43 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS" 44 | ]) 45 | 46 | 47 | AS_IF([test ! -z "$with_sharp_libdir" -a "x$with_sharp_libdir" != "xyes"], 48 | [ 49 | check_sharp_libdir="$with_sharp_libdir" 50 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS" 51 | ]) 52 | 53 | AC_CHECK_HEADERS([sharp/api/sharp_coll.h], 54 | [ 55 | AC_CHECK_LIB([sharp_coll], [sharp_coll_init], 56 | [ 57 | sharp_happy="yes" 58 | ], 59 | [ 60 | sharp_happy="no" 61 | ]) 62 | ], 63 | [ 64 | sharp_happy="no" 65 | ]) 66 | 67 | AS_IF([test "x$sharp_happy" = "xyes"], 68 | [ 69 | AS_IF([test "x$check_sharp_dir" != "x"], 70 | [ 71 | AC_MSG_RESULT([SHARP dir: $check_sharp_dir]) 72 | AC_SUBST(SHARP_CPPFLAGS, "-I$check_sharp_dir/include/") 73 | ]) 74 | 75 | AS_IF([test "x$check_sharp_libdir" != "x"], 76 | [ 77 | AC_SUBST(SHARP_LDFLAGS, "-L$check_sharp_libdir") 78 | ]) 79 | 80 | AC_SUBST(SHARP_LIBADD, "-lsharp_coll") 81 | AC_CHECK_DECLS([SHARP_DTYPE_BFLOAT16], [AC_DEFINE([HAVE_SHARP_DTYPE_BFLOAT16_UINT8_INT8], 1, 82 | [SHARP v3 datatypes : bfloat16, uint8, int8])], [], 83 | [[#include <sharp/api/sharp_coll.h>]]) 84 | AC_CHECK_DECLS([sharp_coll_reg_mr_v2], [], [], [[#include <sharp/api/sharp_coll.h>]]) 85 | 86 | ], 87 | [ 88 | AS_IF([test "x$with_sharp" != "xguess"], 
89 | [ 90 | AC_MSG_ERROR([SHARP support is requested but SHARP packages cannot be found]) 91 | ], 92 | [ 93 | AC_MSG_WARN([SHARP not found]) 94 | ]) 95 | ]) 96 | 97 | CFLAGS="$save_CFLAGS" 98 | CPPFLAGS="$save_CPPFLAGS" 99 | LDFLAGS="$save_LDFLAGS" 100 | 101 | ], 102 | [ 103 | AC_MSG_WARN([SHARP was explicitly disabled]) 104 | ]) 105 | 106 | sharp_checked=yes 107 | AM_CONDITIONAL([HAVE_SHARP_PLUGIN], [test "x$sharp_happy" != xno]) 108 | ]) 109 | 110 | ]) 111 | -------------------------------------------------------------------------------- /m4/ucx.m4: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2001-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # See file LICENSE for terms. 4 | # 5 | 6 | AC_DEFUN([CHECK_UCX],[ 7 | 8 | AS_IF([test "x$ucx_checked" != "xyes"],[ 9 | 10 | ucx_happy="no" 11 | 12 | AC_ARG_WITH([ucx], 13 | [AS_HELP_STRING([--with-ucx=(DIR)], [Enable the use of UCX (default is guess).])], 14 | [], [with_ucx=guess]) 15 | 16 | AS_IF([test "x$with_ucx" != "xno"], 17 | [ 18 | save_CPPFLAGS="$CPPFLAGS" 19 | save_CFLAGS="$CFLAGS" 20 | save_LDFLAGS="$LDFLAGS" 21 | 22 | AS_IF([test ! -z "$with_ucx" -a "x$with_ucx" != "xyes" -a "x$with_ucx" != "xguess"], 23 | [ 24 | check_ucx_dir="$with_ucx" 25 | check_ucx_libdir="$with_ucx/lib" 26 | CPPFLAGS="-I$with_ucx/include $save_CPPFLAGS" 27 | LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS" 28 | ]) 29 | 30 | AS_IF([test "x$check_ucx_dir" = "x" -a "x$HPCX_UCX_DIR" != "x"], 31 | [ 32 | check_ucx_dir="$HPCX_UCX_DIR" 33 | check_ucx_libdir="$HPCX_UCX_DIR/lib" 34 | CPPFLAGS="-I$check_ucx_dir/include $save_CPPFLAGS" 35 | LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS" 36 | ]) 37 | 38 | AS_IF([test ! 
-z "$with_ucx_libdir" -a "x$with_ucx_libdir" != "xyes"], 39 | [ 40 | check_ucx_libdir="$with_ucx_libdir" 41 | LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS" 42 | ]) 43 | 44 | AC_CHECK_HEADERS([ucp/api/ucp.h], 45 | [ 46 | AC_CHECK_LIB([ucp], [ucp_tag_send_nb], 47 | [ 48 | ucx_happy="yes" 49 | ], 50 | [ 51 | ucx_happy="no" 52 | ], [-luct -lucm -lucs]) 53 | ], 54 | [ 55 | ucx_happy="no" 56 | ]) 57 | 58 | AS_IF([test "x$ucx_happy" = "xyes"], 59 | [ 60 | AS_IF([test "x$check_ucx_dir" != "x"], 61 | [ 62 | AC_MSG_RESULT([UCX dir: $check_ucx_dir]) 63 | AC_SUBST(UCX_CPPFLAGS, "-I$check_ucx_dir/include/") 64 | ]) 65 | 66 | AS_IF([test "x$check_ucx_libdir" != "x"], 67 | [ 68 | AC_SUBST(UCX_LDFLAGS, "-L$check_ucx_libdir") 69 | ]) 70 | 71 | AC_SUBST(UCX_LIBADD, "-lucp -lucs -lucm -luct") 72 | ], 73 | [ 74 | AS_IF([test "x$with_ucx" != "xguess"], 75 | [ 76 | AC_MSG_ERROR([UCX support is requested but UCX packages cannot be found]) 77 | ], 78 | [ 79 | AC_MSG_WARN([UCX not found]) 80 | ]) 81 | ]) 82 | 83 | CFLAGS="$save_CFLAGS" 84 | CPPFLAGS="$save_CPPFLAGS" 85 | LDFLAGS="$save_LDFLAGS" 86 | 87 | ], 88 | [ 89 | AC_MSG_WARN([UCX was explicitly disabled]) 90 | ]) 91 | 92 | ucx_checked=yes 93 | AM_CONDITIONAL([HAVE_UCX_PLUGIN], [test "x$ucx_happy" != xno]) 94 | ]) 95 | 96 | ]) 97 | -------------------------------------------------------------------------------- /nccl-rdma-sharp-plugins.pc.in: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # 2. 
Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # 3. Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | # 31 | 32 | prefix = @prefix@ 33 | exec_prefix = @exec_prefix@ 34 | libdir = @libdir@ 35 | # The -l option takes the library name without the lib prefix or the .so suffix 36 | Name: @PACKAGE@ 37 | URL: @PACKAGE_URL@ 38 | Description: RDMA and SHARP plugins for NCCL Collective library 39 | Version: @MAJOR_VERSION@.@MINOR_VERSION@ 40 | Libs: -L${libdir} -lnccl-net 41 | 42 | -------------------------------------------------------------------------------- /nccl-rdma-sharp-plugins.spec.in: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # 2. Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # 3. Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | # 31 | 32 | %global rel @RPM_RELEASE@ 33 | %global version @VERSION@ 34 | %global pkgname @PACKAGE@ 35 | %global prefix @prefix@ 36 | %global __check_files %{nil} 37 | %global _libdir %{prefix}/lib 38 | %{!?configure_opts: %global configure_opts %{nil}} 39 | %global debug_package %{nil} 40 | %bcond_with valgrind 41 | %global _binary_filedigest_algorithm 1 42 | %global _source_filedigest_algorithm 1 43 | 44 | %global lt_release @LT_RELEASE@ 45 | %global lt_version @LT_CURRENT@.@LT_REVISION@.@LT_AGE@ 46 | 47 | Name: %{pkgname} 48 | Summary: RDMA and SHARP plugins for NCCL 49 | Version: %{version} 50 | Release: %{rel} 51 | # License tag matches the SPDX identifier in the header of this file 52 | License: BSD-3-Clause 53 | Group: Applications 54 | Source: %{pkgname}-%{version}.tar.gz 55 | Requires: libibverbs 56 | %if 0%{?suse_version} < 1100 57 | BuildRequires: gcc-c++ libibverbs-devel binutils 58 | %else 59 | BuildRequires: gcc-c++ libibverbs-devel binutils-devel 60 | %endif 61 | %if %{with valgrind} 62 | BuildRequires: valgrind-devel 63 | %endif 64 | 65 | BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) 66 | URL: http://www.mellanox.com 67 | Prefix: %{prefix} 68 | Provides: nccl-rdma-sharp-plugins 69 | Vendor: mellanox 70 | 71 | 72 | %description 73 | Provides RDMA and SHARP plugins for NCCL Collective library 74 | 75 | %prep 76 | rm -rf $RPM_BUILD_ROOT 77 | 78 | %setup -q 79 | 80 | %build 81 | ./configure --prefix=%{prefix} %{configure_opts} 82 | make %{?_smp_mflags} 83 | 84 | %install 85 | 86 | rm -rf "$RPM_BUILD_ROOT" 87 | 88 | # Strip out some dependencies 89 | cat > find-requires.sh <<'EOF' 90 | exec %{__find_requires} "$@" | egrep -v '^perl' 91 | EOF 92 | chmod +x find-requires.sh 93 | %global _use_internal_dependency_generator 0 94 | %global __find_requires %{_builddir}/%{buildsubdir}/find-requires.sh 95 | 96 | make DESTDIR="$RPM_BUILD_ROOT" install 97 | mkdir -p $RPM_BUILD_ROOT/etc/ld.so.conf.d/ 98 | echo %{_libdir} > $RPM_BUILD_ROOT/etc/ld.so.conf.d/nccl-rdma-sharp-plugins.conf 99 | mkdir -p $RPM_BUILD_ROOT/usr/lib64/pkgconfig 100 | 
cp nccl-rdma-sharp-plugins.pc $RPM_BUILD_ROOT/usr/lib64/pkgconfig 101 | 102 | %clean 103 | # We may be in the directory that we're about to remove, so cd out of 104 | # there before we remove it 105 | cd /tmp 106 | 107 | # Remove installed driver after rpm build finished 108 | chmod -R o+w $RPM_BUILD_DIR/%{name}-%{version} 109 | rm -rf $RPM_BUILD_DIR/%{name}-%{version} 110 | 111 | test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT 112 | 113 | 114 | %files 115 | %defattr(-, root, root) 116 | %{prefix} 117 | /etc/ld.so.conf.d/nccl-rdma-sharp-plugins.conf 118 | /usr/lib64/pkgconfig/nccl-rdma-sharp-plugins.pc 119 | 120 | 121 | # Your application file list goes here 122 | # %{prefix}/lib/lib*.so* 123 | #%doc COPYRIGHT ChangeLog README AUTHORS NEWS 124 | #%doc doc/* 125 | 126 | # If you install a library 127 | %post 128 | /sbin/ldconfig || exit 1 129 | 130 | # If you install a library 131 | %postun 132 | /sbin/ldconfig 133 | exit 0 134 | 135 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # See file LICENSE for terms. 
#

lib_LTLIBRARIES = libnccl-net.la

libnccl_net_la_CPPFLAGS = -I$(top_srcdir)/include
# NOTE(review): the glibc feature-test macro is spelled _GNU_SOURCE; the
# GNU_SOURCE define below does not enable any extension in system headers.
# Left unchanged here -- confirm whether _GNU_SOURCE is already provided by
# configure before renaming it.
libnccl_net_la_CFLAGS   = $(CFLAGS) -DGNU_SOURCE
libnccl_net_la_LIBADD   = -lcudart_static
libnccl_net_la_LDFLAGS  = $(LDFLAGS)

# Sources that are always part of the plugin library.
libnccl_net_la_SOURCES = \
	ibvwrap.c \
	utils.c \
	param.c \
	socket.c \
	p2p_plugin.c \
	ib_plugin.c

# Optional UCX transports, enabled by configure.
if HAVE_UCX_PLUGIN
libnccl_net_la_CPPFLAGS += -DHAVE_UCX_PLUGIN $(UCX_CPPFLAGS)
libnccl_net_la_LIBADD   += $(UCX_LIBADD)
libnccl_net_la_LDFLAGS  += $(UCX_LDFLAGS)
libnccl_net_la_SOURCES  += \
	ucx_plugin.c \
	ucx_rma_plugin.c \
	ucx_uct_lib.c \
	ucx_uct_plugin.c \
	ucx_uct_rd_plugin.c
endif

# Optional SHARP collective plugin, enabled by configure.
if HAVE_SHARP_PLUGIN
libnccl_net_la_CPPFLAGS += -DHAVE_SHARP_PLUGIN $(SHARP_CPPFLAGS)
libnccl_net_la_LIBADD   += $(SHARP_LIBADD)
libnccl_net_la_LDFLAGS  += $(SHARP_LDFLAGS)
libnccl_net_la_SOURCES  += sharp_plugin.c
endif

# Expose the library under its alternate name as well. Staged installs
# ("make DESTDIR=<root> install", as done when packaging) place the library
# under $(DESTDIR)$(libdir), so the symlink has to be created there rather
# than in the live $(libdir).
install-exec-hook:
	cd $(DESTDIR)$(libdir) && ln -sf libnccl-net.so libnccl-net-ibext.so

--------------------------------------------------------------------------------
/src/ibvwrap.c:
--------------------------------------------------------------------------------
/*************************************************************************
 * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #include 10 | #include 11 | 12 | #include "ibvwrap.h" 13 | #include "utils.h" 14 | #include "nccl.h" 15 | #include "param.h" 16 | 17 | #define IBV_PTR_CHECK_ERRNO(call, retval, error_retval, name) \ 18 | retval = call; \ 19 | if (retval == error_retval) { \ 20 | WARN("Call to " name " failed with error %s", strerror(errno)); \ 21 | return ncclSystemError; \ 22 | } \ 23 | return ncclSuccess; 24 | 25 | #define IBV_PTR_CHECK(call, retval, error_retval, name) \ 26 | retval = call; \ 27 | if (retval == error_retval) { \ 28 | WARN("Call to " name " failed"); \ 29 | return ncclSystemError; \ 30 | } \ 31 | return ncclSuccess; 32 | 33 | #define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(call, success_retval, name, supported) \ 34 | int ret = call; \ 35 | if (ret == ENOTSUP || ret == EOPNOTSUPP) { \ 36 | INFO(NCCL_NET, "Call to " name " not supported"); \ 37 | *supported = 0; \ 38 | return ncclSuccess; \ 39 | } else if (ret != success_retval) { \ 40 | WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ 41 | *supported = 1; \ 42 | return ncclSystemError; \ 43 | } \ 44 | *supported = 1; \ 45 | return ncclSuccess; 46 | 47 | #define IBV_INT_CHECK_RET_ERRNO(call, success_retval, name) \ 48 | int ret = call; \ 49 | if (ret != success_retval) { \ 50 | WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ 51 | return ncclSystemError; \ 52 | } \ 53 | return ncclSuccess; 54 | 55 | #define IBV_INT_CHECK(call, error_retval, name) \ 56 | int ret = call; \ 57 | if (ret == error_retval) { \ 58 | WARN("Call to " name " failed"); \ 59 | return ncclSystemError; \ 60 | } \ 61 | return ncclSuccess; 62 | 63 | #define IBV_PASSTHRU(call) \ 64 | call; \ 65 | return ncclSuccess; 66 | 67 | NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0); 68 | NCCL_PARAM(IbMQpRetryCnt, 
"IB_MQP_RETRY_CNT", 34); 69 | NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds 70 | 71 | #define IBV_ERR_EQ(e, code) (e == code || e == (-code)) 72 | #define IBV_MQP_RETRY_ERRNO(e) (IBV_ERR_EQ(e, ETIMEDOUT)) 73 | #define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e)) 74 | 75 | ncclResult_t wrap_ibv_fork_init() { 76 | IBV_INT_CHECK(ibv_fork_init(), -1, "ibv_fork_init"); 77 | } 78 | 79 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) { 80 | *ret = ibv_get_device_list(num_devices); 81 | if (*ret == NULL) *num_devices = 0; 82 | return ncclSuccess; 83 | } 84 | 85 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) { 86 | IBV_PASSTHRU(ibv_free_device_list(list)); 87 | } 88 | 89 | const char *wrap_ibv_get_device_name(struct ibv_device *device) { 90 | return ibv_get_device_name(device); 91 | } 92 | 93 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/ 94 | IBV_PTR_CHECK(ibv_open_device(device), *ret, NULL, "ibv_open_device"); 95 | } 96 | 97 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/ 98 | IBV_INT_CHECK(ibv_close_device(context), -1, "ibv_close_device"); 99 | } 100 | 101 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/ 102 | IBV_INT_CHECK(ibv_get_async_event(context, event), -1, "ibv_get_async_event"); 103 | } 104 | 105 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) { 106 | IBV_PASSTHRU(ibv_ack_async_event(event)); 107 | } 108 | 109 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 110 | IBV_INT_CHECK_RET_ERRNO(ibv_query_device(context, 
device_attr), 0, "ibv_query_device"); 111 | } 112 | 113 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 114 | IBV_INT_CHECK_RET_ERRNO(ibv_query_port(context, port_num, port_attr), 0, "ibv_query_port"); 115 | } 116 | 117 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { 118 | IBV_INT_CHECK_RET_ERRNO(ibv_query_gid(context, port_num, index, gid), 0, "ibv_query_gid"); 119 | } 120 | 121 | ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { 122 | IBV_INT_CHECK_RET_ERRNO(ibv_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp"); 123 | } 124 | 125 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) { 126 | IBV_PTR_CHECK(ibv_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); 127 | } 128 | 129 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 130 | IBV_INT_CHECK_RET_ERRNO(ibv_dealloc_pd(pd), 0, "ibv_dealloc_pd"); 131 | } 132 | 133 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) { 134 | IBV_PTR_CHECK(ibv_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); 135 | } 136 | 137 | struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { 138 | return ibv_reg_mr(pd, addr, length, access); 139 | } 140 | 141 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { 142 | #if HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING 143 | IBV_PTR_CHECK(ibv_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); 144 | #else 145 | return ncclSystemError; 146 | #endif 
147 | } 148 | 149 | /* DMA-BUF support */ 150 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { 151 | #if HAVE_DECL_IBV_REG_DMABUF_MR 152 | IBV_PTR_CHECK_ERRNO(ibv_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); 153 | #else 154 | return ncclSystemError; 155 | #endif 156 | } 157 | 158 | struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { 159 | #if HAVE_DECL_IBV_REG_DMABUF_MR 160 | return ibv_reg_dmabuf_mr(pd, offset, length, iova, fd, access); 161 | #else 162 | errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set 163 | return NULL; 164 | #endif 165 | } 166 | 167 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 168 | IBV_INT_CHECK_RET_ERRNO(ibv_dereg_mr(mr), 0, "ibv_dereg_mr"); 169 | } 170 | 171 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { 172 | IBV_PTR_CHECK_ERRNO(ibv_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); 173 | } 174 | 175 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) { 176 | IBV_INT_CHECK_RET_ERRNO(ibv_destroy_cq(cq), 0, "ibv_destroy_cq"); 177 | } 178 | 179 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) { 180 | IBV_INT_CHECK_RET_ERRNO(ibv_destroy_qp(qp), 0, "ibv_destroy_qp"); 181 | } 182 | 183 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { 184 | IBV_PTR_CHECK_ERRNO(ibv_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); 185 | } 186 | 187 | static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) { 188 | switch (state) { 189 | case (IBV_QPS_RESET): 
snprintf(msg, len, "RESET"); break; 190 | case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break; 191 | case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break; 192 | case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break; 193 | case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break; 194 | case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break; 195 | case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break; 196 | case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break; 197 | default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break; 198 | } 199 | } 200 | 201 | #define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? (userAttr) : (attr)) 202 | 203 | static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) { 204 | ncclResult_t res; 205 | int portNum = -1, gidIndex = -1; 206 | char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN]; 207 | const char *localGidRes = NULL, *remoteGidRes = NULL; 208 | 209 | char nextState[32], currState[32]; 210 | ibvQpStateName(qp->state, currState, sizeof(currState)); 211 | ibvQpStateName(qpState, nextState, sizeof(nextState)); 212 | char devName[IBV_SYSFS_NAME_MAX] = ""; 213 | snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A"); 214 | 215 | struct ibv_qp_attr attr; 216 | struct ibv_qp_init_attr init_attr; 217 | int attr_mask = IBV_QP_PORT | IBV_QP_AV; 218 | res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr); 219 | struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL; 220 | 221 | // port info, portAttr can be NULL if not given by the user and query_qp failed 222 | struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT); 223 | portNum = portAttr ? 
portAttr->port_num : -1; 224 | 225 | // address info, avAttr can be NULL if not given by the user and query_qp failed 226 | struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV); 227 | if (avAttr && avAttr->ah_attr.is_global) { 228 | union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid; 229 | remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName)); 230 | // we need pd->context to retrieve local GID, skip if not there 231 | if (!qp->pd->context) goto print; 232 | gidIndex = avAttr->ah_attr.grh.sgid_index; 233 | union ibv_gid localGid; 234 | NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print); 235 | localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName)); 236 | } 237 | print: 238 | snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s", 239 | devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? 
remoteGidName : "N/A"); 240 | return; 241 | } 242 | 243 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) { 244 | char qpMsg[1024]; 245 | int ret = 0, attempts = 0; 246 | int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1 247 | int timeOut = (int)ncclParamIbMQpRetryTimeout(); 248 | do { 249 | if (attempts > 0) { 250 | unsigned int sleepTime = timeOut * attempts; 251 | ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg)); 252 | INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret), qpMsg, attempts, maxCnt, sleepTime); 253 | // sleep before retrying 254 | struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)}; 255 | nanosleep(&tv, NULL); 256 | } 257 | ret = ibv_modify_qp(qp, attr, attr_mask); 258 | attempts++; 259 | } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt); 260 | if (ret != 0) { 261 | ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg)); 262 | WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret), qpMsg); 263 | return ncclSystemError; 264 | } 265 | return ncclSuccess; 266 | } 267 | 268 | ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { 269 | IBV_INT_CHECK_RET_ERRNO(qp->context->ops.post_send(qp, wr, bad_wr), 0, "ibv_post_send"); 270 | } 271 | 272 | ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { 273 | IBV_INT_CHECK_RET_ERRNO(qp->context->ops.post_recv(qp, wr, bad_wr), 0, "ibv_post_recv"); 274 | return ncclSuccess; 275 | } 276 | 277 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 278 | #if HAVE_DECL_IBV_QUERY_ECE 279 | 
IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_query_ece(qp, ece), 0, "ibv_query_ece", supported); 280 | #else 281 | INFO(NCCL_NET, "Call to ibv_query_ece is skipped, doesn't exist"); 282 | *supported = 0; 283 | return ncclSuccess; 284 | #endif 285 | } 286 | 287 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ 288 | #if HAVE_DECL_IBV_SET_ECE 289 | IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_set_ece(qp, ece), 0, "ibv_set_ece", supported); 290 | #else 291 | INFO(NCCL_NET, "Call to ibv_set_ece skipped, doesn't exist"); 292 | *supported = 0; 293 | return ncclSuccess; 294 | #endif 295 | } 296 | 297 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) { 298 | *ret = (char *) ibv_event_type_str(event); 299 | return ncclSuccess; 300 | } 301 | -------------------------------------------------------------------------------- /src/param.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #include "param.h" 10 | #include "debug.h" 11 | 12 | //#include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #define MIN(a, b) ((a)<(b)?(a):(b)) 23 | const char* userHomeDir() { 24 | struct passwd *pwUser = getpwuid(getuid()); 25 | return pwUser == NULL ? 
NULL : pwUser->pw_dir; 26 | } 27 | 28 | void setEnvFile(const char* fileName) { 29 | FILE * file = fopen(fileName, "r"); 30 | if (file == NULL) return; 31 | 32 | char *line = NULL; 33 | char envVar[1024]; 34 | char envValue[1024]; 35 | size_t n = 0; 36 | ssize_t read; 37 | while ((read = getline(&line, &n, file)) != -1) { 38 | if (line[0] == '#') continue; 39 | if (line[read-1] == '\n') line[read-1] = '\0'; 40 | int s=0; // Env Var Size 41 | while (line[s] != '\0' && line[s] != '=') s++; 42 | if (line[s] == '\0') continue; 43 | strncpy(envVar, line, MIN(1023,s)); 44 | envVar[MIN(1023,s)] = '\0'; 45 | s++; 46 | strncpy(envValue, line+s, 1023); 47 | envValue[1023]='\0'; 48 | setenv(envVar, envValue, 0); 49 | //printf("%s : %s->%s\n", fileName, envVar, envValue); 50 | } 51 | if (line) free(line); 52 | fclose(file); 53 | } 54 | 55 | static void initEnvFunc() { 56 | char confFilePath[1024]; 57 | const char* userFile = getenv("NCCL_CONF_FILE"); 58 | if (userFile && strlen(userFile) > 0) { 59 | snprintf(confFilePath, sizeof(confFilePath), "%s", userFile); 60 | setEnvFile(confFilePath); 61 | } else { 62 | const char* userDir = userHomeDir(); 63 | if (userDir) { 64 | snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir); 65 | setEnvFile(confFilePath); 66 | } 67 | } 68 | snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf"); 69 | setEnvFile(confFilePath); 70 | } 71 | 72 | void initEnv() { 73 | static pthread_once_t once = PTHREAD_ONCE_INIT; 74 | pthread_once(&once, initEnvFunc); 75 | } 76 | 77 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { 78 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 79 | pthread_mutex_lock(&mutex); 80 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { 81 | const char* str = ncclGetEnv(env); 82 | int64_t value = deftVal; 83 | if (str && strlen(str) > 0) { 84 | errno = 0; 85 | value = strtoll(str, NULL, 0); 86 | if (errno) { 87 | value = deftVal; 88 | 
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); 89 | } else { 90 | INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); 91 | } 92 | } 93 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 94 | } 95 | pthread_mutex_unlock(&mutex); 96 | } 97 | 98 | const char* ncclGetEnv(const char* name) { 99 | initEnv(); 100 | return getenv(name); 101 | } 102 | -------------------------------------------------------------------------------- /src/ucx_uct_plugin.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 3 | * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * See LICENSE.txt for license information 7 | ************************************************************************/ 8 | 9 | #include "ucx_uct_lib.h" 10 | 11 | typedef enum { 12 | NCCL_UCT_REQ_IRECV = -1, 13 | NCCL_UCT_REQ_IFLUSH = -2 14 | } nccl_uct_request_type_t; 15 | 16 | struct nccl_uct_rdesc; 17 | 18 | /* On-the-wire descriptor of a posted receive request entry */ 19 | typedef struct { 20 | int tag; 21 | int size; 22 | void *data; 23 | int matched; 24 | uct_rkey_t rkey; 25 | } nccl_uct_chunk_t; 26 | 27 | /* On-the-wire descriptor of a receive request containing many chunks */ 28 | typedef struct { 29 | uint64_t id; 30 | uint16_t count; 31 | uint32_t size; 32 | struct nccl_uct_rdesc *peer_rdesc; /* Acts as a cookie along with id */ 33 | nccl_uct_chunk_t chunk[]; 34 | } nccl_uct_rdesc_hdr_t; 35 | 36 | /* On-the-wire descriptor for receive request completion */ 37 | typedef struct { 38 | uint64_t id; 39 | struct nccl_uct_rdesc *rdesc; 40 | int count; /* Number of sizes contained */ 41 | int sizes[NCCL_UCX_UCT_MAX_RECVS]; 42 | } nccl_uct_atp_t; 43 | 44 | /* 45 | * NCCL local request handler to 
progress: 46 | * - size -1 for multi receive 47 | * - size -2 for flush 48 | * - size > 0 for send 49 | */ 50 | typedef struct { 51 | /* Pending GET (iflush) PUT (isend) or receiving one ATP (irecv) */ 52 | uct_completion_t completion; 53 | int size; 54 | struct nccl_uct_rdesc *rdesc; 55 | } nccl_uct_req_t; 56 | 57 | /* Pending receive descriptor either on the receive or sending side */ 58 | typedef struct nccl_uct_rdesc { 59 | int nccl_usage; /* NCCL requests not finished/started */ 60 | int send_atp; /* >1 pending isend, ==1 pending atp send */ 61 | 62 | union { 63 | ucs_list_link_t list; /* comm's linked list */ 64 | struct nccl_uct_rdesc *next; /* inserted in free list */ 65 | }; 66 | 67 | struct nccl_uct_wr_comm *comm; 68 | nccl_uct_rdesc_hdr_t desc; 69 | nccl_uct_chunk_t storage[NCCL_UCX_UCT_MAX_RECVS]; /* Don't use directly */ 70 | nccl_uct_req_t reqs[NCCL_UCX_UCT_MAX_RECVS]; /* NCCL requests */ 71 | int sizes[NCCL_UCX_UCT_MAX_RECVS]; /* ATP received sizes */ 72 | } nccl_uct_rdesc_t; 73 | 74 | typedef struct nccl_uct_wr_comm { 75 | nccl_uct_comm_t base; 76 | 77 | int rdesc_alloc; /* Track allocated rdescs */ 78 | nccl_uct_rdesc_t *free_rdesc; /* Available rdesc for reuse */ 79 | uint64_t rdesc_id; /* Next sequence number to use */ 80 | 81 | /* Received RTRs: used by Sender communicator in ->isend() */ 82 | ucs_list_link_t rdesc_list; 83 | 84 | } nccl_uct_wr_comm_t; 85 | 86 | static inline nccl_uct_wr_comm_t * 87 | nccl_uct_wr_comm_get(nccl_uct_comm_t *base_comm) { 88 | return ucs_container_of(base_comm, nccl_uct_wr_comm_t, base); 89 | } 90 | 91 | static nccl_uct_rdesc_t *nccl_uct_comm_rdesc_get(nccl_uct_wr_comm_t *comm) { 92 | nccl_uct_rdesc_t *rdesc = comm->free_rdesc; 93 | 94 | if (rdesc == NULL) { 95 | rdesc = calloc(1, sizeof(*rdesc)); 96 | } else { 97 | comm->free_rdesc = rdesc->next; 98 | } 99 | 100 | rdesc->next = NULL; 101 | rdesc->comm = comm; 102 | comm->rdesc_alloc++; 103 | return rdesc; 104 | } 105 | 106 | static size_t nccl_uct_rdesc_size(int n) 
{ 107 | return n * sizeof(nccl_uct_chunk_t) + sizeof(nccl_uct_rdesc_hdr_t); 108 | } 109 | 110 | /* Prepare a receive descriptor from irecv()/iflush() side */ 111 | static void nccl_uct_rdesc_set(nccl_uct_rdesc_t *rdesc, uint64_t id, int n, 112 | void **data, size_t *sizes, int *tags, 113 | nccl_uct_memh_t **uct_memh) { 114 | nccl_uct_rdesc_hdr_t *desc = &rdesc->desc; 115 | int i; 116 | 117 | /* Populate header */ 118 | desc->id = id; 119 | desc->count = n; 120 | desc->size = nccl_uct_rdesc_size(n); 121 | desc->peer_rdesc = rdesc; /* cookie, will be returned in ATP */ 122 | 123 | /* Ref count that prevents NCCL from releasing memory */ 124 | rdesc->nccl_usage = 1; 125 | rdesc->send_atp = 0; 126 | 127 | /* Zero (iflush) or one or many receive request are contained */ 128 | for (i = 0; i < n; i++) { 129 | desc->chunk[i].tag = tags[i]; 130 | desc->chunk[i].size = sizes[i]; 131 | desc->chunk[i].data = data[i]; 132 | desc->chunk[i].matched = 0; 133 | desc->chunk[i].rkey = uct_memh[i]->bundle.rkey; 134 | } 135 | } 136 | 137 | static nccl_uct_req_t *nccl_uct_rdesc_get_req(nccl_uct_rdesc_t *rdesc, int i, 138 | int size) { 139 | nccl_uct_req_t *req; 140 | 141 | assert(i < NCCL_UCX_UCT_MAX_RECVS); 142 | 143 | req = &rdesc->reqs[i]; 144 | req->size = size; 145 | req->rdesc = rdesc; 146 | 147 | req->completion.func = nccl_uct_empty_callback; 148 | req->completion.count = 1; 149 | req->completion.status = UCS_OK; 150 | 151 | return &rdesc->reqs[i]; 152 | } 153 | 154 | static void nccl_uct_comm_rdesc_put(nccl_uct_rdesc_t *rdesc) { 155 | nccl_uct_wr_comm_t *comm = rdesc->comm; 156 | 157 | assert(comm != NULL); 158 | 159 | rdesc->desc.id = -1; 160 | rdesc->comm = NULL; 161 | rdesc->next = comm->free_rdesc; 162 | comm->free_rdesc = rdesc; 163 | comm->rdesc_alloc--; 164 | } 165 | 166 | /* On receiver side, after ->irecv(), expect corresponding ATP */ 167 | static ucs_status_t nccl_uct_atp_callback(void *arg, void *data, size_t length, 168 | unsigned flags) { 169 | nccl_uct_atp_t *atp 
= (nccl_uct_atp_t*)((uint8_t*)data + 8); 170 | 171 | assert(length == (sizeof(*atp) + 8)); 172 | assert(*(nccl_uct_comm_t**)data == &atp->rdesc->comm->base); 173 | assert(atp->id == atp->rdesc->desc.id); 174 | assert(atp->count == atp->rdesc->desc.count); 175 | assert(atp->rdesc->reqs[0].completion.count == 1); 176 | 177 | atp->rdesc->reqs[0].completion.count--; 178 | memcpy(atp->rdesc->sizes, atp->sizes, atp->count * sizeof(*atp->sizes)); 179 | return UCS_OK; 180 | } 181 | 182 | /* On sender side, asynchronously receive rdesc/RTR, later used by ->isend() */ 183 | static ucs_status_t nccl_uct_rtr_callback(void *arg, void *data, size_t length, 184 | unsigned flags) { 185 | nccl_uct_comm_t *base_comm = *(nccl_uct_comm_t **)data; 186 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(base_comm); 187 | nccl_uct_rdesc_hdr_t *desc = (nccl_uct_rdesc_hdr_t*)((uint8_t*)data + 8); 188 | size_t size = desc->size; 189 | nccl_uct_rdesc_t *rdesc; 190 | 191 | rdesc = nccl_uct_comm_rdesc_get(comm); 192 | if (rdesc == NULL) { 193 | WARN("Failed to get an rdesc in RTR callback"); 194 | return UCS_ERR_NO_MEMORY; /* Cannot happend */ 195 | } 196 | 197 | ucs_list_add_tail(&comm->rdesc_list, &rdesc->list); 198 | 199 | assert((size + 8) == length); 200 | assert(size == nccl_uct_rdesc_size(desc->count)); 201 | 202 | memcpy(&rdesc->desc, desc, size); 203 | rdesc->nccl_usage = desc->count; 204 | rdesc->send_atp = desc->count + 1; 205 | return UCS_OK; 206 | } 207 | 208 | static ncclResult_t nccl_uct_wr_iface_set(nccl_uct_iface_t *uct_iface) { 209 | NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_RTR, 210 | nccl_uct_rtr_callback)); 211 | NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_ATP, 212 | nccl_uct_atp_callback)); 213 | return ncclSuccess; 214 | } 215 | 216 | static ncclResult_t nccl_uct_wr_comm_alloc(nccl_uct_comm_t **comm_p) { 217 | nccl_uct_wr_comm_t *comm = calloc(1, sizeof(nccl_uct_wr_comm_t)); 218 | if (comm != NULL) { 219 | *comm_p = &comm->base; 220 | 
return ncclSuccess; 221 | } 222 | 223 | return ncclSystemError; 224 | } 225 | 226 | static ncclResult_t nccl_uct_wr_comm_init(nccl_uct_comm_t *base_comm, 227 | nccl_uct_context_t *context, 228 | nccl_uct_worker_t *worker, int dev, 229 | const nccl_uct_comm_t *remote_comm) { 230 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(base_comm); 231 | 232 | ucs_list_head_init(&comm->rdesc_list); 233 | return nccl_uct_comm_init(&comm->base, context, worker, dev, remote_comm); 234 | } 235 | 236 | static ncclResult_t nccl_uct_wr_init(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { 237 | context.ops.comm_alloc = nccl_uct_wr_comm_alloc; 238 | context.ops.comm_init = nccl_uct_wr_comm_init; 239 | context.ops.iface_set = nccl_uct_wr_iface_set; 240 | context.am_short_size = nccl_uct_rdesc_size(NCCL_UCX_UCT_MAX_RECVS); 241 | context.rkey_size = sizeof(((nccl_uct_chunk_t*)0)->rkey); 242 | 243 | return nccl_p2p_ib_init(&context.dev_count, &context.merge_dev_count, ncclIbDevs, context.if_name, 244 | &context.if_addr, NULL, logFunction); 245 | } 246 | 247 | /* Outcome is either send_atp equal to 1 or 0 */ 248 | static void nccl_uct_send_atp(nccl_uct_wr_comm_t *comm, 249 | nccl_uct_rdesc_t *rdesc) { 250 | ucs_status_t status; 251 | nccl_uct_atp_t atp; 252 | int i; 253 | 254 | assert(rdesc->send_atp == 1); 255 | 256 | status = uct_ep_fence(comm->base.uct_ep->ep, 0); 257 | if (status != UCS_OK) { 258 | return; 259 | } 260 | 261 | atp.id = rdesc->desc.id; 262 | atp.rdesc = rdesc->desc.peer_rdesc; 263 | atp.count = rdesc->desc.count; 264 | 265 | /* Sizes from isend() are lower or equal to their irecv() side */ 266 | for (i = 0; i < rdesc->desc.count; i++) { 267 | atp.sizes[i] = rdesc->reqs[i].size; 268 | } 269 | 270 | status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_ATP, 271 | (uint64_t)comm->base.remote.comm, &atp, sizeof(atp)); 272 | if (status == UCS_OK) { 273 | rdesc->send_atp = 0; 274 | } 275 | } 276 | 277 | static ncclResult_t 
nccl_uct_send(nccl_uct_wr_comm_t *comm, void *data, 278 | int size, nccl_uct_memh_t *uct_memh, 279 | nccl_uct_rdesc_t *rdesc, int i, 280 | void **request) { 281 | ucs_status_t status; 282 | uct_iov_t iov; 283 | nccl_uct_req_t *req; 284 | 285 | *request = NULL; 286 | 287 | /* Details for local data */ 288 | iov.buffer = data; 289 | iov.length = size; 290 | iov.memh = uct_memh->memh; 291 | iov.stride = iov.length; 292 | iov.count = 1; 293 | 294 | assert(size <= rdesc->desc.chunk[i].size); 295 | 296 | req = nccl_uct_rdesc_get_req(rdesc, i, size); /* NCCL request */ 297 | 298 | status = uct_ep_put_zcopy(comm->base.uct_ep->ep, &iov, 1, 299 | (uint64_t)rdesc->desc.chunk[i].data, 300 | rdesc->desc.chunk[i].rkey, &req->completion); 301 | 302 | if (status == UCS_OK) { 303 | req->completion.count--; 304 | } else if (status != UCS_INPROGRESS) { 305 | return ncclSuccess; 306 | } 307 | 308 | rdesc->desc.chunk[i].matched = 1; 309 | --rdesc->send_atp; 310 | 311 | if (rdesc->send_atp == 1) { 312 | ucs_list_del(&rdesc->list); /* all ->isend() were now matched */ 313 | nccl_uct_send_atp(comm, rdesc); 314 | } 315 | 316 | *request = req; 317 | return ncclSuccess; 318 | } 319 | 320 | static ncclResult_t nccl_uct_wr_isend(void *send_comm, void *data, size_t size, 321 | int tag, void *mhandle, void* phandle, void **request) { 322 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(send_comm); 323 | nccl_uct_rdesc_t *rdesc; 324 | int i; 325 | 326 | *request = NULL; 327 | 328 | ucs_list_for_each(rdesc, &comm->rdesc_list, list) { 329 | for (i = 0; i < rdesc->desc.count; i++) { 330 | if (rdesc->desc.chunk[i].matched || (rdesc->desc.chunk[i].tag != tag)) { 331 | continue; 332 | } 333 | 334 | return nccl_uct_send(comm, data, size, mhandle, rdesc, i, request); 335 | } 336 | } 337 | 338 | /* Progress here to make sure we receive non-solicited RTRs */ 339 | uct_worker_progress(comm->base.uct_worker->worker); 340 | return ncclSuccess; 341 | } 342 | 343 | static ncclResult_t nccl_uct_wr_irecv(void 
*recv_comm, int n, void **data, 344 | size_t *sizes, int *tags, void **mhandles, 345 | void** phandles, void **request) { 346 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(recv_comm); 347 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandles; 348 | nccl_uct_rdesc_t *rdesc; 349 | ucs_status_t status; 350 | 351 | assert(n <= NCCL_UCX_UCT_MAX_RECVS); 352 | 353 | rdesc = nccl_uct_comm_rdesc_get(comm); 354 | if (rdesc == NULL) { 355 | return ncclInternalError; 356 | } 357 | 358 | nccl_uct_rdesc_set(rdesc, comm->rdesc_id++, n, data, sizes, tags, uct_memh); 359 | 360 | status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_RTR, 361 | (uint64_t)comm->base.remote.comm, &rdesc->desc, 362 | nccl_uct_rdesc_size(n)); 363 | if (status != UCS_OK) { 364 | nccl_uct_comm_rdesc_put(rdesc); 365 | *request = NULL; 366 | } else { 367 | /* Wait for receiving ATP */ 368 | *request = nccl_uct_rdesc_get_req(rdesc, 0, NCCL_UCT_REQ_IRECV); 369 | } 370 | 371 | return ncclSuccess; 372 | } 373 | 374 | static ncclResult_t nccl_uct_wr_iflush(void *recv_comm, int n, void **data, 375 | int *sizes, void **mhandle, 376 | void **request) { 377 | nccl_uct_comm_t *base_comm = recv_comm; 378 | int last = nccl_uct_flush_index(base_comm, sizes, n); 379 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandle; 380 | nccl_uct_rdesc_t *rdesc; 381 | nccl_uct_req_t *req; 382 | ncclResult_t result; 383 | 384 | if (last == -1) { 385 | return ncclSuccess; 386 | } 387 | 388 | rdesc = nccl_uct_comm_rdesc_get(nccl_uct_wr_comm_get(base_comm)); 389 | if (rdesc == NULL) { 390 | return ncclInternalError; 391 | } 392 | 393 | nccl_uct_rdesc_set(rdesc, ~0, 0, NULL, NULL, NULL, NULL); 394 | /* Wait for local GET completion */ 395 | req = nccl_uct_rdesc_get_req(rdesc, 0, NCCL_UCT_REQ_IFLUSH); 396 | *request = req; 397 | 398 | result = nccl_uct_flush(base_comm, data[last], sizes[last], uct_memh[last], 399 | &req->completion, request); 400 | if (*request == NULL) { 401 | nccl_uct_comm_rdesc_put(rdesc); 402 | } 403 
| 404 | return result; 405 | } 406 | 407 | static ncclResult_t nccl_uct_wr_test(void *request, int *done, int *sizes) { 408 | nccl_uct_req_t *req = request; 409 | nccl_uct_rdesc_t *rdesc = req->rdesc; 410 | nccl_uct_wr_comm_t *comm = rdesc->comm; 411 | 412 | uct_worker_progress(comm->base.uct_worker->worker); 413 | 414 | *done = 0; 415 | 416 | if (rdesc->send_atp == 1) { 417 | /* Slowpath */ 418 | nccl_uct_send_atp(comm, rdesc); 419 | 420 | if (rdesc->send_atp && rdesc->nccl_usage == 1) { 421 | /* Keep the last isend request until ATP is out */ 422 | return ncclSuccess; 423 | } 424 | } 425 | 426 | if (req->completion.count > 0) { 427 | return ncclSuccess; 428 | } 429 | 430 | *done = 1; 431 | 432 | if (req->size == NCCL_UCT_REQ_IRECV) { 433 | assert(&rdesc->reqs[0] == req); 434 | if (sizes != NULL) { 435 | memcpy(sizes, rdesc->sizes, rdesc->desc.count * sizeof(*sizes)); 436 | } 437 | } else if (req->size == NCCL_UCT_REQ_IFLUSH) { 438 | assert(&rdesc->reqs[0] == req); 439 | } else { 440 | /* ->isend() request */ 441 | assert(req->size > -1); 442 | if (sizes != NULL) { 443 | sizes[0] = req->size; 444 | } 445 | } 446 | 447 | if (--rdesc->nccl_usage < 1) { 448 | assert(rdesc->send_atp == 0); 449 | assert(rdesc->nccl_usage == 0); 450 | nccl_uct_comm_rdesc_put(rdesc); 451 | } 452 | 453 | return ncclSuccess; 454 | } 455 | 456 | static ncclResult_t nccl_uct_wr_close(void *close_comm) { 457 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(close_comm); 458 | nccl_uct_rdesc_t *rdesc; 459 | 460 | nccl_uct_comm_deinit(close_comm); 461 | 462 | while ((rdesc = comm->free_rdesc) != NULL) { 463 | comm->free_rdesc = rdesc->next; 464 | free(rdesc); 465 | } 466 | 467 | assert(ucs_list_is_empty(&comm->rdesc_list)); 468 | assert(comm->rdesc_alloc == 0); 469 | free(comm); 470 | return ncclSuccess; 471 | } 472 | 473 | 474 | static ncclResult_t nccl_uct_wr_init_v9(ncclDebugLogger_t logFunction) { 475 | return nccl_uct_wr_init(logFunction, NULL); 476 | } 477 | 478 | static ncclResult_t 
nccl_uct_wr_isend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { 479 | return nccl_uct_wr_isend(sendComm, data, size, tag, mhandle, NULL, request); 480 | } 481 | 482 | static ncclResult_t nccl_uct_wr_irecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { 483 | return nccl_uct_wr_irecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); 484 | } 485 | 486 | static ncclResult_t nccl_uct_wr_isend_v8(void *send_comm, void *data, int size, 487 | int tag, void *mhandle, void **request) { 488 | return nccl_uct_wr_isend_v9(send_comm, data, (size_t)size, tag, mhandle, request); 489 | } 490 | 491 | static ncclResult_t nccl_uct_wr_irecv_v8(void *recv_comm, int n, void **data, 492 | int *sizes, int *tags, void **mhandles, 493 | void **request) { 494 | size_t sizes_sizet[NCCL_NET_IB_MAX_RECVS]; 495 | for (int i=0; isend_rts == 0); 83 | assert(req->rts_count == req->count); 84 | assert(req->completion.count == 1); 85 | 86 | status = uct_ep_am_short(req->comm->base.uct_ep->ep, NCCL_UCT_AM_ATS, 87 | (uint64_t)req->comm->base.remote.comm, 88 | req->remote_req, 89 | sizeof(*req->remote_req) * req->rts_count); 90 | if (status == UCS_OK) { 91 | req->completion.count--; 92 | } 93 | } 94 | 95 | static void nccl_uct_rd_pending_add(nccl_uct_rd_comm_t *comm, 96 | nccl_uct_mem_t *src, nccl_uct_mem_t *dst) { 97 | nccl_uct_rd_req_t *req = dst->req; 98 | nccl_uct_get_param_t *param; 99 | 100 | assert(src->size <= dst->size); 101 | assert(req->rts_count < NCCL_UCX_UCT_MAX_RECVS); 102 | 103 | req->sizes[dst->index] = src->size; 104 | req->remote_req[req->rts_count++] = src->req; /* src->req is a cookie */ 105 | 106 | if (src->size == 0) { 107 | req->completion.count--; 108 | return; 109 | } 110 | 111 | param = &comm->pending.param[comm->pending.last & NCCL_UCT_PENDING_MASK]; 112 | comm->pending.last++; 113 | 114 | assert((comm->pending.first & NCCL_UCT_PENDING_MASK) != 115 | (comm->pending.last & 
NCCL_UCT_PENDING_MASK)); 116 | 117 | param->iov.buffer = dst->data; 118 | param->iov.length = src->size; 119 | param->iov.memh = dst->u.uct_memh->memh; 120 | param->iov.stride = 0; 121 | param->iov.count = 1; 122 | param->rva = (uint64_t)src->data; 123 | param->rkey = src->u.rkey; 124 | param->req = req; 125 | } 126 | 127 | static void nccl_uct_rd_pending_drain(nccl_uct_rd_comm_t *comm) { 128 | ucs_status_t status; 129 | nccl_uct_get_param_t *param; 130 | 131 | for (; comm->pending.first != comm->pending.last; comm->pending.first++) { 132 | param = &comm->pending.param[comm->pending.first & NCCL_UCT_PENDING_MASK]; 133 | 134 | status = uct_ep_get_zcopy(comm->base.uct_ep->ep, ¶m->iov, 1, param->rva, 135 | param->rkey, ¶m->req->completion); 136 | if (status == UCS_OK) { 137 | param->req->completion.count--; 138 | } else if (status != UCS_INPROGRESS) { 139 | break; 140 | } 141 | 142 | if (param->req->completion.count == 1) { 143 | nccl_uct_rd_send_ats(param->req); 144 | } 145 | } 146 | } 147 | 148 | static ucs_status_t nccl_uct_rd_ats_callback(void *arg, void *data, 149 | size_t length, unsigned flags) { 150 | nccl_uct_rd_req_t **req = (nccl_uct_rd_req_t **)((uint8_t *)data + 8); 151 | nccl_uct_rd_req_t **end = (nccl_uct_rd_req_t **)((uint8_t *)data + length); 152 | 153 | for (; req + 1 <= end; req++) { 154 | assert((*req)->completion.count == 1); 155 | assert((*req)->comm == nccl_uct_rd_comm_get(*(nccl_uct_comm_t**)data)); 156 | 157 | (*req)->completion.count = 0; 158 | } 159 | 160 | assert(req == end); 161 | return UCS_OK; 162 | } 163 | 164 | static ucs_status_t nccl_uct_rd_rts_callback(void *arg, void *data, 165 | size_t length, unsigned flags) { 166 | 167 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(*(nccl_uct_comm_t**)data); 168 | nccl_uct_mem_t *rts = (nccl_uct_mem_t *)((uint8_t *)data + 8); 169 | nccl_uct_ring_t *exp; 170 | nccl_uct_mem_t *dst; 171 | unsigned i; 172 | 173 | assert(length == (sizeof(*rts) + 8)); 174 | 175 | /* Do we already expect it? 
*/ 176 | exp = &comm->exp; 177 | i = nccl_uct_ring_find(exp, rts->tag); 178 | if (i == exp->last) { 179 | nccl_uct_ring_append(&comm->unexp, rts->tag, rts, sizeof(*rts)); 180 | } else { 181 | /* Receive request was already posted */ 182 | dst = nccl_uct_ring_get_entry(exp, i); 183 | nccl_uct_rd_pending_add(comm, rts, dst); 184 | nccl_uct_ring_consume(exp, i); 185 | } 186 | 187 | return UCS_OK; 188 | } 189 | 190 | static ncclResult_t nccl_uct_rd_iface_set(nccl_uct_iface_t *uct_iface) { 191 | NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_RTS, 192 | nccl_uct_rd_rts_callback)); 193 | NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_ATS, 194 | nccl_uct_rd_ats_callback)); 195 | return ncclSuccess; 196 | } 197 | 198 | static ncclResult_t nccl_uct_rd_comm_alloc(nccl_uct_comm_t **comm_p) { 199 | nccl_uct_rd_comm_t *comm = calloc(1, sizeof(*comm)); 200 | if (comm != NULL) { 201 | *comm_p = &comm->base; 202 | return ncclSuccess; 203 | } 204 | 205 | return ncclSystemError; 206 | } 207 | 208 | static ncclResult_t nccl_uct_rd_comm_init(nccl_uct_comm_t *base_comm, 209 | nccl_uct_context_t *context, 210 | nccl_uct_worker_t *worker, int dev, 211 | const nccl_uct_comm_t *remote_comm) { 212 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(base_comm); 213 | 214 | comm->pending.first = 0; 215 | comm->pending.last = 0; 216 | comm->req_count = 0; 217 | comm->free_req = NULL; 218 | 219 | NCCLCHECK(nccl_uct_ring_init(&comm->exp, sizeof(nccl_uct_mem_t))); 220 | NCCLCHECK(nccl_uct_ring_init(&comm->unexp, sizeof(nccl_uct_mem_t))); 221 | 222 | return nccl_uct_comm_init(&comm->base, context, worker, dev, remote_comm); 223 | } 224 | 225 | static ncclResult_t nccl_uct_rd_init(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { 226 | NCCL_STATIC_ASSERT(NCCL_UCT_RING_SIZE >= 2 * MAX_REQUESTS, 227 | "Cannot handle expected/unexpected requests"); 228 | NCCL_STATIC_ASSERT(NCCL_UCT_PENDING_SIZE > MAX_REQUESTS, 229 | "Cannot handle enough pending requests"); 
230 | 231 | context.ops.comm_alloc = nccl_uct_rd_comm_alloc; 232 | context.ops.comm_init = nccl_uct_rd_comm_init; 233 | context.ops.iface_set = nccl_uct_rd_iface_set; 234 | context.rkey_size = sizeof(((nccl_uct_mem_t*)0)->u.rkey); 235 | context.am_short_size = sizeof(((nccl_uct_rd_req_t*)0)->remote_req); 236 | if (sizeof(nccl_uct_mem_t) > context.am_short_size) { 237 | context.am_short_size = sizeof(nccl_uct_mem_t); 238 | } 239 | 240 | return nccl_p2p_ib_init(&context.dev_count, &context.merge_dev_count, ncclIbDevs, context.if_name, 241 | &context.if_addr, NULL, logFunction); 242 | } 243 | 244 | static nccl_uct_rd_req_t *nccl_uct_rd_req_alloc(nccl_uct_rd_comm_t *comm, 245 | int count) { 246 | nccl_uct_rd_req_t *req = comm->free_req; 247 | 248 | if (req == NULL) { 249 | req = malloc(sizeof(*req)); 250 | if (req == NULL) { 251 | return req; 252 | } 253 | } else { 254 | comm->free_req = req->next; 255 | } 256 | 257 | comm->req_count++; 258 | req->comm = comm; 259 | req->completion.func = nccl_uct_empty_callback; 260 | req->completion.count = count; 261 | req->completion.status = UCS_OK; 262 | return req; 263 | } 264 | 265 | static inline void nccl_uct_rd_req_free(nccl_uct_rd_req_t *req) { 266 | req->next = req->comm->free_req; 267 | req->comm->free_req = req; 268 | req->comm->req_count--; 269 | } 270 | 271 | static ncclResult_t nccl_uct_rd_isend(void *send_comm, void *data, size_t size, 272 | int tag, void *mhandle, void* phandle, void **request) { 273 | 274 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(send_comm); 275 | nccl_uct_memh_t *uct_memh = mhandle; 276 | nccl_uct_mem_t rts; 277 | nccl_uct_rd_req_t *req; 278 | ucs_status_t status; 279 | 280 | req = nccl_uct_rd_req_alloc(comm, 1); 281 | if (req == NULL) { 282 | *request = NULL; 283 | return ncclSuccess; 284 | } 285 | 286 | req->send_rts = 1; 287 | req->count = 1; 288 | req->sizes[0] = size; 289 | *request = req; 290 | 291 | rts.tag = tag; 292 | rts.size = size; 293 | rts.data = data; 294 | rts.u.rkey = 
uct_memh->bundle.rkey; 295 | rts.req = req; 296 | 297 | status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_RTS, 298 | (uint64_t)comm->base.remote.comm, &rts, sizeof(rts)); 299 | if (status != UCS_OK) { 300 | nccl_uct_rd_req_free(req); 301 | *request = NULL; 302 | } 303 | 304 | return ncclSuccess; 305 | } 306 | 307 | static ncclResult_t nccl_uct_rd_irecv(void *recv_comm, int n, void **data, 308 | size_t *sizes, int *tags, void **mhandles, 309 | void** phandles, void **request) { 310 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(recv_comm); 311 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandles; 312 | nccl_uct_ring_t *unexp; 313 | nccl_uct_rd_req_t *req; 314 | nccl_uct_mem_t *rts, recv; 315 | unsigned i, j; 316 | 317 | assert(n <= NCCL_UCX_UCT_MAX_RECVS); 318 | 319 | /* Create a request */ 320 | req = nccl_uct_rd_req_alloc(comm, n + 1); 321 | *request = req; 322 | if (req == NULL) { 323 | return ncclSuccess; 324 | } 325 | 326 | req->send_rts = 0; 327 | req->count = n; 328 | req->rts_count = 0; 329 | 330 | /* Try to match or build expected list */ 331 | for (i = 0; i < n; i++) { 332 | recv.tag = tags[i]; 333 | recv.size = sizes[i]; 334 | recv.data = data[i]; 335 | recv.u.uct_memh = uct_memh[i]; 336 | recv.req = req; 337 | recv.index = i; 338 | 339 | unexp = &comm->unexp; 340 | j = nccl_uct_ring_find(unexp, tags[i]); 341 | if (j == unexp->last) { 342 | nccl_uct_ring_append(&comm->exp, tags[i], &recv, sizeof(recv)); 343 | } else { 344 | rts = nccl_uct_ring_get_entry(unexp, j); 345 | nccl_uct_rd_pending_add(comm, rts, &recv); 346 | nccl_uct_ring_consume(unexp, j); 347 | } 348 | } 349 | 350 | return ncclSuccess; 351 | } 352 | 353 | static ncclResult_t nccl_uct_rd_iflush(void *recv_comm, int n, void **data, 354 | int *sizes, void **mhandle, 355 | void **request) { 356 | ncclResult_t result = ncclSuccess; 357 | nccl_uct_comm_t *base_comm = recv_comm; 358 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandle; 359 | int last = 
nccl_uct_flush_index(base_comm, sizes, n); 360 | nccl_uct_rd_req_t *req; 361 | 362 | *request = NULL; 363 | 364 | if (last != -1) { 365 | req = nccl_uct_rd_req_alloc(nccl_uct_rd_comm_get(recv_comm), 1); 366 | if (req != NULL) { 367 | req->send_rts = -1; 368 | *request = req; 369 | 370 | result = nccl_uct_flush(base_comm, data[last], sizes[last], 371 | uct_memh[last], &req->completion, request); 372 | if (*request == NULL) { 373 | nccl_uct_rd_req_free(req); 374 | } 375 | } 376 | } 377 | 378 | return result; 379 | } 380 | 381 | static ncclResult_t nccl_uct_rd_test(void *request, int *done, int *sizes) { 382 | nccl_uct_rd_req_t *req = request; 383 | 384 | while (uct_worker_progress(req->comm->base.uct_worker->worker)) 385 | ; /* empty */ 386 | 387 | nccl_uct_rd_pending_drain(req->comm); 388 | 389 | if (req->completion.count > 0) { 390 | if ((req->send_rts == 0) && (req->completion.count == 1)) { 391 | nccl_uct_rd_send_ats(req); 392 | } 393 | 394 | if (req->completion.count > 0) { 395 | *done = 0; 396 | return ncclSuccess; 397 | } 398 | } 399 | 400 | if ((sizes != NULL) && (req->send_rts > -1)) { 401 | memcpy(sizes, req->sizes, req->count * sizeof(*req->sizes)); 402 | } 403 | 404 | *done = 1; 405 | nccl_uct_rd_req_free(req); 406 | return ncclSuccess; 407 | } 408 | 409 | static ncclResult_t nccl_uct_rd_close(void *close_comm) { 410 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(close_comm); 411 | nccl_uct_rd_req_t *req; 412 | 413 | nccl_uct_comm_deinit(close_comm); 414 | 415 | while ((req = comm->free_req) != NULL) { 416 | comm->free_req = req->next; 417 | free(req); 418 | } 419 | 420 | assert(nccl_uct_ring_is_empty(&comm->exp)); 421 | assert(nccl_uct_ring_is_empty(&comm->unexp)); 422 | assert(comm->req_count == 0); 423 | assert(comm->pending.first == comm->pending.last); 424 | 425 | nccl_uct_ring_deinit(&comm->exp); 426 | nccl_uct_ring_deinit(&comm->unexp); 427 | free(comm); 428 | return ncclSuccess; 429 | } 430 | 431 | static ncclResult_t 
nccl_uct_rd_init_v9(ncclDebugLogger_t logFunction) { 432 | return nccl_uct_rd_init(logFunction, NULL); 433 | } 434 | 435 | static ncclResult_t nccl_uct_rd_isend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { 436 | return nccl_uct_rd_isend(sendComm, data, size, tag, mhandle, NULL, request); 437 | } 438 | 439 | static ncclResult_t nccl_uct_rd_irecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { 440 | return nccl_uct_rd_irecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); 441 | } 442 | 443 | static ncclResult_t nccl_uct_rd_isend_v8(void *send_comm, void *data, int size, 444 | int tag, void *mhandle, void **request) { 445 | return nccl_uct_rd_isend_v9(send_comm, data, (size_t)size, tag, mhandle, request); 446 | } 447 | 448 | static ncclResult_t nccl_uct_rd_irecv_v8(void *recv_comm, int n, void **data, 449 | int *sizes, int *tags, void **mhandles, 450 | void **request) { 451 | size_t sizes_sizet[NCCL_NET_IB_MAX_RECVS]; 452 | for (int i=0; i 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "utils.h" 20 | #include "core.h" 21 | #include "param.h" 22 | 23 | // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be 24 | // allocated on separate pages as those pages will be marked DONTFORK 25 | // and if they are shared, that could cause a crash in a child process 26 | ncclResult_t ncclIbMalloc(void** ptr, size_t size) { 27 | size_t page_size = sysconf(_SC_PAGESIZE); 28 | void* p; 29 | int size_aligned = ROUNDUP(size, page_size); 30 | int ret = posix_memalign(&p, page_size, size_aligned); 31 | if (ret != 0) return ncclSystemError; 32 | memset(p, 0, size); 33 | *ptr = p; 34 | return ncclSuccess; 35 | } 36 | 37 | ncclResult_t ncclRealloc(void **ptr, size_t oldNelem, size_t nelem) { 38 | if (nelem < oldNelem) return ncclInternalError; 39 | if (nelem == oldNelem) return ncclSuccess; 40 | 41 | void* oldp = *ptr; 42 | void* p = (void*)malloc(nelem); 43 | if (p == NULL) { 44 | WARN("Failed to malloc %ld bytes", nelem); 45 | return ncclSystemError; 46 | } 47 | memcpy(p, oldp, oldNelem); 48 | free(oldp); 49 | memset(p+oldNelem, 0, (nelem-oldNelem)); 50 | *ptr = p; 51 | INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem, nelem, *ptr); 52 | return ncclSuccess; 53 | } 54 | 55 | 56 | int parseStringList(const char* string, struct netIf* ifList, int maxList) { 57 | if (!string) return 0; 58 | 59 | const char* ptr = string; 60 | 61 | int ifNum = 0; 62 | int ifC = 0; 63 | char c; 64 | do { 65 | c = *ptr; 66 | if (c == ':') { 67 | if (ifC > 0) { 68 | ifList[ifNum].prefix[ifC] = '\0'; 69 | ifList[ifNum].port = atoi(ptr+1); 70 | ifNum++; ifC = 0; 71 | } 72 | while (c != ',' && c != '\0') c = *(++ptr); 73 | } else if (c == ',' || c == '\0') { 74 | if (ifC > 0) { 75 | ifList[ifNum].prefix[ifC] = '\0'; 76 | ifList[ifNum].port = -1; 77 | ifNum++; ifC = 0; 78 | } 79 | } else { 80 | ifList[ifNum].prefix[ifC] = c; 81 | ifC++; 82 | } 83 | ptr++; 84 | } while (ifNum < maxList && c); 85 | return ifNum; 86 | } 87 | 88 | static int matchIf(const char* string, const char* ref, int matchExact) { 89 | // Make sure to include '\0' in the exact case 90 | int matchLen 
= matchExact ? strlen(string) + 1 : strlen(ref); 91 | return strncmp(string, ref, matchLen) == 0; 92 | } 93 | 94 | static int matchPort(const int port1, const int port2) { 95 | if (port1 == -1) return 1; 96 | if (port2 == -1) return 1; 97 | if (port1 == port2) return 1; 98 | return 0; 99 | } 100 | 101 | 102 | int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact) { 103 | // Make an exception for the case where no user list is defined 104 | if (listSize == 0) return 1; 105 | 106 | for (int i=0; i