├── .ci
    ├── Jenkinsfile
    ├── README.md
    ├── build_cli.sh
    ├── build_nccl_rdma_sharp_plugins.sh
    ├── ci_functions.sh
    ├── config-header-check.yml
    ├── configure_sharp.sh
    ├── ibdev2netdev
    ├── nccl_tests
    ├── publish_artefacts.sh
    ├── pushd_functions.sh
    ├── run_nccl_tests.sh
    ├── settings.sh
    ├── sharp_coll_test_wrapper
    └── taskset
├── .clang-format
├── .github
    └── workflows
    │   └── nccl-sharp-plugin.yml
├── .gitignore
├── LICENSE
├── Makefile.am
├── README.md
├── autogen.sh
├── configure.ac
├── contrib
    └── buildrpm.sh
├── debian
    ├── changelog.in
    ├── compat
    ├── control.in
    ├── copyright
    ├── nccl-rdma-sharp-plugins.postinst.in
    ├── nccl-rdma-sharp-plugins.prem.in
    ├── rules.in
    └── source
    │   └── format
├── include
    ├── core.h
    ├── debug.h
    ├── ibvwrap.h
    ├── nccl.h
    ├── net.h
    ├── net_device.h
    ├── net_v10.h
    ├── net_v5.h
    ├── net_v6.h
    ├── net_v7.h
    ├── net_v8.h
    ├── net_v9.h
    ├── p2p_plugin.h
    ├── param.h
    ├── socket.h
    ├── timer.h
    ├── ucx_uct_lib.h
    ├── ucx_uct_ring.h
    └── utils.h
├── m4
    ├── sharp.m4
    └── ucx.m4
├── nccl-rdma-sharp-plugins.pc.in
├── nccl-rdma-sharp-plugins.spec.in
└── src
    ├── Makefile.am
    ├── ib_plugin.c
    ├── ibvwrap.c
    ├── p2p_plugin.c
    ├── param.c
    ├── sharp_plugin.c
    ├── socket.c
    ├── ucx_plugin.c
    ├── ucx_rma_plugin.c
    ├── ucx_uct_lib.c
    ├── ucx_uct_plugin.c
    ├── ucx_uct_rd_plugin.c
    └── utils.c


/.ci/Jenkinsfile:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env groovy
 2 | 
 3 | // Verified with Jenkins v2.190.2
 4 | 
 5 | // TODO:
 6 | // 1. Calculate taskset/affinity for the scripts based on total number of jenkins executors
 7 | // 2. NCCL/CUDA/SHARP dependencies should be parameterized
 8 | // 3. HPC-X OS/MOFED support matrix should be covered (e.g. docker-based)
 9 | // 4. Add signal handlers in the scripts (e.g. to correctly handle Jenkins abort by timeout situations)
10 | 
11 | pipeline {
12 |     agent {label "ml-test-node-gpu"}
13 | 
14 |     options {
15 |         buildDiscarder(logRotator(numToKeepStr: '10'))
16 |         timeout(time: 90, unit: 'MINUTES')
17 |         disableConcurrentBuilds()
18 |     }
19 | 
20 |     environment {
21 |         NFS_WORKSPACE               = "${NFS_WORKSPACE_ROOT}/ml-nccl-rdma-sharp-plugins-pr/${BUILD_NUMBER}"
22 |         ARTEFACT_DIR                = "${NFS_WORKSPACE}/artefacts"
23 |         NCCL_RDMA_SHARP_PLUGINS_DIR = "${NFS_WORKSPACE}/nccl-rdma-sharp-plugins"
24 |         NCCL_TESTS_DIR              = "${NFS_WORKSPACE}/nccl-tests"
25 |     }
26 | 
27 |     stages {
28 |         stage('Preparations') {
29 |             steps {
30 |                 echo 'Preparations...'
31 |                 sh 'mkdir -p ${ARTEFACT_DIR}'
32 |                 sh 'mkdir -p ${NFS_WORKSPACE}'
33 |             }
34 |         }
35 |         stage('Build nccl-rdma-sharp-plugins') {
36 |             steps {
37 |                 echo 'Building nccl-rdma-sharp-plugins...'
38 |                 sh """#!/bin/bash
39 |                     set -o pipefail
40 |                     ${WORKSPACE}/.ci/build_nccl_rdma_sharp_plugins.sh 2>&1 | tee ${ARTEFACT_DIR}/build_nccl_rdma_sharp_plugins.log
41 |                     """
42 |             }
43 |         }
44 |         stage('Configure SHARP: startup') {
45 |             steps {
46 |                 echo 'Configure SHARP: startup...'
47 |                 sh """#!/bin/bash
48 |                     set -o pipefail
49 |                     ${WORKSPACE}/.ci/configure_sharp.sh 2>&1 | tee ${ARTEFACT_DIR}/configure_sharp_startup.log
50 |                     """
51 |             }
52 |         }
53 |         stage('Checkout NCCL tests') {
54 |             steps {
55 |                 dir("${NCCL_TESTS_DIR}") {
56 |                     git branch: 'master',
57 |                     url: 'https://github.com/NVIDIA/nccl-tests.git'
58 |                 }
59 |             }
60 |         }
61 |         stage('Test nccl-rdma-sharp-plugins') {
62 |             steps {
63 |                 echo 'Testing nccl-rdma-sharp-plugins...'
64 |                 sh """#!/bin/bash
65 |                     set -o pipefail
66 |                     ${WORKSPACE}/.ci/run_nccl_tests.sh 2>&1 | tee ${ARTEFACT_DIR}/run_nccl_test.log
67 |                     """
68 |             }
69 |         }
70 |         stage('Configure SHARP: stop') { // runs configure_sharp.sh with the "stop" action
71 |             steps {
72 |                 echo 'Configure SHARP: stop...'
73 |                 sh """#!/bin/bash
74 |                     set -o pipefail
75 |                     ${WORKSPACE}/.ci/configure_sharp.sh stop 2>&1 | tee ${ARTEFACT_DIR}/configure_sharp_stop.log
76 |                     """ // pipefail keeps tee from masking the script's exit status
77 |             } // NOTE(review): as a regular stage this is skipped when an earlier stage fails, which would leave SHARP running - consider post { always { ... } }
78 |         }
79 |     }
80 |     // Not needed, as there are no external contributors
81 |     // post {
82 |     //     always {
83 |     //         echo 'Post-actions...'
84 |     //         sh '${WORKSPACE}/.ci/publish_artefacts.sh'
85 |     //     }
86 |     // }
87 | }
88 | 


--------------------------------------------------------------------------------
/.ci/README.md:
--------------------------------------------------------------------------------
 1 | # nccl-rdma-sharp-plugins Continuous Integration (CI)
 2 | ## Overview
 3 | nccl-rdma-sharp-plugins CI performs sanity checks on every code change. CI is started for each Pull Request (PR) and can additionally be triggered by writing the **bot:mlx:test** (or **bot:mlx:retest**) keyword in the PR comments. For users on the project whitelist, CI starts automatically; for everyone else, a project maintainer must approve the CI run by replying with the '**ok to test**' keyword.<br>
 4 | CI status and artefacts (log files) are published within the PR comments.
 5 | ## Description
 6 | CI includes the following steps:
 7 | * Build nccl-rdma-sharp-plugins
 8 | * Test nccl-rdma-sharp-plugins with [NCCL tests](https://github.com/nvidia/nccl-tests). 
 9 | The tests are run with [NVIDIA's NCCL](https://github.com/NVIDIA/nccl) library built within CI from the internal repository.
10 | ### Test Environment
11 | CI is run in the Mellanox lab on a 2-node cluster with the following parameters:
12 | 
13 | Hardware
14 | * IB: 1x ConnectX-6 HCA (connected to Mellanox Quantum™ HDR switch)
15 | * GPU: 1x Nvidia Tesla K40m
16 | 
17 | Software
18 | * Ubuntu 18.04.4
19 | * Internal stable MLNX_OFED, HPC-X and SHARP versions


--------------------------------------------------------------------------------
/.ci/build_cli.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eE
 2 | . ./pushd_functions.sh
 3 | . ./ci_functions.sh
 4 | pushd /GIT
 5 | case $1 in
 6 |     build)
 7 |         configure
 8 |         echo "Building NCCL sharp plugin"
 9 |         build
10 |         ;;
11 |     sharp)
12 |         echo "Checking and configure sharp"
13 |         sharp
14 |         ;;
15 |     test)
16 |         echo "Running tests for NCCL sharp plugin"
17 |         test
18 |         ;;
19 |     *)
20 |         echo "Do nothing"
21 |         ;;
22 | esac
23 | 


--------------------------------------------------------------------------------
/.ci/build_nccl_rdma_sharp_plugins.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -leE
 2 | 
 3 | SCRIPT_DIR="$(
 4 |     cd "$(dirname "$0")"
 5 |     pwd -P
 6 | )"
 7 | cd "${SCRIPT_DIR}"
 8 | # shellcheck source=settings.sh
 9 | . "${SCRIPT_DIR}/settings.sh"
10 | 
11 | cd "${WORKSPACE}"
12 | 
13 | if ! "${WORKSPACE}/autogen.sh"; then
14 |     echo "ERROR: ${WORKSPACE}/autogen.sh failed"
15 |     echo "FAIL"
16 |     exit 1
17 | fi
18 | 
19 | if ! "${WORKSPACE}/configure" \
20 |     --prefix="${NCCL_RDMA_SHARP_PLUGINS_DIR}" \
21 |     --with-cuda="${CUDA_HOME}" \
22 |     --with-sharp="${HPCX_SHARP_DIR}"; then
23 |     echo "ERROR: ${WORKSPACE}/configure failed"
24 |     echo "FAIL"
25 |     exit 1
26 | fi
27 | 
28 | if ! make -j install; then
29 |     echo "ERROR: 'make -j install' failed"
30 |     echo "FAIL"
31 |     exit 1
32 | fi
33 | 
34 | if [ "$DEBUG" = "true" ]; then
35 |     echo "INFO: ${NCCL_RDMA_SHARP_PLUGINS_DIR}:"
36 |     # For debug purposes
37 |     find "${NCCL_RDMA_SHARP_PLUGINS_DIR}" -type f
38 | fi
39 | 
40 | echo "PASS"
41 | 


--------------------------------------------------------------------------------
/.ci/ci_functions.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eE
 2 | # Preparation a workplace & configs to CI
 3 | function configure() {
 4 |     rm -rf "${NFS_WORKSPACE}-pr" || true
 5 |     rm -rf "${NFS_WORKSPACE}" || true
 6 |     rm -rf ./nccl-rdma-sharp-plugins/.ci/cfg/* || true
 7 |     cd "${NFS_WORKSPACE_ROOT}" || exit 1
 8 |     mkdir -p ./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf 
 9 |     mkdir -p ./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf
10 | 
11 |     printf "%s\n%s\n" "${HOST1}" "${HOST2}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/hostfile
12 |     printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharp_am.cfg
13 |     printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharpd.cfg
14 |     printf "%s\n" "${SHARP_AM_HOST}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharp_am_node.txt
15 | 
16 |     printf "%s\n%s\n" "${HOST1}" "${HOST2}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/hostfile
17 |     printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharp_am.cfg
18 |     printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharpd.cfg
19 |     printf "%s\n" "${SHARP_AM_HOST}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharp_am_node.txt
20 | }
21 | 
22 | # Run the plugin build script; the success banner is printed only if the script exits 0, otherwise its status is returned
23 | function build() {
24 |     echo "Running build_nccl_rdma_sharp_plugins.sh..."
25 |     "${WORKSPACE}"/.ci/build_nccl_rdma_sharp_plugins.sh && echo "Build SUCCESFULL !!!"
26 | }
27 | 
28 | # Run configure_sharp.sh with no arguments; the success banner is printed only if the script exits 0, otherwise its status is returned
29 | function sharp() {
30 |     echo "Running configure_sharp.sh..."
31 |     "${WORKSPACE}"/.ci/configure_sharp.sh && echo "Step configure_sharp SUCCESFULL !!!"
32 | }
33 | 
34 | # Shallow-clone nccl-tests into ${NFS_WORKSPACE}/nccl-tests, then run run_nccl_tests.sh (note: this function name shadows bash's `test` builtin)
35 | function test() {
36 |     git clone --depth=1 https://github.com/NVIDIA/nccl-tests.git "${NFS_WORKSPACE}"/nccl-tests
37 |     echo "Running run_nccl_tests.sh..."
38 |     "${WORKSPACE}"/.ci/run_nccl_tests.sh && echo "Tests SUCCESFULL !!!"
39 | }
40 | 


--------------------------------------------------------------------------------
/.ci/config-header-check.yml:
--------------------------------------------------------------------------------
 1 | general:
 2 |   exclude:
 3 |     - "\\.git.*"
 4 |     - "\\.(yml|md|txt)"
 5 |     - "^\\.ci.*"
 6 |     - "\\.(m4|ac)"
 7 |     - "LICENSE"
 8 |     - "debian/copyright"
 9 |     - "debian/compat"
10 |     - "debian/source/format"
11 | 
12 | bsd:
13 |   validate-spdx-license: true
14 |   include:
15 |     - ".*\\.(am|in|hpp|cpp|py|cc|h|c|sh)$"
16 | 


--------------------------------------------------------------------------------
/.ci/configure_sharp.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash -l
  2 | SCRIPT_DIR="$(
  3 |     cd "$(dirname "$0")" || exit 1
  4 |     pwd -P
  5 | )"
  6 | cd "${SCRIPT_DIR}" || exit 1
  7 | # shellcheck source=settings.sh
  8 | . "${SCRIPT_DIR}/settings.sh"
  9 | 
 10 | if [ -z "${NCCL_RDMA_SHARP_PLUGINS_DIR}" ]; then
 11 |     echo "ERROR: NCCL_RDMA_SHARP_PLUGINS_DIR is not defined"
 12 |     echo "FAIL"
 13 |     exit 1
 14 | fi
 15 | 
 16 | export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${NCCL_RDMA_SHARP_PLUGINS_DIR}/lib:${LD_LIBRARY_PATH}"
 17 | 
 18 | # 1 - run sanity tests, 0 - do not run
 19 | VERIFY_SHARP_ENABLE=${VERIFY_SHARP_ENABLE:-1}
 20 | 
 21 | if [ -z "${NCCL_DIR}" ]; then
 22 |     module load dev/nccl-nightly-stable
 23 | else
 24 |     export LD_LIBRARY_PATH="${NCCL_DIR}/lib:${LD_LIBRARY_PATH}"
 25 | fi
 26 | 
 27 | # Available values: start|stop|restart
 28 | SHARP_MANAGER_ACTION="${1:-restart}"
 29 | echo "INFO: SHARP_MANAGER_ACTION = ${SHARP_MANAGER_ACTION}"
 30 | 
 31 | echo "INFO: NFS_WORKSPACE = ${NFS_WORKSPACE}"
 32 | 
 33 | if [ -z "${NFS_WORKSPACE}" ]; then
 34 |     echo "ERROR: NFS_WORKSPACE is not defined"
 35 |     echo "FAIL"
 36 |     exit 1
 37 | fi
 38 | 
 39 | if [ -z "${HPCX_SHARP_DIR}" ]; then
 40 |     echo "ERROR: HPCX_SHARP_DIR is not defined"
 41 |     echo "FAIL"
 42 |     exit 1
 43 | fi
 44 | # NOTE(review): the value validated above is unconditionally replaced by the hard-coded path below - confirm this is intended
 45 | HPCX_SHARP_DIR=/opt/mellanox/sharp
 46 | CONFIGURE_SHARP_TMP_DIR="${NFS_WORKSPACE}/configure_sharp_$$"
 47 | mkdir -p "${CONFIGURE_SHARP_TMP_DIR}"
 48 | chmod o+w "${CONFIGURE_SHARP_TMP_DIR}"
 49 | 
 50 | export SHARP_CONF="${CONFIGURE_SHARP_TMP_DIR}"
 51 | export SHARP_INI_FILE="${SHARP_CONF}/sharp_manager.ini"
 52 | 
 53 | cp -R "${CFG_DIR}/$HOSTNAME/sharp_conf/"* "${SHARP_CONF}"
 54 | 
 55 | if [ -f "${SHARP_CONF}/sharp_am_node.txt" ]; then
 56 |     SHARP_AM_NODE=$(cat ${SHARP_CONF}/sharp_am_node.txt)
 57 |     echo "INFO: SHARP_AM_NODE = ${SHARP_AM_NODE}"
 58 | else
 59 |     echo "ERROR: ${SHARP_CONF}/sharp_am_node.txt does not exist or not accessible"
 60 |     echo "FAIL"
 61 |     exit 1
 62 | fi
 63 | 
 64 | IB_DEV=$(/GIT/ibdev2netdev | awk '{ print $1 }'):1
 65 | SM_GUID=$(sudo sminfo -C ${IB_DEV} -P1 | awk '{print $7}' | cut -d',' -f1)
 66 | # SM/AM node
 67 | # SM_HOSTNAME=`sudo ibnetdiscover -H -C mlx5_0 -P1  | grep ${SM_GUID} | awk -F'"' '{print $2 }' | awk '{print $1}'`
 68 | HOSTS=$(cat $HOSTFILE | xargs | tr ' ' ',')
 69 | 
 70 | echo "INFO: IB_DEV = ${IB_DEV}"
 71 | echo "INFO: SM_GUID = ${SM_GUID}"
 72 | # echo "INFO: SM_HOSTNAME = ${SM_HOSTNAME}"
 73 | echo "INFO: HOSTS = ${HOSTS}"
 74 | 
 75 | rm -f ${SHARP_INI_FILE}
 76 | 
 77 | cat >${SHARP_INI_FILE} <<EOF
 78 | sharp_AM_server="${SHARP_AM_NODE}"
 79 | sharp_am_log_verbosity="3"
 80 | sharp_hostlist="$HOSTS"
 81 | sharp_manager_general_conf="${SHARP_CONF}"
 82 | sharpd_log_verbosity="3"
 83 | EOF
 84 | 
 85 | echo "INFO: SHARP_INI_FILE ${SHARP_INI_FILE} BEGIN"
 86 | cat ${SHARP_INI_FILE}
 87 | echo "INFO: SHARP_INI_FILE ${SHARP_INI_FILE} END"
 88 | 
 89 | trim_multiple_spaces() {
 90 |     echo "$1" | sed -s -E 's| +| |g' # squeeze each run of spaces in $1 into a single space
 91 | }
 92 | 
 93 | # Run "systemctl status opensmd" on the SHARP AM node over ssh; print ERROR/FAIL and exit 1 if it does not succeed
 94 | check_opensm_status() {
 95 |     local banner="Checking OpenSM status on ${SHARP_AM_NODE}..."
 96 |     echo "${banner}"
 97 | 
 98 |     if ! ssh "${SHARP_AM_NODE}" "systemctl status opensmd"; then
 99 |         echo "ERROR: opensmd is not run on ${SHARP_AM_NODE}"
100 |         echo "FAIL"
101 |         exit 1
102 |     fi
103 |     echo "${banner} DONE"
104 | }
105 | 
106 | # Grep opensm.conf on the SHARP AM node (over ssh) for lines matching
107 | # "routing_engine.*updn" and "sharp_enabled.*2"; print ERROR/FAIL and exit 1 on the first miss
108 | check_opensm_conf() {
109 |     echo "INFO: check_opensm_conf on ${SHARP_AM_NODE}..."
110 | 
111 |     OPENSM_CONFIG="/etc/opensm/opensm.conf"
112 |     echo "INFO: opensm config = ${OPENSM_CONFIG}"
113 | 
114 |     # Each entry is "<parameter> <expected value>"
115 |     local expected param value
116 |     for expected in "routing_engine updn" "sharp_enabled 2"; do
117 |         param="${expected% *}"
118 |         value="${expected#* }"
119 |         if ! ssh "${SHARP_AM_NODE}" "grep \"${param}.*${value}\" ${OPENSM_CONFIG} 2>/dev/null"; then
120 |             echo "ERROR: wrong value of ${param} parameter in ${OPENSM_CONFIG}"
121 |             echo "Should be (example): ${param} ${value}"
122 |             echo "FAIL"
123 |             exit 1
124 |         fi
125 |     done
126 | 
127 |     echo "INFO: check_opensm_conf on ${SHARP_AM_NODE}... DONE"
128 | }
130 | 
131 | # Run a single verification command: log it, execute it, abort the script on failure.
132 | #   $1 - test number shown in the log banners
133 | #   $2 - text printed after "ERROR: " when the command exits non-zero
134 | #   $3 - complete command line as one string (it is word-split on execution)
135 | _verify_sharp_run_test() {
136 |     local test_no="$1" fail_text="$2" cmd_line="$3"
137 | 
138 |     echo "${GH_FOLD}# Test ${test_no}..."
139 |     echo "INFO: Test ${test_no} command line:"
140 |     trim_multiple_spaces "${cmd_line}"
141 |     # Deliberately unquoted: the string has to be split into the command and its arguments
142 |     if ! ${cmd_line}; then
143 |         echo "ERROR: ${fail_text}"
144 |         echo "FAIL"
145 |         exit 1
146 |     fi
147 |     echo "${GH_UNFOLD}"
148 |     echo "Test ${test_no}... DONE"
149 | }
150 | 
151 | # Build the SHARP example collectives test in CONFIGURE_SHARP_TMP_DIR and run eight MPI
152 | # sanity tests. Any failing step prints ERROR/FAIL and terminates the script with status 1.
153 | # Reads HPCX_SHARP_DIR, CONFIGURE_SHARP_TMP_DIR, CUDA_HOME, WORKSPACE, HOSTFILE, HOSTS,
154 | # IB_DEV, OMPI_HOME and HPCX_OSU_DIR; leaves the shell inside CONFIGURE_SHARP_TMP_DIR.
155 | verify_sharp() {
156 |     echo "INFO: verify_sharp..."
157 | 
158 |     cp "${HPCX_SHARP_DIR}"/share/sharp/examples/mpi/coll/* "${CONFIGURE_SHARP_TMP_DIR}"
159 |     # Stop here if the scratch directory is unusable instead of running make in the wrong place
160 |     if ! cd "${CONFIGURE_SHARP_TMP_DIR}"; then
161 |         echo "ERROR: verify_sharp cannot enter ${CONFIGURE_SHARP_TMP_DIR}"
162 |         echo "FAIL"
163 |         exit 1
164 |     fi
165 |     if ! make CUDA=1 CUDA_HOME="${CUDA_HOME}" SHARP_HOME="${HPCX_SHARP_DIR}"; then
166 |         echo "ERROR: verify_sharp make failed"
167 |         echo "FAIL"
168 |         exit 1
169 |     fi
170 | 
171 |     cp "${WORKSPACE}/.ci/sharp_coll_test_wrapper" ./
172 |     ITERS=100
173 |     SKIP=20
174 |     # One MPI rank per line of the hostfile
175 |     NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
176 | 
177 |     # -mca coll_hcoll_enable 0 - disable HCOLL
178 |     MPIRUN_COMMON_OPTIONS="\
179 |         -np $NP \
180 |         -H $HOSTS \
181 |         --map-by node \
182 |         -x LD_LIBRARY_PATH \
183 |         --allow-run-as-root \
184 |         -mca oob_tcp_if_exclude eth0 \
185 |     "
186 | 
187 |     # TODO change to SHARP_COLL_SAT_THRESHOLD=1 (32 - W/A for SHARP issue)
188 |     MPIRUN_SHARP_OPTIONS="\
189 |         -x SHARP_COLL_LOG_LEVEL=3 \
190 |         -x ENABLE_SHARP_COLL=1 \
191 |         -x SHARP_COLL_SAT_THRESHOLD=32 \
192 |         -x SHARP_COLL_ENABLE_SAT=1 \
193 |     "
194 | 
195 |     echo "Environment for the reproducer:"
196 |     echo "export PATH=$PATH"
197 |     echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
198 |     echo "export OPAL_PREFIX=${OPAL_PREFIX}"
199 | 
200 |     # Tests 1-6 share the launcher and the perf-mode sharp_coll_test_wrapper invocation
201 |     local coll_test="mpirun \
202 |         ${MPIRUN_COMMON_OPTIONS} \
203 |         ${MPIRUN_SHARP_OPTIONS} \
204 |         ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \
205 |         --iters $ITERS \
206 |         --skip $SKIP \
207 |         --mode perf"
208 | 
209 |     # Tests 1-3 come from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README
210 |     # Test 1: allreduce and barrier perf test
211 |     _verify_sharp_run_test 1 "verify_sharp Test 1 failed" "${coll_test} --collectives allreduce,barrier"
212 |     # Test 2: allreduce perf test with CUDA buffers
213 |     _verify_sharp_run_test 2 "verify_sharp Test 2 failed" "${coll_test} --collectives allreduce -M cuda"
214 |     # Test 3: allreduce perf test with streaming aggregation, message sizes 4B to 512MB
215 |     _verify_sharp_run_test 3 "verify_sharp Test 3 failed" "${coll_test} --collectives allreduce -s 4:536870912"
216 |     # Test 4: iallreduce perf test
217 |     _verify_sharp_run_test 4 "verify_sharp Test 4 failed" "${coll_test} --collectives iallreduce -N 128"
218 |     # Test 5: iallreduce perf test with CUDA buffers
219 |     _verify_sharp_run_test 5 "verify_sharp Test 5 failed" "${coll_test} --collectives iallreduce -N 128 -M cuda"
220 |     # Test 6: iallreduce perf test, message sizes 4B to 128KB (131072 bytes)
221 |     _verify_sharp_run_test 6 "verify_sharp Test 6 failed" "${coll_test} --collectives iallreduce -N 128 -s 4:131072"
222 | 
223 |     # Tests 7 and 8 (from the SHARP deployment guide) run osu_allreduce through HCOLL;
224 |     # the three fragments below are common to both and keep the original option order.
225 |     local osu_mca="\
226 |         -mca btl_openib_warn_default_gid_prefix 0 \
227 |         -mca rmaps_dist_device ${IB_DEV} \
228 |         -mca rmaps_base_mapping_policy dist:span"
229 |     local osu_env="\
230 |         -x MXM_LOG_LEVEL=ERROR \
231 |         -x HCOLL_ML_DISABLE_REDUCE=1 \
232 |         -x LD_LIBRARY_PATH \
233 |         -x HCOLL_ENABLE_SHARP=2 \
234 |         -x SHARP_COLL_LOG_LEVEL=3 \
235 |         -x SHARP_COLL_GROUP_RESOURCE_POLICY=1 \
236 |         -x SHARP_COLL_MAX_PAYLOAD_SIZE=256 \
237 |         -x HCOLL_SHARP_UPROGRESS_NUM_POLLS=999 \
238 |         -x HCOLL_BCOL_P2P_ALLREDUCE_SHARP_MAX=4096 \
239 |         -x SHARP_COLL_PIPELINE_DEPTH=32 \
240 |         -x SHARP_COLL_JOB_QUOTA_OSTS=32 \
241 |         -x SHARP_COLL_JOB_QUOTA_MAX_GROUPS=4 \
242 |         -x SHARP_COLL_JOB_QUOTA_PAYLOAD_PER_OST=256"
243 |     local osu_app="\
244 |         ${WORKSPACE}/.ci/taskset -c 1 \
245 |         numactl --membind=0 \
246 |         $HPCX_OSU_DIR/osu_allreduce \
247 |         -i 100 \
248 |         -x 100 \
249 |         -f \
250 |         -m 4096:4096"
251 |     local osu_mpirun="$OMPI_HOME/bin/mpirun ${MPIRUN_COMMON_OPTIONS}"
252 | 
253 |     # Test 7: without SAT
254 |     _verify_sharp_run_test 7 "Test 7 (without SAT) failed, check the log file" \
255 |         "${osu_mpirun} --bind-to core ${osu_mca} ${osu_env} ${osu_app}"
256 |     # Test 8: with SAT
257 |     _verify_sharp_run_test 8 "Test 8 (with SAT) failed, check the log file" \
258 |         "${osu_mpirun} ${osu_mca} -x MXM_ASYNC_INTERVAL=1800s ${osu_env} -x SHARP_COLL_ENABLE_SAT=1 ${osu_app}"
259 | 
260 |     echo "INFO: verify_sharp... DONE"
261 | }
391 | 
392 | # OpenSM is only checked when SHARP is about to be started or restarted
393 | if [ "${SHARP_MANAGER_ACTION}" != "stop" ]; then
394 |     check_opensm_status
395 |     check_opensm_conf
396 | fi
397 | # Every expansion is quoted so paths with spaces or glob characters reach sudo, chmod and rm intact
398 | if ! sudo PDSH_RCMD_TYPE=ssh SHARP_INI_FILE="${SHARP_INI_FILE}" SHARP_CONF="${SHARP_CONF}" "${HPCX_SHARP_DIR}/sbin/sharp_manager.sh" "${SHARP_MANAGER_ACTION}" -l "$HOSTS" -s "${SHARP_AM_NODE}"; then
399 |     echo "ERROR: sharp_manager.sh failed, check the log file"
400 |     echo "FAIL"
401 |     exit 1
402 | fi
403 | 
404 | if [ "${SHARP_MANAGER_ACTION}" != "stop" ] && [ "${VERIFY_SHARP_ENABLE}" -eq 1 ]; then
405 |     verify_sharp
406 | fi
407 | 
408 | sudo chmod -R 777 "${CONFIGURE_SHARP_TMP_DIR}"
409 | rm -rf "${CONFIGURE_SHARP_TMP_DIR}"
410 | 
411 | echo "PASS"
412 | 


--------------------------------------------------------------------------------
/.ci/ibdev2netdev:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # ibdev2netdev doesn't work correctly inside a container. This wrapper is a workaround:
3 | # it prints one line in the "mlx5_<N> port 1 ====> ib0" form for the last /dev/infiniband/umad<N> device.
4 | DEV_IB=""
5 | for umad_path in /dev/infiniband/umad*; do
6 |     # An unmatched glob stays literal, so only remember paths that really exist
7 |     if [ -e "${umad_path}" ]; then
8 |         DEV_IB="${umad_path}"
9 |     fi
10 | done
11 | if [ -z "${DEV_IB}" ]; then
12 |     echo "ERROR: no /dev/infiniband/umad* device found" >&2
13 |     exit 1
14 | fi
15 | # Strip everything up to "umad" so that multi-digit indices (umad10, ...) are kept whole
16 | N="${DEV_IB##*umad}"
17 | # The index is passed as an argument, never as part of the printf format string
18 | printf 'mlx5_%s port 1 ====> ib0\n' "${N}"
8 | 


--------------------------------------------------------------------------------
/.ci/nccl_tests:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eE
 2 | # Wrapper to add correct parameters to the main app
 3 | IB_DEV=$(ibdev2netdev | awk '{ print $1 }'):1
 4 | ETH_DEV=$(ibdev2netdev | awk '{ print $5 }')
 5 | 
 6 | export HCOLL_MAIN_IB=${IB_DEV}
 7 | export NCCL_IB_HCA=${IB_DEV}
 8 | export UCX_NET_DEVICES=${IB_DEV}
 9 | export NCCL_SOCKET_IFNAME=${ETH_DEV}
10 | exec "${@}"
11 | 


--------------------------------------------------------------------------------
/.ci/publish_artefacts.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -leE
 2 | # Lists ${ARTEFACT_DIR} and hands over to publish_artefacts_to_gist.py
 3 | 
 4 | SCRIPT_DIR="$(cd "$(dirname "$0")"; pwd -P)"
 5 | cd "${SCRIPT_DIR}"
 6 | # shellcheck source=settings.sh
 7 | source "${SCRIPT_DIR}/settings.sh"
 8 | 
 9 | echo 'Publish artefacts...'
10 | 
11 | # Each UPSTREAM_* variable falls back to the value of the current job when unset or empty
12 | : "${UPSTREAM_JOB_NAME:=${JOB_NAME}}"
13 | : "${UPSTREAM_BUILD_NUMBER:=${BUILD_NUMBER}}"
14 | : "${UPSTREAM_ghprbGhRepository:=${ghprbGhRepository}}"
15 | : "${UPSTREAM_ghprbPullId:=${ghprbPullId}}"
16 | export UPSTREAM_JOB_NAME UPSTREAM_BUILD_NUMBER UPSTREAM_ghprbGhRepository UPSTREAM_ghprbPullId
17 | 
18 | ls -al "${ARTEFACT_DIR}"
19 | 
20 | publish_artefacts_to_gist.py
21 | 
22 | echo 'Publish artefacts... DONE'
23 | 


--------------------------------------------------------------------------------
/.ci/pushd_functions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | pushd() {
3 |     builtin pushd "$@" 1>/dev/null # drop the directory-stack listing the builtin prints
4 | }
5 | 
6 | popd() {
7 |     builtin popd "$@" 1>/dev/null # drop the directory-stack listing the builtin prints
8 | }
9 | 


--------------------------------------------------------------------------------
/.ci/run_nccl_tests.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash -leE
  2 | SCRIPT_DIR="$(
  3 |     cd "$(dirname "$0")"
  4 |     pwd -P
  5 | )"
  6 | cd "${SCRIPT_DIR}"
  7 | # shellcheck source=settings.sh
  8 | . "${SCRIPT_DIR}"/settings.sh
  9 | 
 10 | GLOBAL_TEST_STATUS=0
 11 | 
 12 | if [ -z "${NCCL_DIR}" ]; then
 13 |     module load dev/nccl-nightly-stable
 14 | else
 15 |     export LD_LIBRARY_PATH="${NCCL_DIR}/lib:${LD_LIBRARY_PATH}"
 16 | fi
 17 | 
 18 | if [ -z "${NCCL_RDMA_SHARP_PLUGINS_DIR}" ]; then
 19 |     echo "ERROR: NCCL_RDMA_SHARP_PLUGINS_DIR is not defined"
 20 |     echo "FAIL"
 21 |     exit 1
 22 | fi
 23 | 
 24 | if [ -z "${NCCL_TESTS_DIR}" ]; then
 25 |     echo "ERROR: NCCL_TESTS_DIR is not defined"
 26 |     echo "FAIL"
 27 |     exit 1
 28 | fi
 29 | 
 30 | NP=2
 31 | IB_DEV=$(ibdev2netdev | awk '{ print $1 }'):1
 32 | # UCX_MEMTYPE_CACHE=n - to avoid warnings "memtype_cache.c:83   UCX  ERROR failed to insert region 0x1a1e890 [0x7f8d00000000..0x7f8d30000000]: Element already exists"
 33 | MPIRUN_OPTIONS_COMMON="\
 34 | -x LD_LIBRARY_PATH \
 35 | -x NCCL_DEBUG=INFO \
 36 | -x NCCL_DEBUG_SUBSYS=INIT \
 37 | -x UCX_MEMTYPE_CACHE=n \
 38 | -x HCOLL_ENABLE_SHARP=0 \
 39 | -x HCOLL_ENABLE_MCAST_ALL=0 \
 40 | -mca pml ucx \
 41 | -mca coll_hcoll_enable 1 \
 42 | --map-by node \
 43 | --bind-to none \
 44 | --hostfile ${HOSTFILE} \
 45 | -np $NP \
 46 | --report-bindings \
 47 | --allow-run-as-root \
 48 | -mca oob_tcp_if_exclude eth0 \
 49 | "
 50 | 
 51 | # Application options
 52 | ITER=100
 53 | WARMUP_ITER=100
 54 | MSG_SIZE_MIN="8"
 55 | MSG_SIZE_MAX="4M"
 56 | NCCL_TEST_EXE=("all_reduce_perf" "all_gather_perf" "broadcast_perf" "reduce_perf" "reduce_scatter_perf" "alltoall_perf")
 57 | NCCL_TEST_PARAMS=" -b ${MSG_SIZE_MIN} -e ${MSG_SIZE_MAX} -f 2 -g 1 -c 1 -z 1 -n $ITER -w $WARMUP_ITER -p 0 "
 58 | ENABLE_SAT=${ENABLE_SAT:-1}
 59 | echo "INFO: ENABLE_SAT = ${ENABLE_SAT}"
 60 | 
 61 | echo_hash_line() {
 62 |     printf '%s\n' "###############################################################################" # separator line for the log
 63 | }
 64 | 
 65 | echo "CUDA_HOME: ${CUDA_HOME}"
 66 | echo "NCCL_DIR: ${NCCL_DIR}"
 67 | echo "NCCL_RDMA_SHARP_PLUGINS_DIR: ${NCCL_RDMA_SHARP_PLUGINS_DIR}"
 68 | echo "MPI_HOME: ${MPI_HOME}"
 69 | 
 70 | # Build NCCL-TESTS
 71 | cd "${NCCL_TESTS_DIR}"
 72 | make -j clean
 73 | 
 74 | make -j CUDA_HOME="${CUDA_HOME}" NCCL_HOME="${NCCL_DIR}" MPI=1 MPI_HOME="${MPI_HOME}"
 75 | 
 76 | export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${NCCL_RDMA_SHARP_PLUGINS_DIR}/lib:${LD_LIBRARY_PATH}"
 77 | 
 78 | trim_multiple_spaces() {
 79 |     echo "$1" | sed -s -E 's| +| |g' # squeeze each run of spaces in $1 into a single space
 80 | }
 81 | 
 82 | # USAGE: all_reduce_perf
 83 | # [-t,--nthreads <num threads>]
 84 | # [-g,--ngpus <gpus per thread>]
 85 | # [-b,--minbytes <min size in bytes>]
 86 | # [-e,--maxbytes <max size in bytes>]
 87 | # [-i,--stepbytes <increment size>]
 88 | # [-f,--stepfactor <increment factor>]
 89 | # [-n,--iters <iteration count>]
 90 | # [-m,--agg_iters <aggregated iteration count>]
 91 | # [-w,--warmup_iters <warmup iteration count>]
 92 | # [-p,--parallel_init <0/1>]
 93 | # [-c,--check <0/1>]
 94 | # [-o,--op <sum/prod/min/max/all>]
 95 | # [-d,--datatype <nccltype/all>]
 96 | # [-r,--root <root>]
 97 | # [-z,--blocking <0/1>]
 98 | # [-h,--help]
 99 | 
100 | ###############################################################################
101 | # Run NCCL-TESTS (MPI)
102 | ###############################################################################
103 | 
104 | i=1
105 | 
106 | for TEST_EXE in ${NCCL_TEST_EXE[@]}; do
107 |     #===================
108 |     # NCCL_PLUGIN_P2P
109 |     #===================
110 |     # Enable ucx_rma tests once this is resolved: https://redmine.mellanox.com/issues/3037941
111 |     # for P2P_LAYER in ucx ucx_rma ib
112 |     for P2P_LAYER in ib ucx ucx_uct ucx_uct_read; do
113 |         MPIRUN_OPTIONS_PLUGIN_P2P_LAYER="-x NCCL_PLUGIN_P2P=${P2P_LAYER}"
114 | 
115 |         #===================
116 |         # NCCL_PROTO
117 |         #===================
118 |         for NCCL_PROTO in Simple LL DEFAULT; do
119 |             if [ "${NCCL_PROTO}" = "DEFAULT" ]; then
120 |                 MPIRUN_OPTIONS_NCCL_PROTO=""
121 |             else
122 |                 MPIRUN_OPTIONS_NCCL_PROTO="-x NCCL_PROTO=${NCCL_PROTO}"
123 |             fi
124 | 
125 |             #===================
126 |             # NCCL_ALGO
127 |             #===================
128 |             for NCCL_ALGO in CollNet Tree Ring DEFAULT; do
129 |                 if [ "${NCCL_ALGO}" = "CollNet" ] && [ "${TEST_EXE}" != "all_reduce_perf" ]; then
130 |                     # test sharp plugin only with all_reduce_perf
131 |                     continue
132 |                 fi
133 | 
134 |                 if [ "${NCCL_ALGO}" = "DEFAULT" ]; then
135 |                     MPIRUN_OPTIONS_NCCL_ALGO=""
136 |                 else
137 |                     MPIRUN_OPTIONS_NCCL_ALGO="-x NCCL_ALGO=${NCCL_ALGO}"
138 |                 fi
139 | 
140 |                 if [ "${NCCL_ALGO}" = "CollNet" ]; then
141 |                     MPIRUN_OPTIONS_NCCL_ALGO="-x NCCL_COLLNET_ENABLE=1"
142 |                 fi
143 | 
144 |                 #===================
145 |                 # SHARP_ENABLE
146 |                 #===================
147 |                 for SHARP_ENABLE in 0 1; do
148 |                     if { [ "${NCCL_ALGO}" = "Tree" ] || [ "${NCCL_ALGO}" = "Ring" ]; } && [ "$SHARP_ENABLE" = "1" ]; then
149 |                         # skip sharp enable 1 for tree and ring algorithms
150 |                         continue
151 |                     fi
152 |                     if [ "${SHARP_ENABLE}" = "0" ]; then
153 |                         MPIRUN_OPTIONS_SHARP=""
154 |                     else
155 |                         MPIRUN_OPTIONS_SHARP="\
156 |                             -x SHARP_COLL_LOG_LEVEL=3 \
157 |                             -x SHARP_COLL_ENABLE_SAT=${ENABLE_SAT} \
158 |                             "
159 |                     fi
160 | 
161 |                     #===================
162 |                     # NCCL_NET_GDR_LEVEL
163 |                     #===================
164 |                     # for NCCL_NET_GDR_LEVEL in 0 1 2 3 4 5 DEFAULT
165 |                     for NCCL_NET_GDR_LEVEL in DEFAULT; do
166 |                         if [ "${NCCL_NET_GDR_LEVEL}" = "DEFAULT" ]; then
167 |                             MPIRUN_OPTIONS_GDR_LEVEL=""
168 |                         else
169 |                             MPIRUN_OPTIONS_GDR_LEVEL="-x NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL}"
170 |                         fi
171 | 
172 |                         #===================
173 |                         # NCCL_NET_GDR_READ
174 |                         #===================
175 |                         # for NCCL_NET_GDR_READ in 0 1 DEFAULT
176 |                         for NCCL_NET_GDR_READ in DEFAULT; do
177 |                             if [ "${NCCL_NET_GDR_READ}" = "DEFAULT" ]; then
178 |                                 MPIRUN_OPTIONS_GDR_READ=""
179 |                             else
180 |                                 MPIRUN_OPTIONS_GDR_READ="-x NCCL_NET_GDR_READ=${NCCL_NET_GDR_READ}"
181 |                             fi
182 | 
183 |                             echo_hash_line
184 |                             echo "${GH_FOLD}{# Test $i...}"
185 |                             echo_hash_line
186 | 
187 |                             echo "INFO: TEST                = ${TEST_EXE}"
188 |                             echo "INFO: P2P_LAYER           = ${P2P_LAYER}"
189 |                             echo "INFO: NCCL_PROTO          = ${NCCL_PROTO}"
190 |                             echo "INFO: NCCL_ALGO           = ${NCCL_ALGO}"
191 |                             echo "INFO: SHARP_ENABLE        = ${SHARP_ENABLE}"
192 |                             echo "INFO: NCCL_NET_GDR_LEVEL  = ${NCCL_NET_GDR_LEVEL}"
193 |                             echo "INFO: NCCL_NET_GDR_READ   = ${NCCL_NET_GDR_READ}"
194 | 
195 |                             CMD="mpirun \
196 |                                 ${MPIRUN_OPTIONS_COMMON} \
197 |                                 ${MPIRUN_OPTIONS_NCCL_PROTO} \
198 |                                 ${MPIRUN_OPTIONS_NCCL_ALGO} \
199 |                                 ${MPIRUN_OPTIONS_SHARP} \
200 |                                 ${MPIRUN_OPTIONS_GDR_LEVEL} \
201 |                                 ${MPIRUN_OPTIONS_GDR_READ} \
202 |                                 ${MPIRUN_OPTIONS_PLUGIN_P2P_LAYER} \
203 |                                 ${WORKSPACE}/.ci/nccl_tests ${NCCL_TESTS_DIR}/build/${TEST_EXE} ${NCCL_TEST_PARAMS}"
204 |                             echo "# Test $i reproducer:"
205 |                             echo "export PATH=${PATH}"
206 |                             echo ""
207 |                             echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
208 |                             echo ""
209 |                             echo "export OPAL_PREFIX=${OPAL_PREFIX}"
210 |                             echo ""
211 |                             trim_multiple_spaces "$CMD"
212 |                             if ! $CMD; then
213 |                                 echo "${GH_UNFOLD}"
214 |                                 echo "# Test $i... failed"
215 |                                 GLOBAL_TEST_STATUS=1
216 |                             else
217 |                                 echo "${GH_UNFOLD}"
218 |                                 echo "# Test $i... passed"
219 |                             fi
220 | 
221 |                             i=$((i + 1))
222 |                         done
223 |                     done
224 |                 done
225 |             done
226 |         done
227 |     done
228 | done
229 | 
230 | ###############################################################################
231 | if [ "${GLOBAL_TEST_STATUS:-0}" -ne 0 ]; then  # unset/empty is treated as 0 (no failure recorded)
232 |     echo "ERROR: some tests failed, check the log file"
233 |     echo "FAIL"
234 |     exit 1
235 | else
236 |     echo "All tests PASSED"
237 | fi
238 | 
239 | echo "PASS"
240 | 


--------------------------------------------------------------------------------
/.ci/settings.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -leE
 2 | # Formatting for GitHub Actions fold/unfold markers (enabled with GH=1)
 3 | GH=${GH:-0}
 4 | if [ "${GH}" -eq 1 ]; then
 5 |     GH_FOLD="::group::"
 6 |     GH_UNFOLD="::endgroup::"
 7 | fi
 8 | # Shell tracing: set DEBUG=true in the environment to turn on "set -x"
 9 | DEBUG=${DEBUG:-false}
10 | echo "INFO: DEBUG = $DEBUG"
11 | if [ "$DEBUG" = "true" ]; then
12 |     set -x
13 | fi
14 | 
15 | # W/A for SHARP
16 | # CUDA 10.2 is the latest available version we would like to test, CUDA 10.1 is needed for SHARP
17 | # (because HPC-X is built with CUDA 10.1).
18 | # CUDA 10.2 has priority in the env PATH/LD_LIBRARY_PATH.
19 | 
20 | # TODO remove use HPC-X which is already inside the image
21 | 
22 | #module load /hpc/local/etc/modulefiles/dev/cuda-latest
23 | HPCX_UBUNTU_INSTALL_DIR=${HPCX_UBUNTU_INSTALL_DIR:-/hpc/noarch/HPCX/unpacked/hpcx-v2.17-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64/}
24 | module load "${HPCX_UBUNTU_INSTALL_DIR}"/modulefiles/hpcx-ompi
25 | # . "${HPCX_UBUNTU_INSTALL_DIR}/hpcx-init.sh"
26 | # hpcx_load
27 | 
28 | # It is needed to disable nccl_rdma_sharp_plugin libs from HPC-X
29 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH//nccl_rdma_sharp_plugin/nccl_rdma_sharp_pluginX}"
30 | export LD_LIBRARY_PATH
31 | CUDA_HOME=/usr/local/cuda
32 | #export UCX_NET_DEVICES=$(ibdev2netdev | awk '{print $1}'):1
33 | TOP_DIR="$(git rev-parse --show-toplevel)"
34 | echo "INFO: TOP_DIR = ${TOP_DIR}"
35 | # WORKSPACE must be defaulted before any path below is derived from it
36 | WORKSPACE="${WORKSPACE:-${TOP_DIR}}"
37 | export NCCL_RDMA_SHARP_PLUGINS_DIR="${NCCL_RDMA_SHARP_PLUGINS_DIR:-${WORKSPACE}/_install}"
38 | echo "INFO: NCCL_RDMA_SHARP_PLUGINS_DIR = ${NCCL_RDMA_SHARP_PLUGINS_DIR}"
39 | 
40 | echo "INFO: CUDA_VER = ${CUDA_VER}"
41 | echo "INFO: CUDA_HOME = ${CUDA_HOME}"
42 | echo "INFO: HPCX_SHARP_DIR = ${HPCX_SHARP_DIR}"
43 | echo "INFO: HPCX_DIR = ${HPCX_DIR}"
44 | echo "INFO: WORKSPACE = ${WORKSPACE}"
45 | 
46 | HOSTNAME=$(hostname -s)
47 | echo "INFO: HOSTNAME = $HOSTNAME"
48 | 
49 | CFG_DIR="${WORKSPACE}/.ci/cfg"
50 | HOSTFILE=${CFG_DIR}/$HOSTNAME/hostfile
51 | 
52 | if [ ! -f "${HOSTFILE}" ]; then
53 |     echo "ERROR: ${HOSTFILE} doesn't exist or not accessible"
54 |     echo "FAIL"
55 |     exit 1
56 | fi
57 | 
58 | if [ ! -d "${HPCX_DIR}" ]; then
59 |     echo "ERROR: ${HPCX_DIR} does not exist or not accessible"
60 |     echo "FAIL"
61 |     exit 1
62 | fi
63 | 


--------------------------------------------------------------------------------
/.ci/sharp_coll_test_wrapper:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Wrapper to add correct parameter to the main scripts without refactoring: passes "<ibdev2netdev device>:1" via -d
3 | TEST_DEV="$(ibdev2netdev | awk '{ print $1 }'):1"
4 | echo "${UCX_NET_DEVICES}"
5 | ./sharp_coll_test -d "${TEST_DEV}" "$@"
6 | 


--------------------------------------------------------------------------------
/.ci/taskset:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Export the "<device>:1" string built from ibdev2netdev for MXM and HCOLL, then run the real taskset
3 | TEST_DEV="$(ibdev2netdev | awk '{ print $1 }'):1"
4 | MXM_RDMA_PORTS="${TEST_DEV}"; HCOLL_MAIN_IB="${TEST_DEV}"
5 | export MXM_RDMA_PORTS HCOLL_MAIN_IB
6 | /usr/bin/taskset "$@"
7 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | IndentWidth: 2
 2 | AlignEscapedNewlines: Indent
 3 | AlignConsecutiveAssignments: true
 4 | AlignConsecutiveDeclarations: false
 5 | AlignConsecutiveStructMembers: true
 6 | AlignConsecutiveMacros: true
 7 | AlignDeclarationByPointer: true
 8 | AlignAfterOpenBracket: true
 9 | AlignOperands: true
10 | 


--------------------------------------------------------------------------------
/.github/workflows/nccl-sharp-plugin.yml:
--------------------------------------------------------------------------------
 1 | name: NCCL Sharp plugin CI
 2 | on:
 3 |   workflow_dispatch:   # manual trigger
 4 |     inputs:
 5 |       mainhost:  # NOTE(review): no job in this file references this input - confirm it is consumed elsewhere
 6 |         description: 'Choose one of hosts to run:'
 7 |         required: true
 8 |         type: choice
 9 |         default: 'host01'
10 |         options:
11 |           - host01
12 |           - host02
13 |   push:
14 |     branches: ['*']  # NOTE(review): in GitHub filter patterns '*' does not match '/', so names like feature/x are skipped - confirm intended
15 |   pull_request:
16 |     branches: ['*']  # NOTE(review): same '*' vs '**' caveat as for push
17 | jobs:
18 |   deployment:  # first stage: checks out the repository, then runs "/start deploy"
19 |     runs-on: [self-hosted, linux, x64]
20 |     steps:
21 |     - uses: actions/checkout@v3
22 |     - name: Deployment infrastructure
23 |       run: /start deploy  # NOTE(review): /start is an absolute path outside this repository - presumably provided by the runner
24 |   build:
25 |     needs: [deployment]  # runs only after deployment has finished successfully
26 |     runs-on: [self-hosted, linux, x64]
27 |     steps:
28 |     - name: Building NCCL RDMA sharp plugin
29 |       run: /start build
30 |   sharp_config:
31 |     needs: [deployment, build]
32 |     runs-on: [self-hosted, linux, x64]
33 |     steps:
34 |     - name: Configuring and checking Sharp
35 |       run: /start sharp
36 |   testing:
37 |     needs: [sharp_config]
38 |     runs-on: [self-hosted, linux, x64]
39 |     steps:
40 |     - name: Running tests
41 |       run: /start test
42 |   clean:
43 |     if: ${{ always() }}  # cleanup runs even when an earlier job failed or was cancelled
44 |     needs: [testing]
45 |     runs-on: [self-hosted, linux, x64]
46 |     steps:
47 |     - name: Cleaning
48 |       run: /start clean
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .gitignore
  2 | .project
  3 | .cproject
  4 | .settings
  5 | test/test
  6 | Makefile
  7 | Makefile.in
  8 | aclocal.m4
  9 | compile
 10 | config.guess
 11 | config.h.in
 12 | config.h.in~
 13 | config.sub
 14 | m4/libtool.m4
 15 | m4/ltoptions.m4
 16 | m4/ltsugar.m4
 17 | m4/ltversion.m4
 18 | m4/lt~obsolete.m4
 19 | config/aux
 20 | configure
 21 | install-sh
 22 | ltmain.sh
 23 | missing
 24 | config.h
 25 | config.log
 26 | config.status
 27 | libtool
 28 | stamp-h1
 29 | src/sharp/api/version.h
 30 | autom4te.cache
 31 | depcomp
 32 | .libs
 33 | *.la
 34 | .deps
 35 | .dirstamp
 36 | *.lo
 37 | *.o
 38 | build-*
 39 | sharp*tar.gz
 40 | rpm-dist
 41 | cov_build*
 42 | debian/changelog
 43 | debian/control
 44 | debian/rules
 45 | debian/sharp.postinst
 46 | debian/nccl-rdma-sharp-plugins.postinst
 47 | debian/nccl-rdma-sharp-plugins.prem
 48 | sharp.spec
 49 | sharp.pc
 50 | doc/doxygen-doc
 51 | doc/uml/uct.pdf
 52 | test-driver
 53 | install
 54 | src/api/version.h
 55 | tags
 56 | valgrind*xml
 57 | *.tap
 58 | jenkins/*
 59 | sharp-*
 60 | config.cache
 61 | nccl-rdma-sharp-plugins.pc
 62 | nccl-rdma-sharp-plugins.spec
 63 | 
 64 | # Prerequisites
 65 | *.d
 66 | 
 67 | # Compiled Object files
 68 | *.slo
 69 | *.lo
 70 | *.o
 71 | *.obj
 72 | 
 73 | # Precompiled Headers
 74 | *.gch
 75 | *.pch
 76 | 
 77 | # Compiled Dynamic libraries
 78 | *.so
 79 | *.dylib
 80 | *.dll
 81 | 
 82 | # Fortran module files
 83 | *.mod
 84 | *.smod
 85 | 
 86 | # Compiled Static libraries
 87 | *.lai
 88 | *.la
 89 | *.a
 90 | *.lib
 91 | 
 92 | # Executables
 93 | *.exe
 94 | *.out
 95 | *.app
 96 | 
 97 | .idea
 98 | 
 99 | *.orig
100 | *.bak
101 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without 
 4 | modification, are permitted provided that the following conditions 
 5 | are met:
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright 
 8 | notice, this list of conditions and the following disclaimer.
 9 | 2. Redistributions in binary form must reproduce the above copyright
10 | notice, this list of conditions and the following disclaimer in the 
11 | documentation and/or other materials provided with the distribution.
12 | 3. Neither the name of the copyright holder nor the names of its 
13 | contributors may be used to endorse or promote products derived from 
14 | this software without specific prior written permission.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 
22 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
 1 | #
 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 | # SPDX-License-Identifier: BSD-3-Clause
 5 | #
 6 | # See file LICENSE for terms.
 7 | #
 8 | 
 9 | SUBDIRS = src
10 | 
11 | EXTRA_DIST =
12 | EXTRA_DIST += autogen.sh
13 | EXTRA_DIST += include
14 | EXTRA_DIST += debian
15 | EXTRA_DIST += nccl-rdma-sharp-plugins.spec
16 | 
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # nccl-rdma-sharp-plugins
 2 | 
 3 | The nccl-rdma-sharp plugin enables RDMA and switch-based collectives (SHARP)
 4 | with [NVIDIA's NCCL](https://github.com/NVIDIA/nccl) library.
 5 | 
 6 | ## Overview
 7 | 
 8 | ## Requirements
 9 | 
10 | * MOFED
11 | * CUDA
12 | * SHARP
13 | * NCCL
14 | * GPUDirectRDMA plugin
15 | 
16 | ## Build Instructions
17 | 
18 | ### build system requirements
19 | 
20 | * CUDA
21 | * SHARP
22 | * MOFED
23 | 
24 | Plugin uses GNU autotools for its build system. You can build it as follows:
25 | 
26 | 
27 | ```
28 | $ ./autogen.sh
29 | $ ./configure
30 | $ make
31 | $ make install
32 | ```
33 | 
34 | The following configure flags can be used to build against dependencies installed in non-standard locations:
35 | 
36 | 
37 | ```
38 |   --with-verbs=PATH       Path to non-standard libibverbs installation
39 |   --with-sharp=PATH       Path to non-standard SHARP installation
40 |   --with-cuda=PATH        Path to non-standard CUDA installation
41 | ```
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 5 | # SPDX-License-Identifier: BSD-3-Clause
 6 | #
 7 | # Redistribution and use in source and binary forms, with or without
 8 | # modification, are permitted provided that the following conditions are met:
 9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its
18 | # contributors may be used to endorse or promote products derived from
19 | # this software without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | 
33 | rm -rf autom4te.cache  # remove any autom4te cache directory before regenerating
34 | autoreconf -ivf || exit 1  # regenerate the build system (-i install, -v verbose, -f force); exit 1 on failure
35 | rm -rf autom4te.cache  # remove the cache directory again afterwards
36 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | #
  4 | # See file LICENSE for terms.
  5 | #
  6 | AC_PREREQ([2.63])
  7 | 
  8 | AC_COPYRIGHT([Copyright (c) 2019, NVIDIA CORPORATION & AFFILIATES. All rights reserved.])
  9 | 
 10 | define([nccl_rdma_sharp_plugins_ver_major], 2)
 11 | define([nccl_rdma_sharp_plugins_ver_minor], 7)
 12 | 
 13 | AC_INIT([nccl-rdma-sharp-plugins], [nccl_rdma_sharp_plugins_ver_major.nccl_rdma_sharp_plugins_ver_minor], [support@mellanox.com], [],[http://github.com/Mellanox/nccl-rdma-sharp-plugins])
 14 | 
 15 | AM_INIT_AUTOMAKE([1.10 foreign tar-ustar subdir-objects])
 16 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 17 | AM_MAINTAINER_MODE
 18 | AC_CONFIG_MACRO_DIR([m4])
 19 | 
 20 | AC_USE_SYSTEM_EXTENSIONS
 21 | AC_GNU_SOURCE
 22 | AC_CONFIG_HEADERS([config.h])
 23 | 
 24 | RPM_RELEASE=1
 25 | MAJOR_VERSION=nccl_rdma_sharp_plugins_ver_major
 26 | MINOR_VERSION=nccl_rdma_sharp_plugins_ver_minor
 27 | VERSION=$MAJOR_VERSION.$MINOR_VERSION
 28 | 
 29 | AC_SUBST(RPM_RELEASE)
 30 | AC_SUBST(VERSION)
 31 | AC_SUBST(MAJOR_VERSION)
 32 | AC_SUBST(MINOR_VERSION)
 33 | AC_SUBST([BUILD_DATE], [$(date +'%b/%d/%Y')])
 34 | AC_SUBST([BUILD_TIME], [$(date +'%H:%M:%S')])
 35 | 
 36 | # Checks for programs.
 37 | AC_GNU_SOURCE
 38 | AC_PROG_CC
 39 | AC_PROG_CC_STDC
 40 | AC_PROG_CXX
 41 | AM_PROG_AS
 42 | AC_PROG_SED
 43 | AC_PROG_INSTALL
 44 | AC_PROG_LIBTOOL
 45 | AC_HEADER_STDC
 46 | LT_LIB_M
 47 | 
 48 | AC_ARG_ENABLE([debug],AS_HELP_STRING([--enable-debug], [Enable extra debugging code (default is NO).]),
 49 |               [], [enable_debug=no])
 50 | # The "x" prefix and quoting keep the test well-formed when --enable-debug= is given an empty value
 51 | if test "x$enable_debug" = xyes; then
 52 | 	AC_DEFINE([ENABLE_DEBUG], [1], [Enable debugging code])
 53 | 	CFLAGS="$CFLAGS -O0 -g3 -Wall -Werror"
 54 | else
 55 | 	CFLAGS="$CFLAGS -O3 -DNDEBUG -Wall -Werror"
 56 | fi
 57 | 
 58 | # check for cuda; without --with-cuda fall back to /usr/local/cuda (both lib64 and lib are searched)
 59 | AC_ARG_WITH([cuda],
 60 |             [AS_HELP_STRING([--with-cuda=PATH],
 61 |             [Path to non-standard CUDA installation])],
 62 |             [AS_IF([test -d "$withval/lib64"], [cuda_libdir="lib64"], [cuda_libdir="lib"])
 63 |              CFLAGS="-I$withval/include $CFLAGS"
 64 |              LDFLAGS="-L$withval/$cuda_libdir $LDFLAGS"],
 65 |             [CFLAGS="-I/usr/local/cuda/include $CFLAGS"
 66 |              LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib $LDFLAGS"])
 67 | 
 68 | AC_CHECK_HEADER(       [cuda_runtime.h], [], [AC_MSG_FAILURE([CUDA runtime header files not found])])
 69 | AC_CHECK_LIB([cudart], [cudaMalloc],     [], [AC_MSG_FAILURE([CUDA runtime libs not found])])
 70 | 
 71 | # check for verbs; without --with-verbs the system locations under /usr are used
 72 | AC_ARG_WITH([verbs],
 73 |             [AS_HELP_STRING([--with-verbs(=DIR)],
 74 |             [Build Infiniband support, adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])],
 75 |             [CFLAGS="-I$with_verbs/include $CFLAGS"
 76 |              LDFLAGS="-L$with_verbs/lib64 -L$with_verbs/lib -libverbs $LDFLAGS"],
 77 |             [CFLAGS="-I/usr/include $CFLAGS"
 78 |              LDFLAGS="-L/usr/lib64 -L/usr/lib -libverbs $LDFLAGS"])
 79 | 
 80 | AC_CHECK_HEADER(        [infiniband/verbs.h],  [],[AC_MSG_FAILURE([ibverbs header files not found])])
 81 | AC_CHECK_LIB([ibverbs], [ibv_get_device_list], [],[AC_MSG_FAILURE([libibverbs not found]);])
 82 | 
 83 | AC_CHECK_DECLS([IBV_ACCESS_RELAXED_ORDERING, IBV_QPF_GRH_REQUIRED, ibv_reg_dmabuf_mr, ibv_query_ece, ibv_set_ece], [], [],
 84 |                     [[#include <infiniband/verbs.h>]])
 85 | 
 86 | # check for ucx
 87 | AM_CONDITIONAL([HAVE_UCX_PLUGIN], [false])
 88 | m4_include([m4/ucx.m4])
 89 | CHECK_UCX
 90 | AC_MSG_RESULT([UCX support: $ucx_happy])
 91 | 
 92 | # check for sharp
 93 | AM_CONDITIONAL([HAVE_SHARP_PLUGIN], [false])
 94 | m4_include([m4/sharp.m4])
 95 | CHECK_SHARP
 96 | AC_MSG_RESULT([SHARP support: $sharp_happy])
 97 | 
 98 | #check for required headers
 99 | AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h], [],
100 |                  [AC_MSG_FAILURE([unable to find required headers])])
101 | 
102 | AC_CONFIG_FILES([Makefile
103 |                  src/Makefile
104 |                  nccl-rdma-sharp-plugins.spec
105 |                  nccl-rdma-sharp-plugins.pc
106 |                  debian/changelog
107 |                  debian/control
108 |                  debian/nccl-rdma-sharp-plugins.postinst
109 |                  debian/nccl-rdma-sharp-plugins.prem
110 |                  debian/rules
111 |                 ])
112 | AC_OUTPUT
113 | 
114 | echo "NCCL-RDMA-SHARP Plugin has been configured."
115 | 


--------------------------------------------------------------------------------
/contrib/buildrpm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eE
 2 | #
 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 4 | # Copyright (c) 2001-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 5 | # SPDX-License-Identifier: BSD-3-Clause
 6 | # See file LICENSE for terms.
 7 | #
 8 | 
 9 | PACKAGE=nccl-rdma-sharp-plugins
10 | WS=$PWD
11 | rpmspec=${PACKAGE}.spec
12 | rpmmacros=("--define=_rpmdir ${WS}/rpm-dist" "--define=_srcrpmdir ${WS}/rpm-dist" "--define=_sourcedir ${WS}" "--define=_specdir ${WS}" "--define=_builddir ${WS}")
13 | rpmopts=(--nodeps "--buildroot=${WS}/_rpm")
14 | 
15 | # rpmbuild options are kept in arrays so a workspace path with spaces or quotes stays one argument each
16 | 
17 | opt_tarball=0
18 | opt_srcrpm=0
19 | opt_binrpm=0
20 | 
21 | while test "$1" != ""; do
22 |     case $1 in
23 |         --tarball|-t) opt_tarball=1 ;;
24 |         --srcrpm|-s)  opt_srcrpm=1 ;;
25 |         --binrpm|-b)  opt_binrpm=1 ;;
26 |         *)
27 |             cat <<EOF
28 | Unrecognized argument: $1
29 | 
30 | Valid arguments:
31 | 
32 | --tarball|-t    Create tarball
33 | --srcrpm|-s     Create src.rpm
34 | --binrpm|-b     Create bin.rpm
35 | 
36 | 
37 | EOF
38 |             exit 1
39 |             ;;
40 |     esac
41 |     shift
42 | done
43 | 
44 | 
45 | mkdir -p rpm-dist
46 | 
47 | if [ $opt_tarball -eq 1 ]; then
48 |     make dist
49 | fi
50 | 
51 | if [ $opt_srcrpm -eq 1 ]; then
52 |     (set -x; rpmbuild -bs "${rpmmacros[@]}" "${rpmopts[@]}" "${rpmspec}")  # subshell keeps the command trace local
53 | fi
54 | 
55 | if [ $opt_binrpm -eq 1 ]; then
56 |     (set -x; rpmbuild -bb "${rpmmacros[@]}" "${rpmopts[@]}" "${rpmspec}")  # subshell keeps the command trace local
57 | fi
58 | 


--------------------------------------------------------------------------------
/debian/changelog.in:
--------------------------------------------------------------------------------
 1 | #
 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 | # SPDX-License-Identifier: BSD-3-Clause
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #
 9 | # 1. Redistributions of source code must retain the above copyright notice, this
10 | # list of conditions and the following disclaimer.
11 | #
12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
13 | # this list of conditions and the following disclaimer in the documentation
14 | # and/or other materials provided with the distribution.
15 | #
16 | # 3. Neither the name of the copyright holder nor the names of its
17 | # contributors may be used to endorse or promote products derived from
18 | # this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | 
32 | nccl-rdma-sharp-plugins (@VERSION@) unstable; urgency=low
33 | 
34 |   * Initial release (Closes: #nnnn)  <nnnn is the bug number of your ITP>
35 | 
36 |  -- Mellanox Ltd. <support@mellanox.com>  Wed, 11 Sep 2013 15:24:22 +0300
37 | 


--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
1 | 8
2 | 


--------------------------------------------------------------------------------
/debian/control.in:
--------------------------------------------------------------------------------
 1 | #
 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 | # SPDX-License-Identifier: BSD-3-Clause
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #
 9 | # 1. Redistributions of source code must retain the above copyright notice, this
10 | # list of conditions and the following disclaimer.
11 | #
12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
13 | # this list of conditions and the following disclaimer in the documentation
14 | # and/or other materials provided with the distribution.
15 | #
16 | # 3. Neither the name of the copyright holder nor the names of its
17 | # contributors may be used to endorse or promote products derived from
18 | # this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | 
32 | Source: @PACKAGE@
33 | Section: libs
34 | Priority: extra
35 | Maintainer: support@mellanox.com
36 | Build-Depends: libibverbs-dev
37 | Standards-Version: @MAJOR_VERSION@.@MINOR_VERSION@
38 | Homepage: http://www.mellanox.com
39 | 
40 | Package: @PACKAGE@
41 | Section: libs
42 | Depends: ${shlibs:Depends}, ${misc:Depends}
43 | Architecture: any
44 | Description: RDMA and SHARP plugin for NCCL
45 |  Plugin enabling RDMA and switch-based collectives (SHARP) in NCCL
46 | 


--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
 2 | Upstream-Name: NCCL-RDMA-SHARP plugins
 3 | Source: http://www.mellanox.com
 4 | 
 5 | Files: *
 6 | Copyright (c) 2015-2019, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 7 | License: BSD
 8 |  Redistribution and use in source and binary forms, with or without 
 9 |  modification, are permitted provided that the following conditions 
10 |  are met:
11 |  
12 |  1. Redistributions of source code must retain the above copyright 
13 |  notice, this list of conditions and the following disclaimer.
14 |  2. Redistributions in binary form must reproduce the above copyright
15 |  notice, this list of conditions and the following disclaimer in the 
16 |  documentation and/or other materials provided with the distribution.
17 |  3. Neither the name of the copyright holder nor the names of its 
18 |  contributors may be used to endorse or promote products derived from 
19 |  this software without specific prior written permission.
20 |  
21 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24 |  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25 |  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26 |  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 
27 |  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
28 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
29 |  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
30 |  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
31 |  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 | 


--------------------------------------------------------------------------------
/debian/nccl-rdma-sharp-plugins.postinst.in:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 5 | # SPDX-License-Identifier: BSD-3-Clause
 6 | #
 7 | # Redistribution and use in source and binary forms, with or without
 8 | # modification, are permitted provided that the following conditions are met:
 9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its
18 | # contributors may be used to endorse or promote products derived from
19 | # this software without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | 
33 | set -e  # abort the maintainer script as soon as any command fails
34 | if [ "@prefix@/lib/pkgconfig" != /usr/lib/pkgconfig ];then  # skip the copy when source and destination would be the same file
35 |     install -m 755 "@prefix@/lib/pkgconfig/nccl-rdma-sharp-plugins.pc" /usr/lib/pkgconfig/nccl-rdma-sharp-plugins.pc
36 | fi
37 | 
38 | 


--------------------------------------------------------------------------------
/debian/nccl-rdma-sharp-plugins.prem.in:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 5 | # SPDX-License-Identifier: BSD-3-Clause
 6 | #
 7 | # Redistribution and use in source and binary forms, with or without
 8 | # modification, are permitted provided that the following conditions are met:
 9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its
18 | # contributors may be used to endorse or promote products derived from
19 | # this software without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | 
33 | PCF=/usr/lib/pkgconfig/nccl-rdma-sharp-plugins.pc  # pkg-config file removed by this script
34 | 
35 | if [ -f $PCF ];then  # only act when the path exists as a regular file
36 |     rm -f $PCF
37 | fi
38 | 


--------------------------------------------------------------------------------
/debian/rules.in:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | # -*- makefile -*-
 3 | #
 4 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 5 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 6 | # SPDX-License-Identifier: BSD-3-Clause
 7 | #
 8 | # Redistribution and use in source and binary forms, with or without
 9 | # modification, are permitted provided that the following conditions are met:
10 | #
11 | # 1. Redistributions of source code must retain the above copyright notice, this
12 | # list of conditions and the following disclaimer.
13 | #
14 | # 2. Redistributions in binary form must reproduce the above copyright notice,
15 | # this list of conditions and the following disclaimer in the documentation
16 | # and/or other materials provided with the distribution.
17 | #
18 | # 3. Neither the name of the copyright holder nor the names of its
19 | # contributors may be used to endorse or promote products derived from
20 | # this software without specific prior written permission.
21 | #
22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 | #
33 | 
34 | # Sample debian/rules that uses debhelper.
35 | # This file was originally written by Joey Hess and Craig Small.
36 | # As a special exception, when this file is copied by dh-make into a
37 | # dh-make output file, you may use that output file without restriction.
38 | # This special exception was added by Craig Small in version 0.37 of dh-make.
39 | 
40 | # Uncomment this to turn on verbose mode.
41 | #export DH_VERBOSE=1
42 | 
43 | %:  # catch-all target: every requested target is handed to dh
44 | 	dh $@ 
45 | 
46 | override_dh_auto_configure:  # run the project's release configure wrapper instead of dh_auto_configure
47 | 	contrib/configure-release
48 | 	chmod +x debian/rules
49 | 
50 | override_dh_shlibdeps:  # call dh_shlibdeps with --ignore-missing-info forwarded to dpkg-shlibdeps
51 | 	dh_shlibdeps --dpkg-shlibdeps-params=--ignore-missing-info
52 | 
53 | override_dh_auto_clean:  # intentionally empty: no recipe is run for this step
54 | 


--------------------------------------------------------------------------------
/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (native)
2 | 


--------------------------------------------------------------------------------
/include/core.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #ifndef NCCL_CORE_H_
 10 | #define NCCL_CORE_H_
 11 | 
 12 | #include "nccl.h"
 13 | #include "debug.h"
 14 | 
 15 | #include <stdint.h>
 16 | #include <stdlib.h>
 17 | 
 18 | #define MIN(a, b) ((a)<(b)?(a):(b)) /* Smaller of a and b; one of the arguments is evaluated twice. */
 19 | #define MAX(a, b) ((a)>(b)?(a):(b)) /* Larger of a and b; one of the arguments is evaluated twice. */
 20 | 
 21 | #define DIVUP(x, y) /* (x + y - 1) / y: quotient rounded up for positive integer operands; y is evaluated twice. */ \
 22 |     (((x)+(y)-1)/(y))
 23 | #define ROUNDUP(x, y) /* DIVUP(x, y) * y: x rounded up to a multiple of y; y is evaluated three times. */ \
 24 |     (DIVUP((x), (y))*(y))
 25 | 
 26 | // Check CUDA calls
 27 | #define CUDACHECK(cmd) do {                                 \
 28 |     cudaError_t err = cmd;                                  \
 29 |     if( err != cudaSuccess ) {                              \
 30 |         WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
 31 |         return ncclUnhandledCudaError;                      \
 32 |     }                                                       \
 33 | } while(false)
 34 | 
 35 | #define CUDACHECKGOTO(cmd, RES, label) do {                 \
 36 |     cudaError_t err = cmd;                                  \
 37 |     if( err != cudaSuccess ) {                              \
 38 |         WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
 39 |         RES = ncclUnhandledCudaError;                       \
 40 |         goto label;                                         \
 41 |     }                                                       \
 42 | } while(false)
 43 | 
 44 | // Report failure but clear error and continue
 45 | #define CUDACHECKIGNORE(cmd) do {  \
 46 |     cudaError_t err = cmd;         \
 47 |     if( err != cudaSuccess ) {     \
 48 |         INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
 49 |         (void) cudaGetLastError(); \
 50 |     }                              \
 51 | } while(false)
 52 | 
 53 | #include <errno.h>
 54 | // Check system calls
 55 | #define SYSCHECK(statement, name) do { \
 56 |   int retval; \
 57 |   SYSCHECKSYNC((statement), name, retval); \
 58 |   if (retval == -1) { \
 59 |     WARN("Call to " name " failed: %s", strerror(errno)); \
 60 |     return ncclSystemError; \
 61 |   } \
 62 | } while (false)
 63 | 
 64 | #define SYSCHECKSYNC(statement, name, retval) do { \
 65 |   retval = (statement); \
 66 |   if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
 67 |     INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
 68 |   } else { \
 69 |     break; \
 70 |   } \
 71 | } while(true)
 72 | 
 73 | #define SYSCHECKGOTO(statement, name, RES, label) do { \
 74 |   int retval; \
 75 |   SYSCHECKSYNC((statement), name, retval); \
 76 |   if (retval == -1) { \
 77 |     WARN("Call to " name " failed: %s", strerror(errno)); \
 78 |     RES = ncclSystemError; \
 79 |     goto label; \
 80 |   } \
 81 | } while (0)
 82 | 
 83 | // Pthread calls don't set errno and never return EINTR: the return value itself is the error code passed to strerror().
 84 | #define PTHREADCHECK(statement, name) do { \
 85 |   int retval = (statement); \
 86 |   if (retval != 0) { \
 87 |     WARN("Call to " name " failed: %s", strerror(retval)); \
 88 |     return ncclSystemError; \
 89 |   } \
 90 | } while (0)
 91 | 
 92 | #define PTHREADCHECKGOTO(statement, name, RES, label) do { /* as PTHREADCHECK, but stores the error in RES and jumps to label */ \
 93 |   int retval = (statement); \
 94 |   if (retval != 0) { \
 95 |     WARN("Call to " name " failed: %s", strerror(retval)); \
 96 |     RES = ncclSystemError; \
 97 |     goto label; \
 98 |   } \
 99 | } while (0)
100 | 
101 | 
102 | #define NEQCHECK(statement, value) do {   /* return ncclSystemError unless statement == value */ \
103 |   if ((statement) != (value)) {           \
104 |     /* Log the location, the error code and strerror(errno) */ \
105 |     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
106 |     return ncclSystemError;     \
107 |   }                             \
108 | } while (0)
109 | 
110 | #define NEQCHECKGOTO(statement, value, RES, label) do { /* as NEQCHECK, but sets RES and jumps to label */ \
111 |   if ((statement) != (value)) { \
112 |     /* RES is assigned first so the log line can print it */ \
113 |     RES = ncclSystemError;    \
114 |     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
115 |     goto label; \
116 |   } \
117 | } while (0)
118 | 
119 | #define EQCHECK(statement, value) do {    /* return ncclSystemError when statement == value */ \
120 |   if ((statement) == (value)) {           \
121 |     /* Log the location, the error code and strerror(errno) */ \
122 |     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
123 |     return ncclSystemError;     \
124 |   }                             \
125 | } while (0)
126 | 
127 | #define EQCHECKGOTO(statement, value, RES, label) do { /* as EQCHECK, but sets RES and jumps to label */ \
128 |   if ((statement) == (value)) { \
129 |     /* RES is assigned first so the log line can print it */ \
130 |     RES = ncclSystemError;    \
131 |     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
132 |     goto label; \
133 |   } \
134 | } while (0)
135 | 
136 | // Propagate errors up: any result other than ncclSuccess or ncclInProgress is returned to the caller.
137 | #define NCCLCHECK(call) do { \
138 |   ncclResult_t RES = call; \
139 |   if (RES != ncclSuccess && RES != ncclInProgress) { \
140 |     /* Nothing is logged here; the result is returned unchanged */ \
141 |     return RES; \
142 |   } \
143 | } while (0)
144 | 
145 | #define NCCLCHECKGOTO(call, RES, label) do { /* as NCCLCHECK, but leaves the result in RES and jumps to label */ \
146 |   RES = call; \
147 |   if (RES != ncclSuccess && RES != ncclInProgress) { \
148 |     /* Nothing is logged here; RES already holds the failing result */ \
149 |     goto label; \
150 |   } \
151 | } while (0)
152 | 
153 | #define NCCLWAIT(call, cond, abortFlagPtr) do {  /* repeat call until cond is true */ \
154 |   volatile uint32_t* tmpAbortFlag = (abortFlagPtr);     \
155 |   ncclResult_t RES = call;                \
156 |   if (RES != ncclSuccess && RES != ncclInProgress) {               \
157 |     return ncclInternalError;  /* the failing RES is not propagated */ \
158 |   }                                       \
159 |   if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); /* non-NULL flag with a non-zero value returns ncclSystemError */ \
160 | } while (!(cond))
161 | 
162 | #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { /* as NCCLWAIT, but reports through RES and label */ \
163 |   volatile uint32_t* tmpAbortFlag = (abortFlagPtr);             \
164 |   RES = call;                             \
165 |   if (RES != ncclSuccess && RES != ncclInProgress) {               \
166 |     goto label;  /* RES keeps the failing result of call */ \
167 |   }                                       \
168 |   if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); /* non-zero flag sets RES = ncclSystemError */ \
169 | } while (!(cond))
170 | 
171 | #define NCCLCHECKTHREAD(a, args) do { /* stores the result of a in (args)->ret; on failure logs it and returns args */ \
172 |   if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
173 |     INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
174 |     return args; \
175 |   } \
176 | } while(0)
177 | 
178 | #define CUDACHECKTHREAD(a) do { /* relies on a variable named args in the enclosing scope; it is not a macro parameter */ \
179 |   if ((a) != cudaSuccess) { \
180 |     args->ret = ncclUnhandledCudaError; /* assigned before logging so the INFO line reports this failure */ \
181 |     INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
182 |     return args; \
183 |   } \
184 | } while(0)
185 | 
186 | #endif
187 | 


--------------------------------------------------------------------------------
/include/debug.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  *
 6 |  * See LICENSE.txt for license information
 7 |  ************************************************************************/
 8 | 
 9 | #ifndef NCCL_DEBUG_H_
10 | #define NCCL_DEBUG_H_
11 | 
12 | #include "core.h"
13 | 
14 | #include <stdio.h>
15 | 
16 | #include <sys/syscall.h>
17 | #include <limits.h>
18 | #include <string.h>
19 | #include <pthread.h>
20 | #include "net.h"
21 | 
22 | // Conform to pthread and NVTX standard
23 | #define NCCL_THREAD_NAMELEN 16
24 | 
25 | extern pthread_mutex_t ncclDebugLock;
26 | 
27 | extern ncclDebugLogger_t pluginLogFunction; // logger callback that WARN/INFO/TRACE forward to
28 | 
29 | #define WARN(...) pluginLogFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) /* warning level, subsystem NCCL_ALL, tagged with the file name */
30 | #define INFO(FLAGS, ...) pluginLogFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) /* info level for subsystem mask FLAGS, tagged with the function name */
31 | 
32 | #ifdef ENABLE_TRACE
33 | #define TRACE(FLAGS, ...) pluginLogFunction(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) /* trace level, same shape as INFO */
34 | #else
35 | #define TRACE(...) /* expands to nothing when ENABLE_TRACE is not defined */
36 | #endif
37 | 
38 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
39 | 
40 | void ncclResetDebugInit();
41 | 
42 | #endif
43 | 


--------------------------------------------------------------------------------
/include/ibvwrap.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
  3 |  * Copyright (c) 2004, 2011-2012 Intel Corporation.  All rights reserved.
  4 |  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  5 |  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
  6 |  *
  7 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  8 |  * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  9 |  * SPDX-License-Identifier: BSD-3-Clause
 10 |  *
 11 |  * See LICENSE.txt for license information
 12 |  ************************************************************************/
 13 | 
 14 | #ifndef NCCL_IBVWRAP_H_
 15 | #define NCCL_IBVWRAP_H_
 16 | #include "config.h"
 17 | #include "core.h"
 18 | #include "utils.h"
 19 | #include <arpa/inet.h>
 20 | #include <netinet/in.h>
 21 | #include <infiniband/verbs.h>
 22 | 
 23 | #if !HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING /* NOTE(review): presumably set by configure through config.h - confirm */
 24 | #  define IBV_ACCESS_RELAXED_ORDERING               0 /* fallback: 0 is a no-op when OR-ed into a flag mask */
 25 | #endif
 26 | #if !HAVE_DECL_IBV_QPF_GRH_REQUIRED
 27 | #  define IBV_QPF_GRH_REQUIRED                      0 /* fallback: 0 never tests as set in a flag mask */
 28 | #endif
 29 | 
 30 | #if !HAVE_DECL_IBV_SET_ECE /* local definition so the ECE wrappers below still have a type to refer to */
 31 | struct ibv_ece {
 32 | 	/*
 33 | 	 * Unique identifier of the provider vendor on the network.
 34 | 	 * Providers set their IEEE OUI here to distinguish
 35 | 	 * themselves in a non-homogeneous network.
 36 | 	 */
 37 | 	uint32_t vendor_id;
 38 | 	/*
 39 | 	 * Provider specific attributes which are supported or
 40 | 	 * need to be enabled by ECE users.
 41 | 	 */
 42 | 	uint32_t options;
 43 | 	uint32_t comp_mask;
 44 | };
 45 | #endif
 46 | 
 47 | ncclResult_t wrap_ibv_fork_init(void);
 48 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
 49 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
 50 | const char *wrap_ibv_get_device_name(struct ibv_device *device);
 51 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
 52 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
 53 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
 54 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
 55 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
 56 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
 57 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
 58 | ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
 59 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
 60 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
 61 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
 62 | struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
 63 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
 64 | /* DMA-BUF support */
 65 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
 66 | struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
 67 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
 68 | ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
 69 | ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
 70 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
 71 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
 72 | static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) {
 73 |   const int polled = cq->context->ops.poll_cq(cq, num_entries, wc); /* number of wcs (possibly 0) on success, negative on failure */
 74 |   if (polled >= 0) {
 75 |     *num_done = polled;
 76 |     return ncclSuccess;
 77 |   }
 78 |   WARN("Call to ibv_poll_cq() returned %d", polled);
 79 |   return ncclSystemError;
 80 | }
 81 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
 82 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
 83 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
 84 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
 85 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
 86 | ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
 87 | ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);
 88 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
 89 | 
 90 | // Renders a GID as readable text in gidStr. On success, returns a non-null pointer to gidStr.
 91 | // On error NULL is returned and errno indicates the cause;
 92 | // errno = ENOSPC if the converted string would exceed strLen.
 93 | static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) {
 94 |   // A GID is a 16B handle with the same size as struct in6_addr, so inet_ntop with AF_INET6 can format it.
 95 |   NCCL_STATIC_ASSERT(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr");
 96 |   const char* formatted = inet_ntop(AF_INET6, gid->raw, gidStr, strLen);
 97 |   return formatted;
 98 | }
 99 | 
100 | #endif //End include guard
101 | 


--------------------------------------------------------------------------------
/include/net.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  */
 6 | 
 7 | #ifndef NCCL_NET_H_
 8 | #define NCCL_NET_H_
 9 | 
10 | #include <stdint.h>
11 | #include <stdlib.h>
12 | 
13 | #define NCCL_NET_HANDLE_MAXSIZE 128 /* upper bound, in bytes, of a connection handle */
14 | //Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties (1024^4 bytes; the L suffix makes the last multiplication use long)
15 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
16 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
17 | 
18 | 
19 | #define NCCL_PTR_HOST 0x1 /* NCCL_PTR_* are distinct bits that can be OR-ed together */
20 | #define NCCL_PTR_CUDA 0x2
21 | #define NCCL_PTR_DMABUF 0x4
22 | 
23 | // Maximum number of requests per comm object
24 | #define NCCL_NET_MAX_REQUESTS 8
25 | 
26 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; // log levels
27 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; // one bit per subsystem; NCCL_ALL has every bit set
28 | 
29 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); // logger callback: level, subsystem flags, location, format string plus variadic arguments
30 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); // NOTE(review): profiler callback; argument semantics are not shown in this file - confirm against NCCL
31 | 
32 | #include "net_v10.h"
33 | #include "net_v9.h"
34 | #include "net_v8.h"
35 | #include "net_v7.h"
36 | #include "net_v6.h"
37 | #include "net_v5.h"
38 | 
39 | #endif // end include guard
40 | 


--------------------------------------------------------------------------------
/include/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  *
 6 |  * See LICENSE.txt for license information
 7 |  ************************************************************************/
 8 | 
 9 | #ifndef NET_DEVICE_H_
10 | #define NET_DEVICE_H_
11 | 
12 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
13 | #define NCCL_NET_MTU_SIZE                    4096
14 | 
15 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
16 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
17 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
18 | 
19 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
20 | 
21 | typedef struct {
22 |   ncclNetDeviceType netDeviceType; // Network offload type
23 |   int netDeviceVersion;            // Version number for network offload
24 |   void* handle;                    // NOTE(review): presumably an opaque plugin-owned object - confirm
25 |   size_t size;                     // NOTE(review): presumably the size in bytes of the object behind handle - confirm
26 |   int needsProxyProgress;          // NOTE(review): presumably non-zero when the proxy must keep progressing this comm - confirm
27 | } ncclNetDeviceHandle_v7_t;
28 | 
29 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;   // v8 to v10 reuse the v7 layout unchanged
30 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
31 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
32 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;     // unversioned name tracks the newest version
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/include/net_v10.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  */
  6 | #ifndef NCCL_NET_V10_H_
  7 | #define NCCL_NET_V10_H_
  8 | 
  9 | #include "net_device.h"
 10 | 
 11 | #define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 /* capacity of ncclNetVDeviceProps_v10_t.devs */
 12 | #define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
 13 | typedef struct {
 14 |   int ndevs; // NOTE(review): presumably the number of valid entries in devs - confirm
 15 |   int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; // NOTE(review): presumably device indices making up the virtual NIC - confirm
 16 | } ncclNetVDeviceProps_v10_t;
 17 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
 18 | 
 19 | #define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 /* NOTE(review): presumably the trafficClass value meaning "not set" - confirm */
 20 | typedef struct {
 21 |   // Plugin-specific TC value
 22 |   int trafficClass;
 23 | } ncclNetCommConfig_v10_t;
 24 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
 25 | 
 26 | typedef struct {
 27 |   char* name;                      // Used mostly for logging.
 28 |   char* pciPath;                   // Path to the PCI device in /sys.
 29 |   uint64_t guid;                   // Unique identifier for the NIC chip. Important for
 30 |                                    // cards with multiple PCI functions (Physical or virtual).
 31 |   int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
 32 |   int regIsGlobal;                 // regMr is not tied to a particular comm
 33 |   int forceFlush;                  // Force a flush on receives
 34 |   int speed;                       // Port speed in Mbps.
 35 |   int port;                        // Port number.
 36 |   float latency;                   // Network latency
 37 |   int maxComms;                    // Maximum number of comms we can create
 38 |   int maxRecvs;                    // Maximum number of grouped receives.
 39 |   ncclNetDeviceType netDeviceType; // Network offload type
 40 |   int netDeviceVersion;            // Version number for network offload
 41 |   ncclNetVDeviceProps_v10_t vProps; // Virtual device properties (see ncclNetVDeviceProps_v10_t)
 42 |   size_t maxP2pBytes;              // Max transfer size for point-to-point operations
 43 |   size_t maxCollBytes;             // Max transfer size for collective operations
 44 | } ncclNetProperties_v10_t;
 45 | 
 46 | typedef ncclNetProperties_v10_t ncclNetProperties_t;
 47 | 
 48 | typedef struct { // Table of function pointers making up the v10 network plugin interface
 49 |   // Name of the network (mainly for logs)
 50 |   const char* name;
 51 |   // Initialize the network.
 52 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
 53 |   // Return the number of adapters.
 54 |   ncclResult_t (*devices)(int* ndev);
 55 |   // Get various device properties.
 56 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
 57 |   // Create a receiving object and provide a handle to connect to it. The
 58 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 59 |   // between ranks to create a connection.
 60 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 61 |   // Connect to a handle and return a sending comm object for that peer.
 62 |   // This call must not block for the connection to be established, and instead
 63 |   // should return successfully with sendComm == NULL with the expectation that
 64 |   // it will be called again until sendComm != NULL.
 65 |   // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
 66 |   ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
 67 |   // Finalize connection establishment after remote peer has called connect.
 68 |   // This call must not block for the connection to be established, and instead
 69 |   // should return successfully with recvComm == NULL with the expectation that
 70 |   // it will be called again until recvComm != NULL.
 71 |   // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
 72 |   ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
 73 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
 74 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
 75 |   ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
 76 |   /* DMA-BUF support: same arguments as regMr plus an offset and a file descriptor */
 77 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
 78 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
 79 |   // Asynchronous send to a peer.
 80 |   // May return request == NULL if the call cannot be performed (or would block)
 81 |   ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
 82 |   // Asynchronous recv from a peer.
 83 |   // May return request == NULL if the call cannot be performed (or would block)
 84 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
 85 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
 86 |   // visible to the GPU. Note that sizes is int* here, while irecv takes size_t*.
 87 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
 88 |   // Test whether a request is complete. If sizes is not NULL, it returns the
 89 |   // number of bytes sent/received.
 90 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
 91 |   // Close and free send/recv comm objects
 92 |   ncclResult_t (*closeSend)(void* sendComm);
 93 |   ncclResult_t (*closeRecv)(void* recvComm);
 94 |   ncclResult_t (*closeListen)(void* listenComm);
 95 | 
 96 |   // Copy the given mhandle to a dptr in a format usable by this plugin's device code
 97 |   ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
 98 | 
 99 |   // Notify the plugin that a recv has completed by the device
100 |   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
101 | 
102 |   // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
103 |   // what index this new vNIC exists at
104 |   ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
105 | } ncclNet_v10_t;
106 | 
107 | typedef struct {  // One scatter/gather element, as passed in the recvParts/sendParts arrays of ncclCollNet_v10_t
108 |   void* mhandle;  // Memory handle for this element (presumably obtained from regMr/regMrDmaBuf -- confirm)
109 |   void* address;  // Address of this element
110 |   size_t size;    // Size of this element (size_t here; ncclNetSGE_v8_t uses uint32_t)
111 | } ncclNetSGE_v10_t;
112 | 
113 | typedef struct {  // v10 collective-network plugin interface: a table of function pointers
114 |   // Name of the collective network (mainly for logs)
115 |   const char* name;
116 |   // Initialize the collective network.
117 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
118 |   // Return the number of adapters capable of doing collective operations.
119 |   // If *ndev is set to 0, all other functions might be set to NULL.
120 |   ncclResult_t (*devices)(int* ndev);
121 |   // Get various device properties.
122 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
123 |   // Create a receiving object and provide a handle to connect to it. The
124 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
125 |   // between ranks to create connections.
126 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
127 |   // Create a group for collective operations. handles have been created
128 |   // using listen() above. rank indicates caller's rank in the collective network.
129 |   ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
130 |   // Returns whether a reduction operation on a data type is supported.
131 |   // *supported is set to 1 if supported, 0 otherwise.
132 |   ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
133 |   // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
134 |   ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
135 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
136 |   ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
137 |   ncclResult_t (*deregMr)(void* collComm, void* mhandle);
138 |   // Performs an asynchronous allreduce operation on the collective group.
139 |   // May return request == NULL if the call cannot be performed (or would block).
140 |   ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
141 |       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
142 |   ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts,
143 |                              size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
144 |                              void* sendMhandle, void** request);  // NOTE(review): contract not documented here; presumably asynchronous like iallreduce -- confirm
145 |   ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData,
146 |                                  size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
147 |                                  ncclDataType_t dataType, ncclRedOp_t redOp,
148 |                                  void* recvMhandle, void** request);  // NOTE(review): contract not documented here; presumably asynchronous like iallreduce -- confirm
149 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
150 |   // visible to the GPU
151 |   ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
152 |   // Test whether a request is complete. If size is not NULL, it returns the
153 |   // number of bytes sent/received.
154 |   ncclResult_t (*test)(void* request, int* done, int* size);
155 |   // Close and free collective comm objects
156 |   ncclResult_t (*closeColl)(void* collComm);
157 |   ncclResult_t (*closeListen)(void* listenComm);
158 |   // Create a virtual NIC given the specified properties, which can be accessed at device index d
159 |   ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
160 | } ncclCollNet_v10_t;
161 | 
162 | typedef ncclCollNet_v10_t ncclCollNet_t;  // The unversioned name is an alias for the v10 collective-network interface
163 | 
164 | #endif // end include guard
165 | 


--------------------------------------------------------------------------------
/include/net_v5.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  */
 6 | 
 7 | #ifndef NCCL_NET_V5_H_
 8 | #define NCCL_NET_V5_H_
 9 | 
10 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;  // v5 reuses the v6 properties record; this header has no #include, so ncclNetProperties_v6_t must already be declared by the includer
11 | typedef struct {  // v5 point-to-point network plugin interface: a table of function pointers
12 |   // Name of the network (mainly for logs)
13 |   const char* name;
14 |   // Initialize the network.
15 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
16 |   // Return the number of adapters.
17 |   ncclResult_t (*devices)(int* ndev);
18 |   // Get various device properties.
19 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
20 |   // Create a receiving object and provide a handle to connect to it. The
21 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
22 |   // between ranks to create a connection.
23 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
24 |   // Connect to a handle and return a sending comm object for that peer.
25 |   // This call must not block for the connection to be established, and instead
26 |   // should return successfully with sendComm == NULL with the expectation that
27 |   // it will be called again until sendComm != NULL.
28 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
29 |   // Finalize connection establishment after remote peer has called connect.
30 |   // This call must not block for the connection to be established, and instead
31 |   // should return successfully with recvComm == NULL with the expectation that
32 |   // it will be called again until recvComm != NULL.
33 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
34 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
35 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note that size is an int in this version.
36 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
37 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
38 |   // Asynchronous send to a peer.
39 |   // May return request == NULL if the call cannot be performed (or would block)
40 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
41 |   // Asynchronous recv from a peer; takes n entries in data, sizes, tags and mhandles.
42 |   // May return request == NULL if the call cannot be performed (or would block)
43 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
44 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
45 |   // visible to the GPU
46 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
47 |   // Test whether a request is complete. If sizes is not NULL, it returns the
48 |   // number of bytes sent/received.
49 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
50 |   // Close and free send/recv/listen comm objects
51 |   ncclResult_t (*closeSend)(void* sendComm);
52 |   ncclResult_t (*closeRecv)(void* recvComm);
53 |   ncclResult_t (*closeListen)(void* listenComm);
54 | } ncclNet_v5_t;
55 | 
56 | 
57 | typedef struct {  // v5 collective-network plugin interface: a table of function pointers
58 |   // Name of the collective network (mainly for logs)
59 |   const char* name;
60 |   // Initialize the collective network.
61 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
62 |   // Return the number of adapters capable of doing collective operations.
63 |   // If *ndev is set to 0, all other functions might be set to NULL.
64 |   ncclResult_t (*devices)(int* ndev);
65 |   // Get various device properties.
66 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
67 |   // Create a receiving object and provide a handle to connect to it. The
68 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
69 |   // between ranks to create connections.
70 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
71 |   // Create a group for collective operations. handles have been created
72 |   // using listen() above. rank indicates caller's rank in the collective network.
73 |   ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
74 |   // Returns whether a reduction operation on a data type is supported.
75 |   // *supported is set to 1 if supported, 0 otherwise.
76 |   ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
77 |   // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
78 |   ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
79 |   ncclResult_t (*deregMr)(void* collComm, void* mhandle);
80 |   // Performs an asynchronous allreduce operation on the collective group.
81 |   // May return request == NULL if the call cannot be performed (or would block).
82 |   ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
83 |       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
84 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
85 |   // visible to the GPU
86 |   ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
87 |   // Test whether a request is complete. If size is not NULL, it returns the
88 |   // number of bytes sent/received.
89 |   ncclResult_t (*test)(void* request, int* done, int* size);
90 |   // Close and free collective/listen comm objects
91 |   ncclResult_t (*closeColl)(void* collComm);
92 |   ncclResult_t (*closeListen)(void* listenComm);
93 | } ncclCollNet_v5_t;
94 | 
95 | #endif
96 | 


--------------------------------------------------------------------------------
/include/net_v6.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  */
  6 | 
  7 | #ifndef NCCL_NET_V6_H_
  8 | #define NCCL_NET_V6_H_
  9 | 
 10 | typedef struct {  // Device properties filled in by getProperties() of the v6 interfaces (also aliased as ncclNetProperties_v5_t)
 11 |   char* name;     // Used mostly for logging.
 12 |   char* pciPath;  // Path to the PCI device in /sys.
 13 |   uint64_t guid;  // Unique identifier for the NIC chip. Important for
 14 |                   // cards with multiple PCI functions (Physical or virtual).
 15 |   int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
 16 |   int speed;      // Port speed in Mbps.
 17 |   int port;       // Port number.
 18 |   float latency;  // Network latency (unit not stated in this header)
 19 |   int maxComms;   // Maximum number of comms we can create
 20 |   int maxRecvs;   // Maximum number of grouped receives.
 21 | }ncclNetProperties_v6_t;
 22 | 
 23 | typedef struct {  // v6 point-to-point network plugin interface: the v5 entry points plus regMrDmaBuf
 24 |   // Name of the network (mainly for logs)
 25 |   const char* name;
 26 |   // Initialize the network.
 27 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 28 |   // Return the number of adapters.
 29 |   ncclResult_t (*devices)(int* ndev);
 30 |   // Get various device properties.
 31 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
 32 |   // Create a receiving object and provide a handle to connect to it. The
 33 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 34 |   // between ranks to create a connection.
 35 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 36 |   // Connect to a handle and return a sending comm object for that peer.
 37 |   // This call must not block for the connection to be established, and instead
 38 |   // should return successfully with sendComm == NULL with the expectation that
 39 |   // it will be called again until sendComm != NULL.
 40 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
 41 |   // Finalize connection establishment after remote peer has called connect.
 42 |   // This call must not block for the connection to be established, and instead
 43 |   // should return successfully with recvComm == NULL with the expectation that
 44 |   // it will be called again until recvComm != NULL.
 45 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
 46 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
 47 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note: size is an int here but a size_t in regMrDmaBuf.
 48 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
 49 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
 50 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
 51 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
 52 |   // Asynchronous send to a peer.
 53 |   // May return request == NULL if the call cannot be performed (or would block)
 54 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
 55 |   // Asynchronous recv from a peer; takes n entries in data, sizes, tags and mhandles.
 56 |   // May return request == NULL if the call cannot be performed (or would block)
 57 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
 58 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
 59 |   // visible to the GPU
 60 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
 61 |   // Test whether a request is complete. If sizes is not NULL, it returns the
 62 |   // number of bytes sent/received.
 63 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
 64 |   // Close and free send/recv/listen comm objects
 65 |   ncclResult_t (*closeSend)(void* sendComm);
 66 |   ncclResult_t (*closeRecv)(void* recvComm);
 67 |   ncclResult_t (*closeListen)(void* listenComm);
 68 | } ncclNet_v6_t;
 69 | 
 70 | typedef struct {  // v6 collective-network plugin interface: the v5 entry points plus regMrDmaBuf
 71 |   // Name of the collective network (mainly for logs)
 72 |   const char* name;
 73 |   // Initialize the collective network.
 74 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 75 |   // Return the number of adapters capable of doing collective operations.
 76 |   // If *ndev is set to 0, all other functions might be set to NULL.
 77 |   ncclResult_t (*devices)(int* ndev);
 78 |   // Get various device properties.
 79 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
 80 |   // Create a receiving object and provide a handle to connect to it. The
 81 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 82 |   // between ranks to create connections.
 83 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 84 |   // Create a group for collective operations. handles have been created
 85 |   // using listen() above. rank indicates caller's rank in the collective network.
 86 |   ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
 87 |   // Returns whether a reduction operation on a data type is supported.
 88 |   // *supported is set to 1 if supported, 0 otherwise.
 89 |   ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
 90 |   // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
 91 |   ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
 92 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
 93 |   ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
 94 |   ncclResult_t (*deregMr)(void* collComm, void* mhandle);
 95 |   // Performs an asynchronous allreduce operation on the collective group.
 96 |   // May return request == NULL if the call cannot be performed (or would block).
 97 |   ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
 98 |       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
 99 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
100 |   // visible to the GPU
101 |   ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
102 |   // Test whether a request is complete. If size is not NULL, it returns the
103 |   // number of bytes sent/received.
104 |   ncclResult_t (*test)(void* request, int* done, int* size);
105 |   // Close and free collective/listen comm objects
106 |   ncclResult_t (*closeColl)(void* collComm);
107 |   ncclResult_t (*closeListen)(void* listenComm);
108 | } ncclCollNet_v6_t;
109 | 
110 | #endif // end include guard
111 | 


--------------------------------------------------------------------------------
/include/net_v7.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  */
  6 | 
  7 | #ifndef NCCL_NET_V7_H_
  8 | #define NCCL_NET_V7_H_
  9 | 
 10 | #include "net_device.h"
 11 | 
 12 | typedef struct {  // Device properties for the v7 interfaces: the v6 fields plus netDeviceType and netDeviceVersion
 13 |   char* name;                      // Used mostly for logging.
 14 |   char* pciPath;                   // Path to the PCI device in /sys.
 15 |   uint64_t guid;                   // Unique identifier for the NIC chip. Important for
 16 |                                    // cards with multiple PCI functions (Physical or virtual).
 17 |   int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
 18 |   int speed;                       // Port speed in Mbps.
 19 |   int port;                        // Port number.
 20 |   float latency;                   // Network latency (unit not stated in this header)
 21 |   int maxComms;                    // Maximum number of comms we can create
 22 |   int maxRecvs;                    // Maximum number of grouped receives.
 23 |   ncclNetDeviceType netDeviceType; // Network offload type
 24 |   int netDeviceVersion;            // Version number for network offload
 25 | } ncclNetProperties_v7_t;
 26 | 
 27 | typedef struct {  // v7 point-to-point network plugin interface: adds device-handle outputs on connect/accept, getDeviceMr and irecvConsumed
 28 |   // Name of the network (mainly for logs)
 29 |   const char* name;
 30 |   // Initialize the network.
 31 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 32 |   // Return the number of adapters.
 33 |   ncclResult_t (*devices)(int* ndev);
 34 |   // Get various device properties.
 35 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
 36 |   // Create a receiving object and provide a handle to connect to it. The
 37 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 38 |   // between ranks to create a connection.
 39 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 40 |   // Connect to a handle and return a sending comm object for that peer.
 41 |   // This call must not block for the connection to be established, and instead
 42 |   // should return successfully with sendComm == NULL with the expectation that
 43 |   // it will be called again until sendComm != NULL.
 44 |   // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
 45 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
 46 |   // Finalize connection establishment after remote peer has called connect.
 47 |   // This call must not block for the connection to be established, and instead
 48 |   // should return successfully with recvComm == NULL with the expectation that
 49 |   // it will be called again until recvComm != NULL.
 50 |   // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
 51 |   ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
 52 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
 53 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note: size is an int here but a size_t in regMrDmaBuf.
 54 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
 55 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
 56 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
 57 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
 58 |   // Asynchronous send to a peer.
 59 |   // May return request == NULL if the call cannot be performed (or would block)
 60 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
 61 |   // Asynchronous recv from a peer; takes n entries in data, sizes, tags and mhandles.
 62 |   // May return request == NULL if the call cannot be performed (or would block)
 63 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
 64 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
 65 |   // visible to the GPU
 66 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
 67 |   // Test whether a request is complete. If sizes is not NULL, it returns the
 68 |   // number of bytes sent/received.
 69 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
 70 |   // Close and free send/recv/listen comm objects
 71 |   ncclResult_t (*closeSend)(void* sendComm);
 72 |   ncclResult_t (*closeRecv)(void* recvComm);
 73 |   ncclResult_t (*closeListen)(void* listenComm);
 74 | 
 75 |   // Copy the given mhandle to a dptr in a format usable by this plugin's device code
 76 |   ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
 77 | 
 78 |   // Notify the plugin that a recv has been completed by the device
 79 |   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
 80 | } ncclNet_v7_t;
 81 | 
 82 | // v7 collective-network plugin interface, kept for backwards compatibility
 83 | typedef struct {
 84 |   // Name of the collective network (mainly for logs)
 85 |   const char* name;
 86 |   // Initialize the collective network.
 87 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 88 |   // Return the number of adapters capable of doing collective operations.
 89 |   // If *ndev is set to 0, all other functions might be set to NULL.
 90 |   ncclResult_t (*devices)(int* ndev);
 91 |   // Get various device properties.
 92 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
 93 |   // Create a receiving object and provide a handle to connect to it. The
 94 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 95 |   // between ranks to create connections.
 96 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 97 |   // Create a group for collective operations. handles have been created
 98 |   // using listen() above. rank indicates caller's rank in the collective network.
 99 |   ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
100 |   // Returns whether a reduction operation on a data type is supported.
101 |   // *supported is set to 1 if supported, 0 otherwise.
102 |   ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
103 |   // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
104 |   ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
105 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
106 |   ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
107 |   ncclResult_t (*deregMr)(void* collComm, void* mhandle);
108 |   // Performs an asynchronous allreduce operation on the collective group.
109 |   // May return request == NULL if the call cannot be performed (or would block).
110 |   ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
111 |       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
112 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
113 |   // visible to the GPU
114 |   ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
115 |   // Test whether a request is complete. If size is not NULL, it returns the
116 |   // number of bytes sent/received.
117 |   ncclResult_t (*test)(void* request, int* done, int* size);
118 |   // Close and free collective/listen comm objects
119 |   ncclResult_t (*closeColl)(void* collComm);
120 |   ncclResult_t (*closeListen)(void* listenComm);
121 | } ncclCollNet_v7_t;
122 | 
123 | #endif // end include guard
124 | 


--------------------------------------------------------------------------------
/include/net_v8.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  */
  6 | 
  7 | #ifndef NCCL_NET_V8_H_
  8 | #define NCCL_NET_V8_H_
  9 | #include "net_device.h"
 10 | 
 11 | typedef struct {  // Device properties for the v8 interfaces: the v7 fields plus regIsGlobal
 12 |   char* name;                      // Used mostly for logging.
 13 |   char* pciPath;                   // Path to the PCI device in /sys.
 14 |   uint64_t guid;                   // Unique identifier for the NIC chip. Important for
 15 |                                    // cards with multiple PCI functions (Physical or virtual).
 16 |   int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
 17 |   int regIsGlobal;                 // regMr is not tied to a particular comm
 18 |   int speed;                       // Port speed in Mbps.
 19 |   int port;                        // Port number.
 20 |   float latency;                   // Network latency (unit not stated in this header)
 21 |   int maxComms;                    // Maximum number of comms we can create
 22 |   int maxRecvs;                    // Maximum number of grouped receives.
 23 |   ncclNetDeviceType netDeviceType; // Network offload type
 24 |   int netDeviceVersion;            // Version number for network offload
 25 | } ncclNetProperties_v8_t;
 26 | 
 27 | typedef struct {  // v8 point-to-point network plugin interface: same entry points as v7, with regMr taking a size_t size
 28 |   // Name of the network (mainly for logs)
 29 |   const char* name;
 30 |   // Initialize the network.
 31 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 32 |   // Return the number of adapters.
 33 |   ncclResult_t (*devices)(int* ndev);
 34 |   // Get various device properties.
 35 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
 36 |   // Create a receiving object and provide a handle to connect to it. The
 37 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 38 |   // between ranks to create a connection.
 39 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 40 |   // Connect to a handle and return a sending comm object for that peer.
 41 |   // This call must not block for the connection to be established, and instead
 42 |   // should return successfully with sendComm == NULL with the expectation that
 43 |   // it will be called again until sendComm != NULL.
 44 |   // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
 45 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
 46 |   // Finalize connection establishment after remote peer has called connect.
 47 |   // This call must not block for the connection to be established, and instead
 48 |   // should return successfully with recvComm == NULL with the expectation that
 49 |   // it will be called again until recvComm != NULL.
 50 |   // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
 51 |   ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
 52 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
 53 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
 54 |   ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
 55 |   /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
 56 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
 57 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
 58 |   // Asynchronous send to a peer. Note that size is still an int in this version.
 59 |   // May return request == NULL if the call cannot be performed (or would block)
 60 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
 61 |   // Asynchronous recv from a peer; takes n entries in data, sizes, tags and mhandles.
 62 |   // May return request == NULL if the call cannot be performed (or would block)
 63 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
 64 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
 65 |   // visible to the GPU
 66 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
 67 |   // Test whether a request is complete. If sizes is not NULL, it returns the
 68 |   // number of bytes sent/received.
 69 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
 70 |   // Close and free send/recv/listen comm objects
 71 |   ncclResult_t (*closeSend)(void* sendComm);
 72 |   ncclResult_t (*closeRecv)(void* recvComm);
 73 |   ncclResult_t (*closeListen)(void* listenComm);
 74 | 
 75 |   // Copy the given mhandle to a dptr in a format usable by this plugin's device code
 76 |   ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
 77 | 
 78 |   // Notify the plugin that a recv has been completed by the device
 79 |   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
 80 | } ncclNet_v8_t;
 81 | 
 82 | 
 83 | typedef struct {  // One scatter/gather element, as passed in the recvParts array of ncclCollNet_v8_t's iallgather
 84 |   void* mhandle;  // Memory handle for this element (presumably obtained from regMr/regMrDmaBuf -- confirm)
 85 |   void* address;  // Address of this element
 86 |   uint32_t size;  // Size of this element, limited to 32 bits in this version
 87 | } ncclNetSGE_v8_t;
 88 | 
 89 | typedef struct { // Table of function pointers for a collective network, v8 layout.
 90 |   // Name of the collective network (mainly for logs)
 91 |   const char* name;
 92 |   // Initialize the collective network.
 93 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 94 |   // Return the number of adapters capable of doing collective operations.
 95 |   // If ndev returns 0, all other functions might be set to NULL.
 96 |   ncclResult_t (*devices)(int* ndev);
 97 |   // Get various device properties.
 98 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
 99 |   // Create a receiving object and provide a handle to connect to it. The
100 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
101 |   // between ranks to create connections.
102 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
103 |   // Create a group for collective operations. handles have been created
104 |   // using listen() above. rank indicates caller's rank in the collective network.
105 |   ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
106 |   // Returns whether a reduction operation on a data type is supported.
107 |   // 1 for supported, 0 otherwise.
108 |   ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
109 |   // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
110 |   ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
111 |   /* DMA-BUF support: same arguments as regMr plus an offset and a file descriptor (fd) */
112 |   ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
113 |   ncclResult_t (*deregMr)(void* collComm, void* mhandle);
114 |   // Performs an asynchronous allreduce operation on the collective group.
115 |   // May return request == NULL if the call cannot be performed (or would block).
116 |   ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
117 |       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
118 |   ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
119 |                              size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
120 |                              void* sendMhandle, void** request); // NOTE(review): recvParts presumably holds nRecvParts entries; request handling presumably as for iallreduce -- confirm
121 |   ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
122 |                                  size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
123 |                                  ncclDataType_t dataType, ncclRedOp_t redOp,
124 |                                  void* recvMhandle, void** request); // NOTE(review): sendParts presumably holds nSendParts entries -- confirm
125 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
126 |   // visible to the GPU
127 |   ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
128 |   // Test whether a request is complete. If size is not NULL, it returns the
129 |   // number of bytes sent/received.
130 |   ncclResult_t (*test)(void* request, int* done, int* size);
131 |   // Close and free collective comm objects
132 |   ncclResult_t (*closeColl)(void* collComm);
133 |   ncclResult_t (*closeListen)(void* listenComm);
134 | } ncclCollNet_v8_t;
135 | 
136 | 
137 | #endif // end include guard
138 | 


--------------------------------------------------------------------------------
/include/net_v9.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  */
  6 | 
  7 | #ifndef NCCL_NET_V9_H_
  8 | #define NCCL_NET_V9_H_
  9 | #include "net_device.h"
 10 | 
 11 | // Max number of ncclNet objects which can live in the same process
 12 | #define NCCL_NET_MAX_PLUGINS 3
 13 | 
 14 | #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
 15 | 
 16 | typedef struct { // Fixed-capacity list of device indices; embedded in ncclNetProperties_v9_t as vProps.
 17 |   int ndevs; // NOTE(review): presumably the number of valid entries in devs -- confirm
 18 |   int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; // Capacity is NCCL_NET_MAX_DEVS_PER_NIC_V9 (defined as 4 above)
 19 | } ncclNetVDeviceProps_v9_t;
 20 | 
 21 | typedef struct { // Device properties filled in by getProperties() of ncclNet_v9_t / ncclCollNet_v9_t.
 22 |   char* name;                      // Used mostly for logging.
 23 |   char* pciPath;                   // Path to the PCI device in /sys.
 24 |   uint64_t guid;                   // Unique identifier for the NIC chip. Important for
 25 |                                    // cards with multiple PCI functions (Physical or virtual).
 26 |   int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
 27 |   int regIsGlobal;                 // regMr is not tied to a particular comm
 28 |   int forceFlush;                  // Force a flush on receives
 29 |   int speed;                       // Port speed in Mbps.
 30 |   int port;                        // Port number.
 31 |   float latency;                   // Network latency
 32 |   int maxComms;                    // Maximum number of comms we can create
 33 |   int maxRecvs;                    // Maximum number of grouped receives.
 34 |   ncclNetDeviceType netDeviceType; // Network offload type
 35 |   int netDeviceVersion;            // Version number for network offload
 36 |   ncclNetVDeviceProps_v9_t vProps; // Device index list (ndevs + devs[]) as declared above
 37 |   size_t maxP2pBytes;              // Max transfer size for point-to-point operations
 38 |   size_t maxCollBytes;             // Max transfer size for collective operations
 39 | } ncclNetProperties_v9_t;
 40 | 
 41 | typedef struct { // Table of function pointers for a network (send/recv to peers), v9 layout.
 42 |   // Name of the network (mainly for logs)
 43 |   const char* name;
 44 |   // Initialize the network.
 45 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
 46 |   // Return the number of adapters.
 47 |   ncclResult_t (*devices)(int* ndev);
 48 |   // Get various device properties.
 49 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
 50 |   // Create a receiving object and provide a handle to connect to it. The
 51 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
 52 |   // between ranks to create a connection.
 53 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
 54 |   // Connect to a handle and return a sending comm object for that peer.
 55 |   // This call must not block for the connection to be established, and instead
 56 |   // should return successfully with sendComm == NULL with the expectation that
 57 |   // it will be called again until sendComm != NULL.
 58 |   // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
 59 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
 60 |   // Finalize connection establishment after remote peer has called connect.
 61 |   // This call must not block for the connection to be established, and instead
 62 |   // should return successfully with recvComm == NULL with the expectation that
 63 |   // it will be called again until recvComm != NULL.
 64 |   // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
 65 |   ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
 66 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
 67 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
 68 |   ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
 69 |   /* DMA-BUF support: same arguments as regMr plus an offset and a file descriptor (fd) */
 70 |   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
 71 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
 72 |   // Asynchronous send to a peer. The size argument is a size_t in this version.
 73 |   // May return request == NULL if the call cannot be performed (or would block)
 74 |   ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
 75 |   // Asynchronous recv from a peer. The sizes array is size_t* in this version.
 76 |   // May return request == NULL if the call cannot be performed (or would block)
 77 |   ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
 78 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
 79 |   // visible to the GPU. Note that sizes is int* here, unlike irecv above.
 80 |   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
 81 |   // Test whether a request is complete. If sizes is not NULL, it returns the
 82 |   // number of bytes sent/received (as int values).
 83 |   ncclResult_t (*test)(void* request, int* done, int* sizes);
 84 |   // Close and free send/recv comm objects
 85 |   ncclResult_t (*closeSend)(void* sendComm);
 86 |   ncclResult_t (*closeRecv)(void* recvComm);
 87 |   ncclResult_t (*closeListen)(void* listenComm);
 88 | 
 89 |   // Copy the given mhandle to a dptr in a format usable by this plugin's device code
 90 |   ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
 91 | 
 92 |   // Notify the plugin that a recv has completed by the device
 93 |   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
 94 | 
 95 |   // Create a virtual NIC given the specified properties, which can be accessed at device index d
 96 |   ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); // NOTE(review): unversioned props type, although ncclNetVDeviceProps_v9_t is declared in this header -- confirm this is intended
 97 | } ncclNet_v9_t;
 98 | 
 99 | typedef struct { // Table of function pointers for a collective network, v9 layout.
100 |   // Name of the collective network (mainly for logs)
101 |   const char* name;
102 |   // Initialize the collective network.
103 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
104 |   // Return the number of adapters capable of doing collective operations.
105 |   // If ndev returns 0, all other functions might be set to NULL.
106 |   ncclResult_t (*devices)(int* ndev);
107 |   // Get various device properties.
108 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
109 |   // Create a receiving object and provide a handle to connect to it. The
110 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
111 |   // between ranks to create connections.
112 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
113 |   // Create a group for collective operations. handles have been created
114 |   // using listen() above. rank indicates caller's rank in the collective network.
115 |   ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
116 |   // Returns whether a reduction operation on a data type is supported.
117 |   // 1 for supported, 0 otherwise.
118 |   ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
119 |   // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
120 |   ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
121 |   /* DMA-BUF support: same arguments as regMr plus an offset and a file descriptor (fd) */
122 |   ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
123 |   ncclResult_t (*deregMr)(void* collComm, void* mhandle);
124 |   // Performs an asynchronous allreduce operation on the collective group (count is a size_t in this version).
125 |   // May return request == NULL if the call cannot be performed (or would block).
126 |   ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
127 |       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
128 |   ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts,
129 |                              size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
130 |                              void* sendMhandle, void** request); // NOTE(review): uses the v10 SGE type in the v9 table; this header only includes net_device.h -- confirm ncclNetSGE_v10_t is visible here
131 |   ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData,
132 |                                  size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
133 |                                  ncclDataType_t dataType, ncclRedOp_t redOp,
134 |                                  void* recvMhandle, void** request); // NOTE(review): same ncclNetSGE_v10_t dependency as iallgather
135 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
136 |   // visible to the GPU
137 |   ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
138 |   // Test whether a request is complete. If size is not NULL, it returns the
139 |   // number of bytes sent/received.
140 |   ncclResult_t (*test)(void* request, int* done, int* size);
141 |   // Close and free collective comm objects
142 |   ncclResult_t (*closeColl)(void* collComm);
143 |   ncclResult_t (*closeListen)(void* listenComm);
144 | 
145 |   // Create a virtual NIC given the specified properties, which can be accessed at device index d
146 |   ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); // NOTE(review): unversioned props type -- confirm it matches ncclNetVDeviceProps_v9_t
147 | } ncclCollNet_v9_t;
148 | 
149 | #endif // end include guard
150 | 


--------------------------------------------------------------------------------
/include/p2p_plugin.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #ifndef NCCL_P2P_PLUGIN_H_
 10 | #define NCCL_P2P_PLUGIN_H_
 11 | 
 12 | #include <stdint.h>
 13 | #include <unistd.h>
 14 | #include <assert.h>
 15 | 
 16 | #include "nccl.h"
 17 | #include "net.h"
 18 | #include "ibvwrap.h"
 19 | #include "param.h"
 20 | #include "socket.h"
 21 | #include "utils.h"
 22 | 
 23 | #define MAXNAMESIZE 64 // Size of ncclIbDev.devName; also the per-device share of MAX_MERGED_DEV_NAME
 24 | #define NCCL_NET_IB_MAX_RECVS 8 // Multiplier used by MAX_REQUESTS below
 25 | // We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive
 26 | #define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS)
 27 | // Currently disabled: static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion");
 28 | #define IB_DEVICE_SYSFS_FMT "/sys/class/infiniband/%s/device/%s" // printf-style pattern with two %s fields; presumably device name, then an entry under its device directory -- confirm
 29 | 
 30 | #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) // Link-layer value to display string; anything else maps to "UNSPECIFIED"
 31 | 
 32 | typedef enum nccl_p2p_plugin { // Identifies a point-to-point backend; returned by nccl_p2p_get_plugin_type().
 33 |   NCCL_P2P_IB, // First enumerator, value 0
 34 |   NCCL_P2P_UCX,
 35 |   NCCL_P2P_UCX_RMA,
 36 |   NCCL_P2P_UCX_UCT,
 37 |   NCCL_P2P_UCX_UCT_RD,
 38 |   NCCL_P2P_LAST // Equals the number of enumerators before it (5); presumably a sentinel, not a backend -- confirm
 39 | } nccl_p2p_plugin_t;
 40 | 
 41 | struct ncclIbMr { // One slot of ncclIbMrCache (see slots below).
 42 |   uintptr_t addr; // NOTE(review): presumably the start address of the registered region -- confirm
 43 |   size_t pages; // NOTE(review): presumably the region length counted in pages -- confirm
 44 |   int refs; // NOTE(review): presumably a reference count for sharing the registration -- confirm
 45 |   struct ibv_mr *mr; // Verbs memory-region object for this slot
 46 | };
 47 | 
 48 | struct ncclIbMrCache { // Array-backed collection of ncclIbMr entries; one instance is embedded in ncclIbDev as mrCache.
 49 |   struct ncclIbMr *slots; // Pointer to the entry array
 50 |   int capacity, population; // NOTE(review): presumably allocated slot count and slots in use -- confirm
 51 | };
 52 | 
 53 | #define NCCL_IB_MAX_DEVS_PER_NIC 4
 54 | #define MAX_MERGED_DEV_NAME ((MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC) // Fully parenthesized so the sum stays intact inside larger expressions
 55 | typedef struct ncclIbMergedDev { // A merged device: a device list (vProps), a speed and a combined name.
 56 |   ncclNetVDeviceProps_t vProps; // Device list for this merged device
 57 |   int speed; // NOTE(review): unit not visible here; presumably matches ncclIbDev.speed -- confirm
 58 |   char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
 59 | } __attribute__((aligned(64))) ncclIbMergedDev;
 60 | 
 61 | struct ncclIbStats { // Per-device statistics; embedded in ncclIbDev as stats and initialised through ncclIbStatsInit().
 62 |   int fatalErrorCount; // NOTE(review): presumably the number of fatal errors seen on the device -- confirm
 63 | };
 64 | 
 65 | struct ncclIbRequest { // State of one outstanding request; per-device arrays are sized by NCCL_IB_MAX_DEVS_PER_NIC.
 66 |   struct ncclIbNetCommBase* base; // Communicator base this request refers to (type declared elsewhere)
 67 |   int type; // NOTE(review): request kind; the value set is not visible in this header -- confirm
 68 |   struct ncclSocket* sock;
 69 |   int events[NCCL_IB_MAX_DEVS_PER_NIC]; // NOTE(review): presumably pending completion counts per device -- confirm
 70 |   struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; // One device-base pointer per device slot
 71 |   int nreqs;
 72 |   union { // send and recv share storage, so only one of them is meaningful for a given request
 73 |     struct {
 74 |       int size; // int-sized length (not size_t)
 75 |       void* data;
 76 |       uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC]; // NOTE(review): presumably local keys of the registered buffer, one per device -- confirm
 77 |       int offset;
 78 |     } send;
 79 |     struct {
 80 |       int* sizes; // NOTE(review): presumably where received sizes are reported -- confirm
 81 |     } recv;
 82 |   };
 83 | };
 84 | 
 85 | // Retain local RoCE address for error logging; embedded in ncclIbNetCommDevBase as gidInfo
 86 | struct ncclIbGidInfo {
 87 |   uint8_t link_layer; // NOTE(review): presumably an IBV_LINK_LAYER_* value, as consumed by NCCL_IB_LLSTR -- confirm
 88 |   union ibv_gid localGid; // Local GID value
 89 |   int32_t localGidIndex; // Index associated with localGid
 90 | };
 91 | 
 92 | typedef struct ncclIbNetCommDevBase { // Per-device part of a communicator; ncclIbRequest keeps pointers to these in devBases.
 93 |   int ibDevN; // NOTE(review): presumably an index into ncclIbDevs[] -- confirm
 94 |   struct ibv_pd* pd; // Verbs protection domain
 95 |   struct ibv_cq* cq; // Verbs completion queue
 96 |   uint64_t pad[2]; // 16 bytes of padding; purpose not visible in this header
 97 |   struct ncclIbGidInfo gidInfo; // Local address information kept for error logging
 98 | } ncclIbNetCommDevBase;
 99 | 
100 | typedef struct ncclIbDev { // Description of one IB device; instances live in ncclIbDevs[] and are 64-byte aligned.
101 |   pthread_mutex_t lock; // NOTE(review): which fields this mutex guards is not visible here -- confirm
102 |   int      device;
103 |   uint64_t guid;
104 |   uint8_t portNum;
105 |   uint8_t  link; // NOTE(review): presumably an IBV_LINK_LAYER_* value -- confirm
106 |   uint8_t  isSharpDev;
107 |   int      speed;
108 |   struct   ibv_context* context; // Verbs device context
109 |   int      pdRefs; // NOTE(review): presumably a reference count for pd -- confirm
110 |   struct ibv_pd*  pd;
111 |   char     devName[MAXNAMESIZE]; // Device name, at most MAXNAMESIZE bytes including any terminator
112 |   char     *pciPath;
113 |   char* virtualPciPath;
114 |   int      realPort;
115 |   int      maxQp;
116 |   float latency;
117 |   struct   ncclIbMrCache mrCache; // Memory-registration cache for this device
118 |   int ar; // ADAPTIVE_ROUTING
119 |   struct ibv_port_attr portAttr; // Verbs port attributes
120 |   struct ncclIbStats stats;
121 |   int dmaBufSupported; // NOTE(review): encoding (flag vs. tri-state) not visible here -- confirm
122 | } __attribute__((aligned(64))) ncclIbDev;
123 | 
124 | 
125 | #define MAX_IB_DEVS  32 // Capacity of ncclIbDevs[]
126 | #define MAX_IB_VDEVS (MAX_IB_DEVS*8) // Capacity of ncclIbMergedDevs[]; parenthesized so the product stays intact inside larger expressions
127 | extern struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
128 | extern struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
129 | /* Detect whether GDR can work on a given NIC with the current CUDA device
130 |  * Returns :
131 |  * ncclSuccess : GDR works
132 |  * ncclSystemError : no module or module loaded but not supported by GPU */
133 | ncclResult_t nccl_p2p_gdr_support();
134 | 
135 | ncclResult_t nccl_p2p_dmabuf_support(int dev); // NOTE(review): presumably the DMA-BUF counterpart of the GDR check above, for device index dev -- confirm
136 | 
137 | ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port); // path and real_port are output parameters (pointer-to-pointer / pointer)
138 | 
139 | ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int ncclNMergedIbDevs, int dev, ncclNetProperties_t* props); // Fills *props for device index dev
140 | 
141 | ncclResult_t nccl_p2p_ib_init(int *nDevs, int *nmDevs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr,
142 |                               pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction);
143 | 
144 | /* Convert value returned by ibv_query_port to actual link width */
145 | int nccl_p2p_ib_width(int width);
146 | 
147 | /* Convert value returned by ibv_query_port to actual link speed */
148 | int nccl_p2p_ib_speed(int speed);
149 | 
150 | int64_t ncclParamSharpMaxComms(); // NOTE(review): name follows the ncclParam##name pattern generated by NCCL_PARAM in param.h -- presumably defined that way; confirm
151 | 
152 | int64_t ncclParamIbMergeVfs(); // Same naming pattern as above
153 | 
154 | int64_t ncclParamIbMergeNics(); // Same naming pattern as above
155 | 
156 | int ncclIbRelaxedOrderingCapable(void);
157 | 
158 | nccl_p2p_plugin_t nccl_p2p_get_plugin_type(); // Returns one of the nccl_p2p_plugin_t enumerators
159 | 
160 | ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat);
161 | 
162 | ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props, int nDevs, int *nmDevs); // Same leading arguments (d, props) as the makeVDevice entries in the net tables
163 | 
164 | #endif
165 | 


--------------------------------------------------------------------------------
/include/param.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  *
 6 |  * See LICENSE.txt for license information
 7 |  ************************************************************************/
 8 | 
 9 | #ifndef NCCL_PARAM_H_
10 | #define NCCL_PARAM_H_
11 | 
12 | #include <stdint.h>
13 | 
14 | const char* userHomeDir(); // NOTE(review): presumably returns the current user's home directory path -- confirm
15 | void setEnvFile(const char* fileName);
16 | void initEnv();
17 | const char *ncclGetEnv(const char *name); // NOTE(review): presumably a getenv-style lookup; NULL-on-missing behaviour not visible here -- confirm
18 | 
19 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); // Called by NCCL_PARAM with ("NCCL_" env, deftVal, INT64_MIN, &cache)
20 | 
21 | #define NCCL_PARAM(name, env, deftVal) /* Defines int64_t ncclParam<name>() backed by a function-local cache */ \
22 |   int64_t ncclParam##name() { \
23 |     NCCL_STATIC_ASSERT(deftVal != INT64_MIN, "default value cannot be the uninitialized value."); \
24 |     static int64_t cache = INT64_MIN; /* INT64_MIN marks "not loaded yet" */ \
25 |     if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == INT64_MIN, 0)) { /* 0 instead of false: this header does not include <stdbool.h> */ \
26 |       ncclLoadParam("NCCL_" env, deftVal, INT64_MIN, &cache); /* variable name is "NCCL_" followed by env */ \
27 |     } \
28 |     return cache; \
29 |   }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/include/socket.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #ifndef NCCL_SOCKET_H_
 10 | #define NCCL_SOCKET_H_
 11 | 
 12 | #include "nccl.h"
 13 | #include <sys/socket.h>
 14 | #include <arpa/inet.h>
 15 | #include <netinet/tcp.h>
 16 | #include <netdb.h>
 17 | #include <fcntl.h>
 18 | #include <poll.h>
 19 | #include "stdbool.h"
 20 | #include "utils.h"
 21 | 
 22 | #define MAX_IFS 16 // NOTE(review): presumably the maximum number of interfaces handled -- confirm
 23 | #define MAX_IF_NAME_SIZE 16 // Size of nccl_uct_context_t.if_name in ucx_uct_lib.h
 24 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) // Sum of the NI_MAXHOST and NI_MAXSERV limits
 25 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL // 64-bit constant; same width as ncclSocket.magic
 26 | 
 27 | /* Common socket address storage structure for IPv4/IPv6; all members overlay the same storage */
 28 | union ncclSocketAddress {
 29 |   struct sockaddr sa; // Generic view
 30 |   struct sockaddr_in sin; // IPv4 view
 31 |   struct sockaddr_in6 sin6; // IPv6 view
 32 | };
 33 | 
 34 | enum ncclSocketState { // Values stored in ncclSocket.state; numbered consecutively from 0.
 35 |   ncclSocketStateNone = 0,
 36 |   ncclSocketStateInitialized = 1,
 37 |   ncclSocketStateAccepting = 2,
 38 |   ncclSocketStateAccepted = 3,
 39 |   ncclSocketStateConnecting = 4,
 40 |   ncclSocketStateConnectPolling = 5,
 41 |   ncclSocketStateConnected = 6,
 42 |   ncclSocketStateReady = 7,
 43 |   ncclSocketStateTerminating = 8,
 44 |   ncclSocketStateClosed = 9,
 45 |   ncclSocketStateError = 10,
 46 |   ncclSocketStateNum = 11 // Equals the number of states listed above
 47 | 
 48 | };
 49 | 
 50 | enum ncclSocketType { // Values stored in ncclSocket.type.
 51 |   ncclSocketTypeUnknown = 0,
 52 |   ncclSocketTypeBootstrap = 1,
 53 |   ncclSocketTypeProxy = 2,
 54 |   ncclSocketTypeNetIb = 4, // Value 3 is not assigned in this enum
 55 |   ncclSocketTypeRasNetwork = 5
 56 | };
 57 | 
 58 | struct ncclSocket { // State of one socket as used by the ncclSocket* functions declared below.
 59 |   int fd; // Set by a successful listen/connect/accept (per the comments on those functions)
 60 |   int acceptFd;
 61 |   int errorRetries;
 62 |   union ncclSocketAddress addr; // Local or remote address, depending on the operation (see function comments)
 63 |   volatile uint32_t* abortFlag; // Supplied through ncclSocketInit(); NOTE(review): presumably polled to interrupt blocking operations -- confirm
 64 |   int asyncFlag; // Supplied through ncclSocketInit()
 65 |   enum ncclSocketState state;
 66 |   int salen; // NOTE(review): presumably the length of the address stored in addr -- confirm
 67 |   uint64_t magic; // Supplied through ncclSocketInit()
 68 |   enum ncclSocketType type;
 69 |   int customRetry; // Supplied through ncclSocketInit()
 70 |   int finalizeCounter; // Used to keep track of initial handshake for async sockets.
 71 |   char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets.
 72 | };
 73 | 
 74 | const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm); // Formats addr into the caller-provided buf
 75 | ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); // Parses ip_port_pair into *ua
 76 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
 77 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); // NOTE(review): presumably returns the number of interfaces found -- confirm
 78 | 
 79 | // Initialize a socket
 80 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry);
 81 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
 82 | ncclResult_t ncclSocketListen(struct ncclSocket* sock);
 83 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
 84 | // Connect to sock->addr. sock->fd is set after a successful call.
 85 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
 86 | // Return socket connection state.
 87 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
 88 | // Accept an incoming connection from ulistenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
 89 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
 90 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
 91 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
 92 | 
 93 | #define NCCL_SOCKET_SEND 0
 94 | #define NCCL_SOCKET_RECV 1
 95 | 
 96 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed); // NOTE(review): op is presumably NCCL_SOCKET_SEND or NCCL_SOCKET_RECV -- confirm
 97 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); // NOTE(review): same op convention presumed -- confirm
 98 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
 99 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
100 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
101 | ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait);
102 | #endif
103 | 


--------------------------------------------------------------------------------
/include/timer.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  *
 6 |  * See LICENSE.txt for license information
 7 |  ************************************************************************/
 8 | 
 9 | #ifndef NCCL_TIMER_H_
10 | #define NCCL_TIMER_H_
11 | #if ENABLE_TIMER
12 | #include <unistd.h>
13 | #include <sys/time.h>
14 | #include <x86intrin.h>
15 | static double freq = -1; // TSC cycles per microsecond; -1 means "not calibrated yet" (checked in gettime)
16 | static void calibrate() { // Measures freq by comparing an __rdtsc() delta with a gettimeofday() delta.
17 |   struct timeval tv;
18 |   gettimeofday(&tv, NULL);
19 |   uint64_t timeCycles = __rdtsc(); // Cycle counter at start
20 |   double time = - tv.tv_sec*1E6 - tv.tv_usec; // Negated start time in microseconds; the end time is added below
21 |   uint64_t total = 0ULL;
22 |   for (int i=0; i<10000; i++) total += __rdtsc(); // Work between the two samples; total is not read afterwards
23 |   gettimeofday(&tv, NULL);
24 |   timeCycles = __rdtsc() - timeCycles; // Elapsed cycles
25 |   time += tv.tv_sec*1E6 + tv.tv_usec; // Elapsed microseconds
26 |   freq = timeCycles/time;
27 | }
28 | static inline double gettime() { // Current __rdtsc() value divided by freq, i.e. a timestamp in microseconds.
29 |   if (freq == -1) calibrate(); // Calibrate on first use
30 |   return __rdtsc()/freq;
31 | }
32 | static uint64_t counts[8]; // Number of TIME_START calls per timer index (0..7)
33 | static double times[8]; // Accumulated elapsed time per index, in gettime() units
34 | static double startTimes[8]; // Timestamp of the latest TIME_START per index
35 | #define TIME_START(index) do { /* Count one sample and record its start timestamp */ \
36 |   counts[index]++; \
37 |   startTimes[index] = gettime(); \
38 | } while (0)
39 | 
40 | #define TIME_STOP(index) do { /* Add the time elapsed since the matching TIME_START to times[index] */ \
41 |   times[index] += gettime() - startTimes[index]; \
42 | } while (0)
43 | 
44 | #define TIME_CANCEL(index) do { /* Undo the sample count added by TIME_START; times[index] is left untouched */ \
45 |   counts[index]--; \
46 | } while (0)
47 | 
48 | #define TIME_PRINT(name) do { /* Print total/count = average for every index with samples, then reset the counts */ \
49 |   printf("%s stats", name); \
50 |   for (int i=0; i<8; i++) { \
51 |     if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); /* counts[] is uint64_t: cast so the conversion matches the argument type */ \
52 |     counts[i] = 0; /* NOTE(review): times[i] is not reset here, so later averages include earlier totals -- confirm intended */ \
53 |   } \
54 |   printf("\n"); \
55 | } while (0)
56 | #else
57 | #define TIME_START(index) do {} while(0) // No-op variants used when ENABLE_TIMER is false
58 | #define TIME_STOP(index) do {} while(0)
59 | #define TIME_CANCEL(index) do {} while(0)
60 | #define TIME_PRINT(name) // Expands to nothing
61 | #endif
62 | #endif
63 | 


--------------------------------------------------------------------------------
/include/ucx_uct_lib.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #ifndef NCCL_UCX_UCT_LIB_H_
 10 | #define NCCL_UCX_UCT_LIB_H_
 11 | 
 12 | #include <assert.h>
 13 | #include <stdint.h>
 14 | #include <unistd.h>
 15 | 
 16 | #include "p2p_plugin.h"
 17 | #include "socket.h"
 18 | 
 19 | #include <uct/api/uct.h>
 20 | 
 21 | #define NCCL_UCX_UCT_MAX_RECVS       NCCL_NET_IB_MAX_RECVS // Same limit as the IB plugin (from p2p_plugin.h)
 22 | #define NCCL_UCT_LISTEN_HANDLE_MAGIC 0x43cf19ed91abdb85 // 64-bit constant; presumably stored in nccl_uct_listen_handle_t.magic -- confirm
 23 | #define NCCL_UCT_REG_ALIGN           4096 // NOTE(review): presumably the alignment applied to memory registrations -- confirm
 24 | 
 25 | typedef enum { // Message identifiers with explicitly chosen consecutive values 14..17.
 26 |   NCCL_UCT_AM_RTR = 14, /* Use particular values */
 27 |   NCCL_UCT_AM_ATP = 15,
 28 |   NCCL_UCT_AM_RTS = 16,
 29 |   NCCL_UCT_AM_ATS = 17
 30 | } nccl_uct_am_type_t;
 31 | 
 32 | typedef enum { // Connection-establishment states; stored in nccl_uct_stage_t.state.
 33 |   NCCL_UCT_START = 0,
 34 |   NCCL_UCT_CONNECT,
 35 |   NCCL_UCT_ACCEPT,
 36 |   NCCL_UCT_RECEIVE_REMOTE, /* Acceptor receives ep addr/remote communicator */
 37 |   NCCL_UCT_RECEIVE_ADDR,
 38 |   NCCL_UCT_RX_READY,
 39 |   NCCL_UCT_DONE // Last state in the list
 40 | } nccl_uct_state_t;
 41 | 
 42 | /* UCT EP address to exchange and connect to */
 43 | typedef struct {
 44 |   uint8_t dev_addr_size; // Size fields are 8-bit, so each is at most 255
 45 |   uint8_t ep_addr_size;
 46 |   uint8_t data[64]; // Fixed 64-byte payload; NOTE(review): presumably device address followed by EP address -- confirm
 47 | } nccl_uct_ep_addr_t;
 48 | 
 49 | typedef struct { // UCT interface together with its memory domain, component, addresses and size limits.
 50 |   uct_iface_h     iface;
 51 |   uct_md_h        md;
 52 |   uct_component_h comp;
 53 |   void            *addr; // Address buffer; its length is addr_size
 54 |   size_t          addr_size;
 55 |   void            *dev_addr; // Device address buffer; its length is dev_addr_size
 56 |   size_t          dev_addr_size;
 57 |   size_t          ep_addr_size;
 58 |   size_t          rkey_packed_size;
 59 | 
 60 |   size_t          am_max_short; // NOTE(review): presumably the largest short active message the interface accepts -- confirm
 61 |   size_t          min_get_zcopy; // NOTE(review): presumably the smallest size allowed for zero-copy GET -- confirm
 62 | } nccl_uct_iface_t;
 63 | 
 64 | struct nccl_uct_context;
 65 | 
 66 | typedef struct nccl_uct_worker { // Node of a singly linked list of workers (head: nccl_uct_context_t.worker_list).
 67 |   struct nccl_uct_worker *next; // Next worker in the list
 68 |   struct {
 69 |     pthread_t thread;
 70 |     int       dev;
 71 |   } id; // (thread, dev) pair; NOTE(review): presumably the lookup key for reusing a worker -- confirm
 72 | 
 73 |   int                     count; // NOTE(review): presumably a reference/user count -- confirm
 74 |   ucs_async_context_t     *async;
 75 |   uct_worker_h            worker;
 76 |   nccl_uct_iface_t        *uct_iface;
 77 |   struct nccl_uct_context *context; // Owning plugin context
 78 | } nccl_uct_worker_t;
 79 | 
 80 | typedef struct { // UCT endpoint plus its address; ends with a flexible array member.
 81 |   uct_ep_h         ep;
 82 |   uct_ep_addr_t    *addr; // NOTE(review): presumably points into data[] below -- confirm
 83 |   size_t           addr_size;
 84 |   nccl_uct_iface_t *uct_iface;
 85 |   uint8_t          data[]; // Flexible array member: storage is allocated together with the struct
 86 | } nccl_uct_ep_t;
 87 | 
 88 | /* All the remote addresses for the communicator (currently a single entry) */
 89 | typedef struct nccl_uct_comm_addr {
 90 |   nccl_uct_ep_addr_t rma;
 91 |   /* TODO: Add multi-QP here */
 92 | } nccl_uct_comm_addr_t;
 93 | 
 94 | /* Either Receiver or Sender communicator, connected to one peer */
 95 | typedef struct nccl_uct_comm {
 96 |   struct ncclSocket       sock; // Socket embedded by value
 97 |   struct nccl_uct_context *context;
 98 |   int                     dev;
 99 | 
100 |   nccl_uct_worker_t       *uct_worker;
101 |   nccl_uct_iface_t        *uct_iface;
102 |   nccl_uct_ep_t           *uct_ep;
103 | 
104 |   struct nccl_uct_comm_remote {
105 |     nccl_uct_comm_addr_t       addr;  /* Remote addresses */
106 |     const struct nccl_uct_comm *comm; /* Cookie received in connect */
107 |   } remote;
108 | 
109 |   /* Local GET on current device */
110 |   struct {
111 |     int                enabled; // Non-zero presumably means the flush path is set up -- confirm
112 |     nccl_uct_ep_t      *uct_ep; /* Locally read from HCA */
113 |     nccl_uct_ep_addr_t addr;
114 | 
115 |     uint8_t            *mem; /* Dummy memory to read into */
116 |     uct_mem_h          memh; // NOTE(review): presumably the registration handle for mem -- confirm
117 |   } gpu_flush;
118 | } nccl_uct_comm_t;
119 | 
120 | /* State tracking used while connecting/accepting only; embedded in the listen handle and the listen communicator */
121 | typedef struct {
122 |   nccl_uct_state_t state;
123 |   nccl_uct_comm_t  *comm;  /* current communicator being created */
124 |   int              offset; /* for Socket reading */
125 |   int              ready;  /* accept must complete after connect */
126 | } nccl_uct_stage_t;
127 | 
128 | /* Memory registration handle in NCCL UCT plugin returned by ->regMR() */
129 | typedef struct {
130 |   uct_mem_h         memh;
131 |   nccl_uct_comm_t   *comm; // Communicator associated with this registration
132 |   uct_rkey_bundle_t bundle;
133 |   uint8_t           rkey[]; // Flexible array member; NOTE(review): presumably holds the packed rkey -- confirm
134 | } nccl_uct_memh_t;
135 | 
136 | /* On-the-wire handle passed OOB by NCCL from listener to connector */
137 | typedef struct {
138 |   uint64_t                  magic; // NOTE(review): presumably NCCL_UCT_LISTEN_HANDLE_MAGIC -- confirm
139 |   struct {
140 |     union ncclSocketAddress addr;
141 |     uint32_t                id; // Same type as nccl_uct_listen_comm_t.id
142 |   } listener;
143 |   nccl_uct_comm_t           *comm; /* Created communicator in accept */
144 |   nccl_uct_stage_t          stage; /* Used by connector */
145 | } nccl_uct_listen_handle_t;
146 | 
147 | /* Communicator while listening to remote ranks */
148 | typedef struct {
149 |   struct ncclSocket       sock;
150 |   struct nccl_uct_context *context;
151 |   int                     dev;
152 |   uint32_t                id;
153 |   nccl_uct_worker_t       *uct_worker;
154 |   nccl_uct_comm_t         *comm;
155 | 
156 |   /* Used by acceptor */
157 |   nccl_uct_stage_t        stage;
158 | } nccl_uct_listen_comm_t;
159 | 
160 | /* Global state of the plugin */
161 | typedef struct nccl_uct_context {
162 |   /* Name of the UCT transport to use */
163 |   const char              *tl_name;
164 | 
165 |   /* IB devices available */
166 |   int                     dev_count;
167 |   int                     merge_dev_count;
168 | 
169 |   /* Callbacks used by the common code to set up communicators */
170 |   struct nccl_uct_ops {
171 |     ncclResult_t (*comm_alloc)(nccl_uct_comm_t **comm);
172 |     ncclResult_t (*comm_init)(nccl_uct_comm_t *comm,
173 |                               struct nccl_uct_context *context,
174 |                               nccl_uct_worker_t *worker, int dev,
175 |                               const nccl_uct_comm_t *remote_comm);
176 |     ncclResult_t (*iface_set)(nccl_uct_iface_t *uct_iface);
177 |   } ops;
178 | 
179 |   /* Max sizes needed */
180 |   size_t                  am_short_size;
181 |   size_t                  rkey_size;
182 | 
183 |   /* Interface name/address of the OOB socket for accepting/connecting */
184 |   char                    if_name[MAX_IF_NAME_SIZE];
185 |   union ncclSocketAddress if_addr;
186 | 
187 |   /* Number of listeners created */
188 |   uint32_t                listener_count;
189 | 
190 |   /* List of created workers */
191 |   nccl_uct_worker_t       *worker_list;
192 | } nccl_uct_context_t;
193 | 
194 | #define UCXCHECK(statement, failure_action, message, ...) \
195 |   do { \
196 |     ucs_status_t _status = statement; \
197 |     if (_status != UCS_OK) { \
198 |       WARN("Failed: " message ": %s", ##__VA_ARGS__, \
199 |            ucs_status_string(_status)); \
200 |       failure_action; \
201 |     } \
202 |   } while (0)
203 | 
204 | extern nccl_uct_context_t context;
205 | 
206 | /* Library functions */
207 | ncclResult_t nccl_uct_iface_set_handler(nccl_uct_iface_t *uct_iface, int id,
208 |                                         uct_am_callback_t callback);
209 | ncclResult_t nccl_uct_devices(int *ndev);
210 | ncclResult_t nccl_uct_comm_init(nccl_uct_comm_t *comm,
211 |                                 nccl_uct_context_t *context,
212 |                                 nccl_uct_worker_t *worker, int dev,
213 |                                 const nccl_uct_comm_t *remote_comm);
214 | void nccl_uct_comm_deinit(nccl_uct_comm_t *comm);
215 | int nccl_uct_flush_index(nccl_uct_comm_t *base, int *sizes, int n);
216 | ncclResult_t nccl_uct_flush(nccl_uct_comm_t *base_comm, void *data, int size,
217 |                             nccl_uct_memh_t *uct_memh,
218 |                             uct_completion_t *completion, void **request);
219 | void nccl_uct_empty_callback(uct_completion_t *comp);
220 | 
221 | /* NCCL common plugin callbacks */
222 | ncclResult_t nccl_uct_listen(int dev, void *listen_handle, void **listen_comm);
223 | ncclResult_t nccl_uct_accept(void *listen_comm, void **recv_comm,
224 |                              ncclNetDeviceHandle_v7_t **recvDevComm);
225 | ncclResult_t nccl_uct_connect(int dev, ncclNetCommConfig_t* config, void *listen_handle, void **send_comm,
226 |                               ncclNetDeviceHandle_t **sendDevComm);
227 | ncclResult_t nccl_uct_close_listen(void *listen_comm);
228 | ncclResult_t nccl_uct_reg_mr_dmabuf(void *reg_comm, void *data, size_t size,
229 |                                     int type, uint64_t offset, int fd,
230 |                                     void **mhandle);
231 | ncclResult_t nccl_uct_reg_mr(void *reg_comm, void *data, size_t size, int type,
232 |                              void **mhandle);
233 | ncclResult_t nccl_uct_dereg_mr(void *dereg_comm, void *mhandle);
234 | 
235 | /* Compatibility callback */
236 | ncclResult_t nccl_uct_get_properties_v9(int dev,
237 |                                         ncclNetProperties_v9_t *props_v9);
238 | ncclResult_t nccl_uct_get_properties_v8(int dev,
239 |                                         ncclNetProperties_v8_t *props_v8);
240 | ncclResult_t nccl_uct_get_properties_v7(int dev,
241 |                                         ncclNetProperties_v7_t *props_v7);
242 | ncclResult_t nccl_uct_reg_mr_v7(void *comm, void *data, int size, int type,
243 |                                 void **mhandle);
244 | ncclResult_t nccl_uct_get_properties_v6(int dev,
245 |                                         ncclNetProperties_v6_t *props_v6);
246 | ncclResult_t nccl_uct_connect_v9(int dev, void *listen_handle, void **send_comm,
247 |                               ncclNetDeviceHandle_t **sendDevComm);
248 | ncclResult_t nccl_uct_connect_v6(int dev, void *handle, void **send_comm);
249 | ncclResult_t nccl_uct_accept_v6(void *listen_comm, void **recv_comm);
250 | ncclResult_t nccl_uct_get_properties(int dev, ncclNetProperties_t *props);
251 | 
252 | 
253 | #define NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_func, get_properties_func, \
254 |                              connect_func, accept_func, reg_mr_func, \
255 |                              isend_func, irecv_func) \
256 |   { \
257 |     .name          = plugin_name, \
258 |     .init          = prefix##_##init_func, \
259 |     .devices       = nccl_uct_devices, \
260 |     .getProperties = get_properties_func, \
261 |     .listen        = nccl_uct_listen, \
262 |     .connect       = connect_func, \
263 |     .accept        = accept_func, \
264 |     .regMr         = reg_mr_func, \
265 |     .regMrDmaBuf   = nccl_uct_reg_mr_dmabuf, \
266 |     .deregMr       = nccl_uct_dereg_mr, \
267 |     .isend         = prefix##_##isend_func, \
268 |     .irecv         = prefix##_##irecv_func, \
269 |     .iflush        = prefix##_iflush, \
270 |     .test          = prefix##_test, \
271 |     .closeSend     = prefix##_close, \
272 |     .closeRecv     = prefix##_close, \
273 |     .closeListen   = nccl_uct_close_listen \
274 |   }
275 | 
276 | #define NCCL_UCT_PLUGIN_V10(plugin_name, prefix) \
277 |   NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init, nccl_uct_get_properties, \
278 |                        nccl_uct_connect, nccl_uct_accept, nccl_uct_reg_mr, \
279 |                        isend, irecv)
280 | 
281 | #define NCCL_UCT_PLUGIN_V9(plugin_name, prefix) \
282 |   NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v9, \
283 |                        nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr, \
284 |                        isend_v9, irecv_v9)
285 | 
286 | #define NCCL_UCT_PLUGIN_V8(plugin_name, prefix) \
287 |   NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v8, \
288 |                        nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr, \
289 |                        isend_v8, irecv_v8)
290 | 
291 | #define NCCL_UCT_PLUGIN_V7(plugin_name, prefix) \
292 |   NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v7, \
293 |                        nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr_v7, \
294 |                        isend_v8, irecv_v8)
295 | 
296 | #define NCCL_UCT_PLUGIN_V6(plugin_name, prefix) \
297 |   NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9,  nccl_uct_get_properties_v6, \
298 |                        nccl_uct_connect_v6, nccl_uct_accept_v6, \
299 |                        nccl_uct_reg_mr_v7, isend_v8, irecv_v8)
300 | 
301 | #define NCCL_UCT_PLUGIN_V5(plugin_name, prefix) \
302 |   { \
303 |     .name          = plugin_name, \
304 |     .init          = prefix##_init_v9, \
305 |     .devices       = nccl_uct_devices, \
306 |     .getProperties = nccl_uct_get_properties_v6, \
307 |     .listen        = nccl_uct_listen, \
308 |     .connect       = nccl_uct_connect_v6, \
309 |     .accept        = nccl_uct_accept_v6, \
310 |     .regMr         = nccl_uct_reg_mr_v7, \
311 |     .deregMr       = nccl_uct_dereg_mr, \
312 |     .isend         = prefix##_isend_v8, \
313 |     .irecv         = prefix##_irecv_v8, \
314 |     .iflush        = prefix##_iflush, \
315 |     .test          = prefix##_test, \
316 |     .closeSend     = prefix##_close, \
317 |     .closeRecv     = prefix##_close, \
318 |     .closeListen   = nccl_uct_close_listen \
319 |   }
320 | 
321 | #endif /* NCCL_UCX_UCT_LIB_H_ */
322 | 


--------------------------------------------------------------------------------
/include/ucx_uct_ring.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #ifndef NCCL_UCX_UCT_RING_H_
 10 | #define NCCL_UCX_UCT_RING_H_
 11 | 
 12 | #include "nccl.h"
 13 | #include <assert.h>
 14 | 
 15 | #define NCCL_UCT_RING_SIZE (1 << 7)
 16 | #define NCCL_UCT_RING_MASK (NCCL_UCT_RING_SIZE - 1)
 17 | 
 18 | typedef struct nccl_uct_ring {
 19 |   unsigned first;      /* Oldest index still in use; masked on slot access */
 20 |   unsigned last;       /* Next index to append at; equals first when empty */
 21 |   unsigned size;       /* Not set by nccl_uct_ring_init() */
 22 |   unsigned entry_size; /* Size in bytes of one element stored in entry */
 23 |   int      tag[NCCL_UCT_RING_SIZE]; /* Tag per slot, INT_MAX when unused */
 24 |   void     *entry;     /* Storage for NCCL_UCT_RING_SIZE entries */
 25 | } nccl_uct_ring_t;
 26 | 
 27 | static inline ncclResult_t nccl_uct_ring_init(nccl_uct_ring_t *ring,
 28 |                                               unsigned entry_size) {
 29 |   int i;
 30 | 
 31 |   ring->first      = 0;
 32 |   ring->last       = 0;
 33 |   ring->entry_size = entry_size;
 34 |   ring->entry      = malloc((size_t)entry_size * NCCL_UCT_RING_SIZE);
 35 |   if (ring->entry == NULL) {
 36 |     /* Nothing was allocated, so there is nothing to free here */
 37 |     return ncclSystemError;
 38 |   }
 39 | 
 40 |   for (i = 0; i < NCCL_UCT_RING_SIZE; i++) {
 41 |     ring->tag[i] = INT_MAX; /* INT_MAX marks the slot as unused */
 42 |   }
 43 |   return ncclSuccess;
 44 | }
 45 | 
 46 | static inline void nccl_uct_ring_deinit(nccl_uct_ring_t *ring) {
 47 |   free(ring->entry); /* Releases the storage set up by nccl_uct_ring_init() */
 48 | }
 49 | 
 50 | static inline void *nccl_uct_ring_get_entry(nccl_uct_ring_t *ring, unsigned i) {
 51 |   return (uint8_t*)ring->entry + (ring->entry_size * (i & NCCL_UCT_RING_MASK));
 52 | } /* i may be an unmasked index: it is wrapped with NCCL_UCT_RING_MASK */
 53 | 
 54 | static inline void nccl_uct_ring_append(nccl_uct_ring_t *ring, int tag,
 55 |                                         void *data, size_t len) {
 56 |   int slot = ring->last & NCCL_UCT_RING_MASK;
 57 | 
 58 |   ring->last += 1;
 59 | 
 60 |   /* The advanced tail must not land on the head's slot */
 61 |   assert(((ring->last ^ ring->first) & NCCL_UCT_RING_MASK) != 0);
 62 |   assert(ring->tag[slot] == INT_MAX);
 63 |   assert(len == ring->entry_size);
 64 | 
 65 |   ring->tag[slot] = tag;
 66 |   memcpy(nccl_uct_ring_get_entry(ring, slot), data, len);
 67 | }
 68 | 
 69 | static inline int nccl_uct_ring_is_empty(const nccl_uct_ring_t *ring) {
 70 |   return ring->first == ring->last; /* Compares the unmasked indices */
 71 | }
 72 | 
 73 | static inline void nccl_uct_ring_consume(nccl_uct_ring_t *ring, unsigned i) {
 74 |   unsigned slot = i & NCCL_UCT_RING_MASK;
 75 | 
 76 |   assert(ring->tag[slot] != INT_MAX);
 77 |   ring->tag[slot] = INT_MAX;
 78 | 
 79 |   if (i != ring->first) {
 80 |     return; /* Not the oldest entry: the head cannot move yet */
 81 |   }
 82 | 
 83 |   /* The head was released: advance it past every slot that has
 84 |    * already been consumed */
 85 |   while ((ring->first != ring->last) &&
 86 |          (ring->tag[ring->first & NCCL_UCT_RING_MASK] == INT_MAX)) {
 87 |     ring->first++;
 88 |   }
 89 | }
 90 | 
 91 | static inline unsigned nccl_uct_ring_find(nccl_uct_ring_t *ring, int tag) {
 92 |   unsigned pos;
 93 | 
 94 |   assert(tag != INT_MAX);
 95 | 
 96 |   /* Oldest-first scan; ring->last doubles as the "not found" result */
 97 |   for (pos = ring->first; pos != ring->last; pos++) {
 98 |     if (ring->tag[pos & NCCL_UCT_RING_MASK] == tag) {
 99 |       break;
100 |     }
101 |   }
102 |   return pos;
103 | }
104 | 
105 | #endif /* NCCL_UCX_UCT_RING_H_ */
106 | 


--------------------------------------------------------------------------------
/include/utils.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 |  * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: BSD-3-Clause
 5 |  *
 6 |  * See LICENSE.txt for license information
 7 |  ************************************************************************/
 8 | 
 9 | #ifndef NCCL_UTILS_H_
10 | #define NCCL_UTILS_H_
11 | 
12 | #include "nccl.h"
13 | #include <stdint.h>
14 | 
15 | #define NCCL_STATIC_ASSERT(_cond, _msg) \
16 |     switch(0) {case 0:case (_cond):;}
17 | 
18 | ncclResult_t ncclIbMalloc(void** ptr, size_t size);
19 | ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size);
20 | ncclResult_t getHostName(char* hostname, int maxlen);
21 | uint64_t getHostHash();
22 | uint64_t getPidHash();
23 | 
24 | struct netIf {
25 |   char prefix[64];
26 |   int port;
27 | };
28 | 
29 | int parseStringList(const char* string, struct netIf* ifList, int maxList);
30 | int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact);
31 | const char *get_plugin_lib_path();
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/m4/sharp.m4:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2001-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # See file LICENSE for terms.
  4 | #
  5 | 
  6 | AC_DEFUN([CHECK_SHARP],[
  7 | 
  8 | AS_IF([test "x$sharp_checked" != "xyes"],[
  9 | 
 10 |     sharp_happy="no"
 11 | 
 12 |     AC_ARG_WITH([sharp],
 13 |             [AS_HELP_STRING([--with-sharp=(DIR)], [Enable the use of SHARP (default is guess).])],
 14 |             [], [with_sharp=guess])
 15 | 
 16 |     AS_IF([test "x$with_sharp" != "xno"],
 17 |     [
 18 |         save_CPPFLAGS="$CPPFLAGS"
 19 |         save_CFLAGS="$CFLAGS"
 20 |         save_LDFLAGS="$LDFLAGS"
 21 | 
 22 |         AS_IF([test ! -z "$with_sharp" -a "x$with_sharp" != "xyes" -a "x$with_sharp" != "xguess"],
 23 |         [
 24 |             check_sharp_dir="$with_sharp"
 25 |             check_sharp_libdir="$with_sharp/lib"
 26 |             CPPFLAGS="-I$with_sharp/include $save_CPPFLAGS"
 27 |             LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
 28 |         ])
 29 | 
 30 |         AS_IF([test "x$check_sharp_dir" = "x" -a "x$HPCX_SHARP_DIR" != "x"],
 31 |         [
 32 |             check_sharp_dir="$HPCX_SHARP_DIR"
 33 |             check_sharp_libdir="$HPCX_SHARP_DIR/lib"
 34 |             CPPFLAGS="-I$check_sharp_dir/include $save_CPPFLAGS"
 35 |             LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
 36 |         ])
 37 | 
 38 |         AS_IF([test "x$check_sharp_dir" = "x" -a -d "/opt/mellanox/sharp/"],
 39 |         [
 40 |             check_sharp_dir="/opt/mellanox/sharp/"
 41 |             check_sharp_libdir="/opt/mellanox/sharp/lib"
 42 |             CPPFLAGS="-I$check_sharp_dir/include $save_CPPFLAGS"
 43 |             LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
 44 |         ])
 45 | 
 46 | 
 47 |         AS_IF([test ! -z "$with_sharp_libdir" -a "x$with_sharp_libdir" != "xyes"],
 48 |         [
 49 |             check_sharp_libdir="$with_sharp_libdir"
 50 |             LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
 51 |         ])
 52 | 
 53 |         AC_CHECK_HEADERS([sharp/api/sharp_coll.h],
 54 |         [
 55 |             AC_CHECK_LIB([sharp_coll], [sharp_coll_init],
 56 |             [
 57 |                 sharp_happy="yes"
 58 |             ],
 59 |             [
 60 |                 sharp_happy="no"
 61 |             ])
 62 |         ],
 63 |         [
 64 |             sharp_happy="no"
 65 |         ])
 66 | 
 67 |         AS_IF([test "x$sharp_happy" = "xyes"],
 68 |         [
 69 |             AS_IF([test "x$check_sharp_dir" != "x"],
 70 |             [
 71 |                 AC_MSG_RESULT([SHARP dir: $check_sharp_dir])
 72 |                 AC_SUBST(SHARP_CPPFLAGS, "-I$check_sharp_dir/include/")
 73 |             ])
 74 | 
 75 |             AS_IF([test "x$check_sharp_libdir" != "x"],
 76 |             [
 77 |                 AC_SUBST(SHARP_LDFLAGS, "-L$check_sharp_libdir")
 78 |             ])
 79 | 
 80 |             AC_SUBST(SHARP_LIBADD, "-lsharp_coll")
 81 |             AC_CHECK_DECLS([SHARP_DTYPE_BFLOAT16], [AC_DEFINE([HAVE_SHARP_DTYPE_BFLOAT16_UINT8_INT8], 1,
 82 |                                                     [SHARP v3 datatypes : bfloat16, uint8, int8])], [],
 83 |                            [[#include <sharp/api/sharp_coll.h>]])
 84 |             AC_CHECK_DECLS([sharp_coll_reg_mr_v2], [], [], [[#include <sharp/api/sharp_coll.h>]])
 85 | 
 86 |         ],
 87 |         [
 88 |             AS_IF([test "x$with_sharp" != "xguess"],
 89 |             [
 90 |                 AC_MSG_ERROR([SHARP support is requested but SHARP packages cannot be found])
 91 |             ],
 92 |             [
 93 |                 AC_MSG_WARN([SHARP not found])
 94 |             ])
 95 |         ])
 96 | 
 97 |         CFLAGS="$save_CFLAGS"
 98 |         CPPFLAGS="$save_CPPFLAGS"
 99 |         LDFLAGS="$save_LDFLAGS"
100 | 
101 |     ],
102 |     [
103 |         AC_MSG_WARN([SHARP was explicitly disabled])
104 |     ])
105 | 
106 |     sharp_checked=yes
107 |     AM_CONDITIONAL([HAVE_SHARP_PLUGIN], [test "x$sharp_happy" != xno])
108 | ])
109 | 
110 | ])
111 | 


--------------------------------------------------------------------------------
/m4/ucx.m4:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2001-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # See file LICENSE for terms.
 4 | #
 5 | 
 6 | AC_DEFUN([CHECK_UCX],[
 7 | 
 8 | AS_IF([test "x$ucx_checked" != "xyes"],[
 9 | 
10 |     ucx_happy="no"
11 | 
12 |     AC_ARG_WITH([ucx],
13 |             [AS_HELP_STRING([--with-ucx=(DIR)], [Enable the use of UCX (default is guess).])],
14 |             [], [with_ucx=guess])
15 | 
16 |     AS_IF([test "x$with_ucx" != "xno"],
17 |     [
18 |         save_CPPFLAGS="$CPPFLAGS"
19 |         save_CFLAGS="$CFLAGS"
20 |         save_LDFLAGS="$LDFLAGS"
21 | 
22 |         AS_IF([test ! -z "$with_ucx" -a "x$with_ucx" != "xyes" -a "x$with_ucx" != "xguess"],
23 |         [
24 |             check_ucx_dir="$with_ucx"
25 |             check_ucx_libdir="$with_ucx/lib"
26 |             CPPFLAGS="-I$with_ucx/include $save_CPPFLAGS"
27 |             LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS"
28 |         ])
29 | 
30 |         AS_IF([test "x$check_ucx_dir" = "x" -a "x$HPCX_UCX_DIR" != "x"],
31 |         [
32 |             check_ucx_dir="$HPCX_UCX_DIR"
33 |             check_ucx_libdir="$HPCX_UCX_DIR/lib"
34 |             CPPFLAGS="-I$check_ucx_dir/include $save_CPPFLAGS"
35 |             LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS"
36 |         ])
37 | 
38 |         AS_IF([test ! -z "$with_ucx_libdir" -a "x$with_ucx_libdir" != "xyes"],
39 |         [
40 |             check_ucx_libdir="$with_ucx_libdir"
41 |             LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS"
42 |         ])
43 | 
44 |         AC_CHECK_HEADERS([ucp/api/ucp.h],
45 |         [
46 |             AC_CHECK_LIB([ucp], [ucp_tag_send_nb],
47 |             [
48 |                 ucx_happy="yes"
49 |             ],
50 |             [
51 |                 ucx_happy="no"
52 |             ], [-luct -lucm -lucs])
53 |         ],
54 |         [
55 |             ucx_happy="no"
56 |         ])
57 | 
58 |         AS_IF([test "x$ucx_happy" = "xyes"],
59 |         [
60 |             AS_IF([test "x$check_ucx_dir" != "x"],
61 |             [
62 |                 AC_MSG_RESULT([UCX dir: $check_ucx_dir])
63 |                 AC_SUBST(UCX_CPPFLAGS, "-I$check_ucx_dir/include/")
64 |             ])
65 | 
66 |             AS_IF([test "x$check_ucx_libdir" != "x"],
67 |             [
68 |                 AC_SUBST(UCX_LDFLAGS, "-L$check_ucx_libdir")
69 |             ])
70 | 
71 |             AC_SUBST(UCX_LIBADD, "-lucp -lucs -lucm -luct")
72 |         ],
73 |         [
74 |             AS_IF([test "x$with_ucx" != "xguess"],
75 |             [
76 |                 AC_MSG_ERROR([UCX support is requested but UCX packages cannot be found])
77 |             ],
78 |             [
79 |                 AC_MSG_WARN([UCX not found])
80 |             ])
81 |         ])
82 | 
83 |         CFLAGS="$save_CFLAGS"
84 |         CPPFLAGS="$save_CPPFLAGS"
85 |         LDFLAGS="$save_LDFLAGS"
86 | 
87 |     ],
88 |     [
89 |         AC_MSG_WARN([UCX was explicitly disabled])
90 |     ])
91 | 
92 |     ucx_checked=yes
93 |     AM_CONDITIONAL([HAVE_UCX_PLUGIN], [test "x$ucx_happy" != xno])
94 | ])
95 | 
96 | ])
97 | 


--------------------------------------------------------------------------------
/nccl-rdma-sharp-plugins.pc.in:
--------------------------------------------------------------------------------
 1 | #
 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 | # SPDX-License-Identifier: BSD-3-Clause
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #
 9 | # 1. Redistributions of source code must retain the above copyright notice, this
10 | # list of conditions and the following disclaimer.
11 | #
12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
13 | # this list of conditions and the following disclaimer in the documentation
14 | # and/or other materials provided with the distribution.
15 | #
16 | # 3. Neither the name of the copyright holder nor the names of its
17 | # contributors may be used to endorse or promote products derived from
18 | # this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | 
32 | prefix = @prefix@
33 | exec_prefix = @exec_prefix@
34 | libdir = @libdir@
35 | 
36 | Name: @PACKAGE@
37 | URL: @PACKAGE_URL@
38 | Description:  RDMA and SHARP plugins for NCCL Collective library
39 | Version: @MAJOR_VERSION@.@MINOR_VERSION@
40 | # -l takes the library name without the "lib" prefix or the ".so" suffix
41 | Libs: -L${libdir} -lnccl-net
42 | 


--------------------------------------------------------------------------------
/nccl-rdma-sharp-plugins.spec.in:
--------------------------------------------------------------------------------
  1 | #
  2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 | # SPDX-License-Identifier: BSD-3-Clause
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #
  9 | # 1. Redistributions of source code must retain the above copyright notice, this
 10 | # list of conditions and the following disclaimer.
 11 | #
 12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
 13 | # this list of conditions and the following disclaimer in the documentation
 14 | # and/or other materials provided with the distribution.
 15 | #
 16 | # 3. Neither the name of the copyright holder nor the names of its
 17 | # contributors may be used to endorse or promote products derived from
 18 | # this software without specific prior written permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 | #
 31 | 
 32 | %global rel @RPM_RELEASE@
 33 | %global version @VERSION@
 34 | %global pkgname @PACKAGE@
 35 | %global prefix @prefix@
 36 | %global __check_files %{nil}
 37 | %global _libdir %{prefix}/lib
 38 | %{!?configure_opts: %global configure_opts %{nil}}
 39 | %global  debug_package %{nil}
 40 | %bcond_with valgrind
 41 | %global _binary_filedigest_algorithm 1
 42 | %global _source_filedigest_algorithm 1
 43 | 
 44 | %global lt_release @LT_RELEASE@
 45 | %global lt_version @LT_CURRENT@.@LT_REVISION@.@LT_AGE@
 46 | 
 47 | Name: %{pkgname}
 48 | Summary: RDMA and SHARP plugins for NCCL
 49 | Version: %{version}
 50 | Release: %{rel}
 51 | 
 52 | License: BSD-3-Clause
 53 | Group: Applications
 54 | Source: %{pkgname}-%{version}.tar.gz
 55 | Requires: libibverbs
 56 | %if 0%{?suse_version} < 1100
 57 | BuildRequires: gcc-c++ libibverbs-devel binutils
 58 | %else
 59 | BuildRequires: gcc-c++ libibverbs-devel binutils-devel
 60 | %endif
 61 | %if %{with valgrind}
 62 | BuildRequires: valgrind-devel
 63 | %endif
 64 | 
 65 | BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
 66 | URL: http://www.mellanox.com
 67 | Prefix: %{prefix}
 68 | Provides: nccl-rdma-sharp-plugins
 69 | Vendor: mellanox
 70 | 
 71 | 
 72 | %description
 73 | Provides RDMA and SHARP plugins for NCCL Collective library
 74 | 
 75 | %prep
 76 | rm -rf $RPM_BUILD_ROOT
 77 | 
 78 | %setup -q
 79 | 
 80 | %build
 81 | # Install under the packaged prefix and honor caller-supplied configure options
 82 | ./configure --prefix=%{prefix} %{?configure_opts}
 83 | make %{?_smp_mflags}
 84 | %install
 85 | 
 86 | rm -rf "$RPM_BUILD_ROOT"
 87 | 
 88 | # Strip out some dependencies
 89 | cat > find-requires.sh <<'EOF'
 90 | exec %{__find_requires} "$@" | egrep -v '^perl'
 91 | EOF
 92 | chmod +x find-requires.sh
 93 | %global _use_internal_dependency_generator 0
 94 | %global __find_requires %{_builddir}/%{buildsubdir}/find-requires.sh
 95 | 
 96 | make DESTDIR="$RPM_BUILD_ROOT" install
 97 | mkdir -p $RPM_BUILD_ROOT/etc/ld.so.conf.d/
 98 | echo %{_libdir} > $RPM_BUILD_ROOT/etc/ld.so.conf.d/nccl-rdma-sharp-plugins.conf
 99 | mkdir -p $RPM_BUILD_ROOT/usr/lib64/pkgconfig
100 | cp nccl-rdma-sharp-plugins.pc $RPM_BUILD_ROOT/usr/lib64/pkgconfig
101 | 
102 | %clean
103 | # We may be in the directory that we're about to remove, so cd out of
104 | # there before we remove it
105 | cd /tmp
106 | 
107 | # Remove installed driver after rpm build finished
108 | chmod -R o+w $RPM_BUILD_DIR/%{name}-%{version}
109 | rm -rf $RPM_BUILD_DIR/%{name}-%{version}
110 | 
111 | test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
112 | 
113 | 
114 | %files
115 | %defattr(-, root, root)
116 | %{prefix}
117 | /etc/ld.so.conf.d/nccl-rdma-sharp-plugins.conf
118 | /usr/lib64/pkgconfig/nccl-rdma-sharp-plugins.pc
119 | 
120 | 
121 | # Your application file list goes here
122 | # %{prefix}/lib/lib*.so*
123 | #%doc COPYRIGHT ChangeLog README AUTHORS NEWS
124 | #%doc doc/*
125 | 
126 | # If you install a library
127 | %post
128 | /sbin/ldconfig || exit 1
129 | 
130 | # If you install a library
131 | %postun
132 | /sbin/ldconfig
133 | exit 0
134 | 
135 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | #
 2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 | # SPDX-License-Identifier: BSD-3-Clause
 5 | # See file LICENSE for terms.
 6 | #
 7 | 
 8 | lib_LTLIBRARIES     = libnccl-net.la
 9 | 
10 | libnccl_net_la_CPPFLAGS = -I$(top_srcdir)/include
11 | libnccl_net_la_CFLAGS   = $(CFLAGS) -DGNU_SOURCE
12 | libnccl_net_la_LIBADD   = -lcudart_static
13 | libnccl_net_la_LDFLAGS  = $(LDFLAGS)
14 | 
15 | libnccl_net_la_SOURCES = \
16 | 	ibvwrap.c \
17 | 	utils.c	\
18 |         param.c \
19 | 	socket.c \
20 | 	p2p_plugin.c \
21 | 	ib_plugin.c
22 | 
23 | if HAVE_UCX_PLUGIN
24 | libnccl_net_la_CPPFLAGS += -DHAVE_UCX_PLUGIN $(UCX_CPPFLAGS)
25 | libnccl_net_la_LIBADD   += $(UCX_LIBADD)
26 | libnccl_net_la_LDFLAGS  += $(UCX_LDFLAGS) 
27 | libnccl_net_la_SOURCES  += \
28 | 	ucx_plugin.c \
29 | 	ucx_rma_plugin.c \
30 | 	ucx_uct_lib.c \
31 | 	ucx_uct_plugin.c \
32 | 	ucx_uct_rd_plugin.c
33 | endif
34 | 
35 | if HAVE_SHARP_PLUGIN
36 | libnccl_net_la_CPPFLAGS += -DHAVE_SHARP_PLUGIN $(SHARP_CPPFLAGS)
37 | libnccl_net_la_LIBADD   += $(SHARP_LIBADD)
38 | libnccl_net_la_LDFLAGS  += $(SHARP_LDFLAGS) 
39 | libnccl_net_la_SOURCES  += sharp_plugin.c
40 | endif
41 | 
42 | # Staged installs (e.g. packaging) place the library under $(DESTDIR)
43 | install-exec-hook:
44 | 	cd $(DESTDIR)$(libdir) && ln -sf libnccl-net.so libnccl-net-ibext.so


--------------------------------------------------------------------------------
/src/ibvwrap.c:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #include <stdint.h>
 10 | #include <stdbool.h>
 11 | 
 12 | #include "ibvwrap.h"
 13 | #include "utils.h"
 14 | #include "nccl.h"
 15 | #include "param.h"
 16 | 
/*
 * Helper macros forming the whole body of the thin wrap_ibv_*() functions.
 * NOTE: every macro below ends with `return ncclSuccess;`, so it always
 * returns from the enclosing function; nothing placed after it is reachable.
 */

/* Store `call` into `retval`; if it equals `error_retval`, WARN with
 * strerror(errno) and return ncclSystemError, otherwise return ncclSuccess. */
#define IBV_PTR_CHECK_ERRNO(call, retval, error_retval, name) \
  retval = call; \
  if (retval == error_retval) { \
    WARN("Call to " name " failed with error %s", strerror(errno)); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* Same as IBV_PTR_CHECK_ERRNO but the warning does not include errno. */
#define IBV_PTR_CHECK(call, retval, error_retval, name) \
  retval = call; \
  if (retval == error_retval) { \
    WARN("Call to " name " failed"); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* For calls that return an errno-style code and may be unsupported:
 * ENOTSUP/EOPNOTSUPP -> *supported = 0 and ncclSuccess;
 * any other value != success_retval -> *supported = 1 and ncclSystemError;
 * success -> *supported = 1 and ncclSuccess.
 * Declares a local `int ret` in the enclosing scope. */
#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(call, success_retval, name, supported) \
  int ret = call; \
  if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
    INFO(NCCL_NET, "Call to " name " not supported"); \
    *supported = 0; \
    return ncclSuccess; \
  } else if (ret != success_retval) { \
    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    *supported = 1; \
    return ncclSystemError; \
  } \
  *supported = 1; \
  return ncclSuccess;

/* For calls whose return value is itself the error code: anything other than
 * `success_retval` is logged via strerror(ret) and mapped to ncclSystemError.
 * Declares a local `int ret` in the enclosing scope. */
#define IBV_INT_CHECK_RET_ERRNO(call, success_retval, name) \
  int ret = call; \
  if (ret != success_retval) { \
    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* For calls that signal failure with one specific value (`error_retval`).
 * Declares a local `int ret` in the enclosing scope. */
#define IBV_INT_CHECK(call, error_retval, name) \
  int ret = call; \
  if (ret == error_retval) { \
    WARN("Call to " name " failed"); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* Evaluate `call` (its result, if any, is ignored) and return ncclSuccess. */
#define IBV_PASSTHRU(call) \
  call; \
  return ncclSuccess;
 66 | 
/*
 * Tunables for the ibv_modify_qp() retry loop in wrap_ibv_modify_qp().
 * NCCL_PARAM is defined outside this file; the ncclParamIbMQpRetryAll(),
 * ncclParamIbMQpRetryCnt() and ncclParamIbMQpRetryTimeout() accessors used in
 * this file presumably come from these declarations - TODO confirm in param.h.
 * The last argument looks like the default value.
 */
NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0);
NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34);
NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds

/* True when `e` equals `code` in either its positive or negated form. */
#define IBV_ERR_EQ(e, code)        (e == code || e == (-code))
/* Error codes that are retried by default: only ETIMEDOUT. */
#define IBV_MQP_RETRY_ERRNO(e)     (IBV_ERR_EQ(e, ETIMEDOUT))
/* Retry on any non-zero result when the IbMQpRetryAll parameter is non-zero,
 * otherwise only on the default retryable codes. */
#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e))
 74 | 
 75 | ncclResult_t wrap_ibv_fork_init() {
 76 |   IBV_INT_CHECK(ibv_fork_init(), -1, "ibv_fork_init");
 77 | }
 78 | 
 79 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
 80 |   *ret = ibv_get_device_list(num_devices);
 81 |   if (*ret == NULL) *num_devices = 0;
 82 |   return ncclSuccess;
 83 | }
 84 | 
 85 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
 86 |   IBV_PASSTHRU(ibv_free_device_list(list));
 87 | }
 88 | 
 89 | const char *wrap_ibv_get_device_name(struct ibv_device *device) {
 90 |   return ibv_get_device_name(device);
 91 | }
 92 | 
 93 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/
 94 |   IBV_PTR_CHECK(ibv_open_device(device), *ret, NULL, "ibv_open_device");
 95 | }
 96 | 
 97 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
 98 |   IBV_INT_CHECK(ibv_close_device(context), -1, "ibv_close_device");
 99 | }
100 | 
101 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/
102 |   IBV_INT_CHECK(ibv_get_async_event(context, event), -1, "ibv_get_async_event");
103 | }
104 | 
105 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
106 |   IBV_PASSTHRU(ibv_ack_async_event(event));
107 | }
108 | 
109 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
110 |   IBV_INT_CHECK_RET_ERRNO(ibv_query_device(context, device_attr), 0, "ibv_query_device");
111 | }
112 | 
113 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
114 |   IBV_INT_CHECK_RET_ERRNO(ibv_query_port(context, port_num, port_attr), 0, "ibv_query_port");
115 | }
116 | 
117 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
118 |   IBV_INT_CHECK_RET_ERRNO(ibv_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
119 | }
120 | 
121 | ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
122 |   IBV_INT_CHECK_RET_ERRNO(ibv_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
123 | }
124 | 
125 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
126 |   IBV_PTR_CHECK(ibv_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
127 | }
128 | 
129 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
130 |   IBV_INT_CHECK_RET_ERRNO(ibv_dealloc_pd(pd), 0, "ibv_dealloc_pd");
131 | }
132 | 
133 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
134 |   IBV_PTR_CHECK(ibv_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
135 | }
136 | 
/* Unchecked pass-through to ibv_reg_mr(); the raw result is returned as-is. */
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
  struct ibv_mr *mr = ibv_reg_mr(pd, addr, length, access);
  return mr;
}
140 | 
141 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
142 | #if HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING
143 |   IBV_PTR_CHECK(ibv_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
144 | #else
145 |   return ncclSystemError;
146 | #endif
147 | }
148 | 
149 | /* DMA-BUF support */
150 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
151 | #if HAVE_DECL_IBV_REG_DMABUF_MR
152 |   IBV_PTR_CHECK_ERRNO(ibv_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
153 | #else
154 |   return ncclSystemError;
155 | #endif
156 | }
157 | 
/*
 * Unchecked pass-through to ibv_reg_dmabuf_mr().
 * Without HAVE_DECL_IBV_REG_DMABUF_MR it returns NULL with errno set to
 * EOPNOTSUPP, which ncclIbDmaBufSupport() relies on.
 */
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
#if HAVE_DECL_IBV_REG_DMABUF_MR
  struct ibv_mr *mr = ibv_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
  return mr;
#else
  errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
  return NULL;
#endif
}
166 | 
167 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
168 |   IBV_INT_CHECK_RET_ERRNO(ibv_dereg_mr(mr), 0, "ibv_dereg_mr");
169 | }
170 | 
171 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
172 |   IBV_PTR_CHECK_ERRNO(ibv_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
173 | }
174 | 
175 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
176 |   IBV_INT_CHECK_RET_ERRNO(ibv_destroy_cq(cq), 0, "ibv_destroy_cq");
177 | }
178 | 
179 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
180 |   IBV_INT_CHECK_RET_ERRNO(ibv_destroy_qp(qp), 0, "ibv_destroy_qp");
181 | }
182 | 
183 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
184 |   IBV_PTR_CHECK_ERRNO(ibv_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
185 | }
186 | 
187 | static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) {
188 |   switch (state) {
189 |   case (IBV_QPS_RESET): snprintf(msg, len, "RESET"); break;
190 |   case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break;
191 |   case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break;
192 |   case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break;
193 |   case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break;
194 |   case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break;
195 |   case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break;
196 |   case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break;
197 |   default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break;
198 |   }
199 | }
200 | 
/* Pick the attribute source: the user-supplied attr when `mask` is set in
 * `userFlag`, otherwise the fallback `attr` (which may be NULL). */
#define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? (userAttr) : (attr))

/*
 * Build a one-line description of a QP state transition into msg (at most
 * msgLen bytes): device name and port, current and requested state, local GID
 * index, local GID and remote GID. Fields that cannot be determined are
 * printed as -1 or "N/A". Used by wrap_ibv_modify_qp() for its log messages.
 *
 * qp        queue pair being modified (qp->pd is dereferenced unconditionally)
 * qpState   requested next state
 * userAttr  attributes the caller is passing to ibv_modify_qp()
 * userFlag  attribute mask telling which userAttr fields are valid
 */
static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) {
  ncclResult_t res;
  int portNum = -1, gidIndex = -1;
  char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN];
  const char *localGidRes = NULL, *remoteGidRes = NULL;

  char nextState[32], currState[32];
  ibvQpStateName(qp->state, currState, sizeof(currState));
  ibvQpStateName(qpState, nextState, sizeof(nextState));
  char devName[IBV_SYSFS_NAME_MAX] = "";
  snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A");

  // Ask the QP itself for port and address info; used as a fallback for
  // whatever the user did not supply. qpAttr stays NULL if the query fails.
  struct ibv_qp_attr attr;
  struct ibv_qp_init_attr init_attr;
  int attr_mask = IBV_QP_PORT | IBV_QP_AV;
  res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr);
  struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL;

  // port info, portAttr can be NULL if not given by the user and query_qp failed
  struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT);
  portNum = portAttr ? portAttr->port_num : -1;

  // address info, avAttr can be NULL if not given by the user and query_qp failed
  struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV);
  if (avAttr && avAttr->ah_attr.is_global) {
    union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid;
    // ibvGetGidStr is defined elsewhere; a NULL result is treated below as
    // "no printable GID"
    remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName));
    // we need pd->context to retrieve local GID, skip if not there
    if (!qp->pd->context) goto print;
    gidIndex =  avAttr->ah_attr.grh.sgid_index;
    union ibv_gid localGid;
    // NOTE(review): portNum may still be -1 here (port neither supplied nor
    // queried) and is then passed as the uint8_t port number - confirm.
    NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print);
    localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName));
  }
print:
  snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s",
           devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? remoteGidName : "N/A");
  return;
}
242 | 
243 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) {
244 |   char qpMsg[1024];
245 |   int ret = 0, attempts = 0;
246 |   int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1
247 |   int timeOut = (int)ncclParamIbMQpRetryTimeout();
248 |   do {
249 |     if (attempts > 0) {
250 |       unsigned int sleepTime = timeOut * attempts;
251 |       ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
252 |       INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret), qpMsg, attempts, maxCnt, sleepTime);
253 |       // sleep before retrying
254 |       struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)};
255 |       nanosleep(&tv, NULL);
256 |     }
257 |     ret = ibv_modify_qp(qp, attr, attr_mask);
258 |     attempts++;
259 |   } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt);
260 |   if (ret != 0) {
261 |     ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
262 |     WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret), qpMsg);
263 |     return ncclSystemError;
264 |   }
265 |   return ncclSuccess;
266 |  }
267 | 
268 | ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
269 |   IBV_INT_CHECK_RET_ERRNO(qp->context->ops.post_send(qp, wr, bad_wr), 0, "ibv_post_send");
270 | }
271 | 
272 | ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
273 |   IBV_INT_CHECK_RET_ERRNO(qp->context->ops.post_recv(qp, wr, bad_wr), 0, "ibv_post_recv");
274 |   return ncclSuccess;
275 | }
276 | 
277 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
278 | #if HAVE_DECL_IBV_QUERY_ECE
279 |   IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_query_ece(qp, ece), 0, "ibv_query_ece", supported);
280 | #else
281 |     INFO(NCCL_NET, "Call to ibv_query_ece is skipped, doesn't exist");
282 |     *supported = 0;
283 |     return ncclSuccess;
284 | #endif
285 | }
286 | 
287 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
288 | #if HAVE_DECL_IBV_SET_ECE
289 |   IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_set_ece(qp, ece), 0, "ibv_set_ece", supported);
290 | #else
291 |     INFO(NCCL_NET, "Call to ibv_set_ece skipped, doesn't exist");
292 |     *supported = 0;
293 |     return ncclSuccess;
294 | #endif
295 | }
296 | 
297 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
298 |   *ret = (char *) ibv_event_type_str(event);
299 |   return ncclSuccess;
300 | }
301 | 


--------------------------------------------------------------------------------
/src/param.c:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #include "param.h"
 10 | #include "debug.h"
 11 | 
 12 | //#include <algorithm>
 13 | #include <errno.h>
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | #include <sys/types.h>
 18 | #include <unistd.h>
 19 | #include <pthread.h>
 20 | #include <pwd.h>
 21 | 
 22 | #define MIN(a, b) ((a)<(b)?(a):(b))
 23 | const char* userHomeDir() {
 24 |   struct passwd *pwUser = getpwuid(getuid());
 25 |   return pwUser == NULL ? NULL : pwUser->pw_dir;
 26 | }
 27 | 
 28 | void setEnvFile(const char* fileName) {
 29 |   FILE * file = fopen(fileName, "r");
 30 |   if (file == NULL) return;
 31 | 
 32 |   char *line = NULL;
 33 |   char envVar[1024];
 34 |   char envValue[1024];
 35 |   size_t n = 0;
 36 |   ssize_t read;
 37 |   while ((read = getline(&line, &n, file)) != -1) {
 38 |     if (line[0] == '#') continue;
 39 |     if (line[read-1] == '\n') line[read-1] = '\0';
 40 |     int s=0; // Env Var Size
 41 |     while (line[s] != '\0' && line[s] != '=') s++;
 42 |     if (line[s] == '\0') continue;
 43 |     strncpy(envVar, line, MIN(1023,s));
 44 |     envVar[MIN(1023,s)] = '\0';
 45 |     s++;
 46 |     strncpy(envValue, line+s, 1023);
 47 |     envValue[1023]='\0';
 48 |     setenv(envVar, envValue, 0);
 49 |     //printf("%s : %s->%s\n", fileName, envVar, envValue);
 50 |   }
 51 |   if (line) free(line);
 52 |   fclose(file);
 53 | }
 54 | 
 55 | static void initEnvFunc() {
 56 |   char confFilePath[1024];
 57 |   const char* userFile = getenv("NCCL_CONF_FILE");
 58 |   if (userFile && strlen(userFile) > 0) {
 59 |     snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
 60 |     setEnvFile(confFilePath);
 61 |   } else {
 62 |     const char* userDir = userHomeDir();
 63 |     if (userDir) {
 64 |       snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
 65 |       setEnvFile(confFilePath);
 66 |     }
 67 |   }
 68 |   snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf");
 69 |   setEnvFile(confFilePath);
 70 | }
 71 | 
 72 | void initEnv() {
 73 |   static pthread_once_t once = PTHREAD_ONCE_INIT;
 74 |   pthread_once(&once, initEnvFunc);
 75 | }
 76 | 
 77 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
 78 |   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 79 |   pthread_mutex_lock(&mutex);
 80 |   if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
 81 |     const char* str = ncclGetEnv(env);
 82 |     int64_t value = deftVal;
 83 |     if (str && strlen(str) > 0) {
 84 |       errno = 0;
 85 |       value = strtoll(str, NULL, 0);
 86 |       if (errno) {
 87 |         value = deftVal;
 88 |         INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
 89 |       } else {
 90 |         INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value);
 91 |       }
 92 |     }
 93 |     __atomic_store_n(cache, value, __ATOMIC_RELAXED);
 94 |   }
 95 |   pthread_mutex_unlock(&mutex);
 96 | }
 97 | 
 98 | const char* ncclGetEnv(const char* name) {
 99 |   initEnv();
100 |   return getenv(name);
101 | }
102 | 


--------------------------------------------------------------------------------
/src/ucx_uct_plugin.c:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #include "ucx_uct_lib.h"
 10 | 
/*
 * Negative markers stored in nccl_uct_req_t::size to tell receive and flush
 * requests apart from send requests, whose size field holds the byte count.
 */
typedef enum {
  NCCL_UCT_REQ_IRECV  = -1, /* request created by ->irecv() */
  NCCL_UCT_REQ_IFLUSH = -2  /* request created by ->iflush() */
} nccl_uct_request_type_t;
 15 | 
 16 | struct nccl_uct_rdesc;
 17 | 
/* On-the-wire descriptor of a posted receive request entry */
typedef struct {
  int        tag;      /* tag that ->isend() matches against */
  int        size;     /* size of the posted receive buffer */
  void       *data;    /* receive buffer address, used as the PUT target */
  int        matched;  /* set to 1 on the sender once an isend used this entry */
  uct_rkey_t rkey;     /* remote key used by the sender's PUT */
} nccl_uct_chunk_t;
 26 | 
/* On-the-wire descriptor of a receive request containing many chunks */
typedef struct {
  uint64_t              id;          /* sequence number assigned by the receiver */
  uint16_t              count;       /* number of valid entries in chunk[] */
  uint32_t              size;        /* nccl_uct_rdesc_size(count): header plus chunks, in bytes */
  struct nccl_uct_rdesc *peer_rdesc; /* Acts as a cookie along with id */
  nccl_uct_chunk_t      chunk[];     /* flexible array of per-receive entries */
} nccl_uct_rdesc_hdr_t;
 35 | 
/* On-the-wire descriptor for receive request completion */
typedef struct {
  uint64_t              id;     /* copy of the id from the matching receive descriptor */
  struct nccl_uct_rdesc *rdesc; /* receiver's rdesc pointer, echoed back from peer_rdesc */
  int                   count; /* Number of sizes contained */
  int                   sizes[NCCL_UCX_UCT_MAX_RECVS]; /* size sent for each chunk */
} nccl_uct_atp_t;
 43 | 
/*
 * NCCL local request handler to progress:
 * - size -1 (NCCL_UCT_REQ_IRECV) for multi receive
 * - size -2 (NCCL_UCT_REQ_IFLUSH) for flush
 * - size >= 0 for send (the number of bytes sent)
 */
typedef struct {
  /* Pending GET (iflush) PUT (isend) or receiving one ATP (irecv) */
  uct_completion_t      completion;
  int                   size;   /* request kind marker or send size, see above */
  struct nccl_uct_rdesc *rdesc; /* descriptor this request is embedded in */
} nccl_uct_req_t;
 56 | 
/* Pending receive descriptor either on the receive or sending side */
typedef struct nccl_uct_rdesc {
  int                   nccl_usage; /* NCCL requests not finished/started */
  int                   send_atp;   /* >1 pending isend, ==1 pending atp send */

  /* The two members share storage: a descriptor is either linked into a
   * comm's list or sitting on the free list. */
  union {
    ucs_list_link_t       list;  /* comm's linked list */
    struct nccl_uct_rdesc *next; /* inserted in free list */
  };

  struct nccl_uct_wr_comm  *comm; /* owning communicator, NULL while on the free list */
  nccl_uct_rdesc_hdr_t     desc;  /* header; its chunk[] entries are presumably backed by storage[] below */
  nccl_uct_chunk_t         storage[NCCL_UCX_UCT_MAX_RECVS]; /* Don't use directly */
  nccl_uct_req_t           reqs[NCCL_UCX_UCT_MAX_RECVS];    /* NCCL requests */
  int                      sizes[NCCL_UCX_UCT_MAX_RECVS];   /* ATP received sizes */
} nccl_uct_rdesc_t;
 73 | 
/*
 * Communicator for this plugin variant. It embeds the generic
 * nccl_uct_comm_t as `base`; nccl_uct_wr_comm_get() recovers the outer
 * structure from a pointer to that member.
 */
typedef struct nccl_uct_wr_comm {
  nccl_uct_comm_t      base;

  int                  rdesc_alloc; /* Track allocated rdescs */
  nccl_uct_rdesc_t     *free_rdesc; /* Available rdesc for reuse */
  uint64_t             rdesc_id;    /* Next sequence number to use */

  /* Received RTRs: used by Sender communicator in ->isend() */
  ucs_list_link_t      rdesc_list;

} nccl_uct_wr_comm_t;
 85 | 
 86 | static inline nccl_uct_wr_comm_t *
 87 | nccl_uct_wr_comm_get(nccl_uct_comm_t *base_comm) {
 88 |   return ucs_container_of(base_comm, nccl_uct_wr_comm_t, base);
 89 | }
 90 | 
 91 | static nccl_uct_rdesc_t *nccl_uct_comm_rdesc_get(nccl_uct_wr_comm_t *comm) {
 92 |   nccl_uct_rdesc_t *rdesc = comm->free_rdesc;
 93 | 
 94 |   if (rdesc == NULL) {
 95 |     rdesc = calloc(1, sizeof(*rdesc));
 96 |   } else {
 97 |     comm->free_rdesc = rdesc->next;
 98 |   }
 99 | 
100 |   rdesc->next = NULL;
101 |   rdesc->comm = comm;
102 |   comm->rdesc_alloc++;
103 |   return rdesc;
104 | }
105 | 
106 | static size_t nccl_uct_rdesc_size(int n) {
107 |   return n * sizeof(nccl_uct_chunk_t) + sizeof(nccl_uct_rdesc_hdr_t);
108 | }
109 | 
110 | /* Prepare a receive descriptor from irecv()/iflush() side */
111 | static void nccl_uct_rdesc_set(nccl_uct_rdesc_t *rdesc, uint64_t id, int n,
112 |                                void **data, size_t *sizes, int *tags,
113 |                                nccl_uct_memh_t **uct_memh) {
114 |   nccl_uct_rdesc_hdr_t *desc = &rdesc->desc;
115 |   int i;
116 | 
117 |   /* Populate header */
118 |   desc->id         = id;
119 |   desc->count      = n;
120 |   desc->size       = nccl_uct_rdesc_size(n);
121 |   desc->peer_rdesc = rdesc; /* cookie, will be returned in ATP */
122 | 
123 |   /* Ref count that prevents NCCL from releasing memory */
124 |   rdesc->nccl_usage = 1;
125 |   rdesc->send_atp   = 0;
126 | 
127 |   /* Zero (iflush) or one or many receive request are contained */
128 |   for (i = 0; i < n; i++) {
129 |     desc->chunk[i].tag     = tags[i];
130 |     desc->chunk[i].size    = sizes[i];
131 |     desc->chunk[i].data    = data[i];
132 |     desc->chunk[i].matched = 0;
133 |     desc->chunk[i].rkey    = uct_memh[i]->bundle.rkey;
134 |   }
135 | }
136 | 
137 | static nccl_uct_req_t *nccl_uct_rdesc_get_req(nccl_uct_rdesc_t *rdesc, int i,
138 |                                               int size) {
139 |   nccl_uct_req_t *req;
140 | 
141 |   assert(i < NCCL_UCX_UCT_MAX_RECVS);
142 | 
143 |   req        = &rdesc->reqs[i];
144 |   req->size  = size;
145 |   req->rdesc = rdesc;
146 | 
147 |   req->completion.func   = nccl_uct_empty_callback;
148 |   req->completion.count  = 1;
149 |   req->completion.status = UCS_OK;
150 | 
151 |   return &rdesc->reqs[i];
152 | }
153 | 
154 | static void nccl_uct_comm_rdesc_put(nccl_uct_rdesc_t *rdesc) {
155 |   nccl_uct_wr_comm_t *comm = rdesc->comm;
156 | 
157 |   assert(comm != NULL);
158 | 
159 |   rdesc->desc.id   = -1;
160 |   rdesc->comm      = NULL;
161 |   rdesc->next      = comm->free_rdesc;
162 |   comm->free_rdesc = rdesc;
163 |   comm->rdesc_alloc--;
164 | }
165 | 
166 | /* On receiver side, after ->irecv(), expect corresponding ATP */
167 | static ucs_status_t nccl_uct_atp_callback(void *arg, void *data, size_t length,
168 |                                           unsigned flags) {
169 |   nccl_uct_atp_t *atp = (nccl_uct_atp_t*)((uint8_t*)data + 8);
170 | 
171 |   assert(length == (sizeof(*atp) + 8));
172 |   assert(*(nccl_uct_comm_t**)data == &atp->rdesc->comm->base);
173 |   assert(atp->id == atp->rdesc->desc.id);
174 |   assert(atp->count == atp->rdesc->desc.count);
175 |   assert(atp->rdesc->reqs[0].completion.count == 1);
176 | 
177 |   atp->rdesc->reqs[0].completion.count--;
178 |   memcpy(atp->rdesc->sizes, atp->sizes, atp->count * sizeof(*atp->sizes));
179 |   return UCS_OK;
180 | }
181 | 
182 | /* On sender side, asynchronously receive rdesc/RTR, later used by ->isend() */
183 | static ucs_status_t nccl_uct_rtr_callback(void *arg, void *data, size_t length,
184 |                                           unsigned flags) {
185 |   nccl_uct_comm_t *base_comm = *(nccl_uct_comm_t **)data;
186 |   nccl_uct_wr_comm_t *comm   = nccl_uct_wr_comm_get(base_comm);
187 |   nccl_uct_rdesc_hdr_t *desc = (nccl_uct_rdesc_hdr_t*)((uint8_t*)data + 8);
188 |   size_t size                = desc->size;
189 |   nccl_uct_rdesc_t *rdesc;
190 | 
191 |   rdesc = nccl_uct_comm_rdesc_get(comm);
192 |   if (rdesc == NULL) {
193 |     WARN("Failed to get an rdesc in RTR callback");
194 |     return UCS_ERR_NO_MEMORY; /* Cannot happend */
195 |   }
196 | 
197 |   ucs_list_add_tail(&comm->rdesc_list, &rdesc->list);
198 | 
199 |   assert((size + 8) == length);
200 |   assert(size == nccl_uct_rdesc_size(desc->count));
201 | 
202 |   memcpy(&rdesc->desc, desc, size);
203 |   rdesc->nccl_usage = desc->count;
204 |   rdesc->send_atp   = desc->count + 1;
205 |   return UCS_OK;
206 | }
207 | 
/*
 * Register this plugin's active-message handlers on `uct_iface`:
 * NCCL_UCT_AM_RTR -> nccl_uct_rtr_callback and
 * NCCL_UCT_AM_ATP -> nccl_uct_atp_callback.
 * NCCLCHECK is defined elsewhere; presumably it returns early from this
 * function when a registration fails - TODO confirm.
 */
static ncclResult_t nccl_uct_wr_iface_set(nccl_uct_iface_t *uct_iface) {
  NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_RTR,
                                       nccl_uct_rtr_callback));
  NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_ATP,
                                       nccl_uct_atp_callback));
  return ncclSuccess;
}
215 | 
216 | static ncclResult_t nccl_uct_wr_comm_alloc(nccl_uct_comm_t **comm_p) {
217 |   nccl_uct_wr_comm_t *comm = calloc(1, sizeof(nccl_uct_wr_comm_t));
218 |   if (comm != NULL) {
219 |     *comm_p = &comm->base;
220 |     return ncclSuccess;
221 |   }
222 | 
223 |   return ncclSystemError;
224 | }
225 | 
226 | static ncclResult_t nccl_uct_wr_comm_init(nccl_uct_comm_t *base_comm,
227 |                                           nccl_uct_context_t *context,
228 |                                           nccl_uct_worker_t *worker, int dev,
229 |                                           const nccl_uct_comm_t *remote_comm) {
230 |   nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(base_comm);
231 | 
232 |   ucs_list_head_init(&comm->rdesc_list);
233 |   return nccl_uct_comm_init(&comm->base, context, worker, dev, remote_comm);
234 | }
235 | 
236 | static ncclResult_t nccl_uct_wr_init(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
237 |   context.ops.comm_alloc = nccl_uct_wr_comm_alloc;
238 |   context.ops.comm_init  = nccl_uct_wr_comm_init;
239 |   context.ops.iface_set  = nccl_uct_wr_iface_set;
240 |   context.am_short_size  = nccl_uct_rdesc_size(NCCL_UCX_UCT_MAX_RECVS);
241 |   context.rkey_size      = sizeof(((nccl_uct_chunk_t*)0)->rkey);
242 | 
243 |   return nccl_p2p_ib_init(&context.dev_count, &context.merge_dev_count, ncclIbDevs, context.if_name,
244 |                           &context.if_addr, NULL, logFunction);
245 | }
246 | 
247 | /* Outcome is either send_atp equal to 1 or 0 */
248 | static void nccl_uct_send_atp(nccl_uct_wr_comm_t *comm,
249 |                               nccl_uct_rdesc_t *rdesc) {
250 |   ucs_status_t status;
251 |   nccl_uct_atp_t atp;
252 |   int i;
253 | 
254 |   assert(rdesc->send_atp == 1);
255 | 
256 |   status = uct_ep_fence(comm->base.uct_ep->ep, 0);
257 |   if (status != UCS_OK) {
258 |     return;
259 |   }
260 | 
261 |   atp.id    = rdesc->desc.id;
262 |   atp.rdesc = rdesc->desc.peer_rdesc;
263 |   atp.count = rdesc->desc.count;
264 | 
265 |   /* Sizes from isend() are lower or equal to their irecv() side */
266 |   for (i = 0; i < rdesc->desc.count; i++) {
267 |     atp.sizes[i] = rdesc->reqs[i].size;
268 |   }
269 | 
270 |   status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_ATP,
271 |                            (uint64_t)comm->base.remote.comm, &atp, sizeof(atp));
272 |   if (status == UCS_OK) {
273 |     rdesc->send_atp = 0;
274 |   }
275 | }
276 | 
/*
 * Post the PUT for one isend() matched against chunk `i` of `rdesc`.
 *
 * comm      sending communicator
 * data/size local buffer and number of bytes to send
 * uct_memh  local memory handle for `data`
 * rdesc     received RTR descriptor holding the remote address and rkey
 * i         index of the matched chunk inside rdesc
 * request   out: the NCCL request on success, NULL when the PUT was not posted
 *
 * Always returns ncclSuccess; a NULL *request tells the caller nothing was
 * posted.
 */
static ncclResult_t nccl_uct_send(nccl_uct_wr_comm_t *comm, void *data,
                                  int size, nccl_uct_memh_t *uct_memh,
                                  nccl_uct_rdesc_t *rdesc, int i,
                                  void **request) {
  ucs_status_t status;
  uct_iov_t iov;
  nccl_uct_req_t *req;

  *request = NULL;

  /* Details for local data */
  iov.buffer = data;
  iov.length = size;
  iov.memh   = uct_memh->memh;
  iov.stride = iov.length;
  iov.count  = 1;

  assert(size <= rdesc->desc.chunk[i].size);

  req = nccl_uct_rdesc_get_req(rdesc, i, size); /* NCCL request */

  status = uct_ep_put_zcopy(comm->base.uct_ep->ep, &iov, 1,
                            (uint64_t)rdesc->desc.chunk[i].data,
                            rdesc->desc.chunk[i].rkey, &req->completion);

  if (status == UCS_OK) {
    /* Completed immediately: drop the count armed by nccl_uct_rdesc_get_req */
    req->completion.count--;
  } else if (status != UCS_INPROGRESS) {
    /* Not posted: leave *request NULL and the chunk unmatched */
    return ncclSuccess;
  }

  rdesc->desc.chunk[i].matched = 1;
  --rdesc->send_atp;

  if (rdesc->send_atp == 1) {
    ucs_list_del(&rdesc->list); /* all ->isend() were now matched */
    nccl_uct_send_atp(comm, rdesc);
  }

  *request = req;
  return ncclSuccess;
}
319 | 
320 | static ncclResult_t nccl_uct_wr_isend(void *send_comm, void *data, size_t size,
321 |                                       int tag, void *mhandle,  void* phandle, void **request) {
322 |   nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(send_comm);
323 |   nccl_uct_rdesc_t *rdesc;
324 |   int i;
325 | 
326 |   *request = NULL;
327 | 
328 |   ucs_list_for_each(rdesc, &comm->rdesc_list, list) {
329 |     for (i = 0; i < rdesc->desc.count; i++) {
330 |       if (rdesc->desc.chunk[i].matched || (rdesc->desc.chunk[i].tag != tag)) {
331 |         continue;
332 |       }
333 | 
334 |       return nccl_uct_send(comm, data, size, mhandle, rdesc, i, request);
335 |     }
336 |   }
337 | 
338 |   /* Progress here to make sure we receive non-solicited RTRs */
339 |   uct_worker_progress(comm->base.uct_worker->worker);
340 |   return ncclSuccess;
341 | }
342 | 
343 | static ncclResult_t nccl_uct_wr_irecv(void *recv_comm, int n, void **data,
344 |                                       size_t *sizes, int *tags, void **mhandles,
345 |                                       void** phandles, void **request) {
346 |   nccl_uct_wr_comm_t *comm   = nccl_uct_wr_comm_get(recv_comm);
347 |   nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandles;
348 |   nccl_uct_rdesc_t *rdesc;
349 |   ucs_status_t status;
350 | 
351 |   assert(n <= NCCL_UCX_UCT_MAX_RECVS);
352 | 
353 |   rdesc = nccl_uct_comm_rdesc_get(comm);
354 |   if (rdesc == NULL) {
355 |     return ncclInternalError;
356 |   }
357 | 
358 |   nccl_uct_rdesc_set(rdesc, comm->rdesc_id++, n, data, sizes, tags, uct_memh);
359 | 
360 |   status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_RTR,
361 |                            (uint64_t)comm->base.remote.comm, &rdesc->desc,
362 |                            nccl_uct_rdesc_size(n));
363 |   if (status != UCS_OK) {
364 |     nccl_uct_comm_rdesc_put(rdesc);
365 |     *request = NULL;
366 |   } else {
367 |     /* Wait for receiving ATP */
368 |     *request = nccl_uct_rdesc_get_req(rdesc, 0, NCCL_UCT_REQ_IRECV);
369 |   }
370 | 
371 |   return ncclSuccess;
372 | }
373 | 
/*
 * ->iflush(): start a flush for the buffer selected by nccl_uct_flush_index().
 *
 * When that helper returns -1 there is nothing to do and ncclSuccess is
 * returned without touching *request. Otherwise an empty descriptor
 * (zero chunks) is set up to carry the request, and nccl_uct_flush() is asked
 * to flush data[last]; its result is returned.
 */
static ncclResult_t nccl_uct_wr_iflush(void *recv_comm, int n, void **data,
                                       int *sizes, void **mhandle,
                                       void **request) {
  nccl_uct_comm_t *base_comm = recv_comm;
  int last                   = nccl_uct_flush_index(base_comm, sizes, n);
  nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandle;
  nccl_uct_rdesc_t *rdesc;
  nccl_uct_req_t *req;
  ncclResult_t result;

  if (last == -1) {
    return ncclSuccess;
  }

  rdesc = nccl_uct_comm_rdesc_get(nccl_uct_wr_comm_get(base_comm));
  if (rdesc == NULL) {
    return ncclInternalError;
  }

  /* n == 0, so none of the NULL array arguments is dereferenced */
  nccl_uct_rdesc_set(rdesc, ~0, 0, NULL, NULL, NULL, NULL);
  /* Wait for local GET completion */
  req      = nccl_uct_rdesc_get_req(rdesc, 0, NCCL_UCT_REQ_IFLUSH);
  *request = req;

  /* nccl_uct_flush() is defined elsewhere; presumably it clears *request
   * when no flush was started - TODO confirm. */
  result = nccl_uct_flush(base_comm, data[last], sizes[last], uct_memh[last],
                          &req->completion, request);
  if (*request == NULL) {
    /* No request outstanding: the descriptor is not needed */
    nccl_uct_comm_rdesc_put(rdesc);
  }

  return result;
}
406 | 
/*
 * Poll a request for completion.
 *
 * request: request previously stored in *request by isend/irecv/iflush
 * done:    set to 1 when the request has completed, 0 otherwise
 * sizes:   optional output; receives the per-chunk sizes for an irecv
 *          request, or the single size for an isend request
 *
 * Always returns ncclSuccess. Once the request is reported done, its
 * reference on the parent descriptor is dropped and the descriptor is
 * released when no reference remains.
 */
static ncclResult_t nccl_uct_wr_test(void *request, int *done, int *sizes) {
  nccl_uct_req_t *req      = request;
  nccl_uct_rdesc_t *rdesc  = req->rdesc;
  nccl_uct_wr_comm_t *comm = rdesc->comm;

  uct_worker_progress(comm->base.uct_worker->worker);

  *done = 0;

  if (rdesc->send_atp == 1) {
    /* Slowpath */
    /* NOTE(review): nccl_uct_send_atp() presumably clears send_atp once the
     * ATP was actually sent -- confirm against its definition */
    nccl_uct_send_atp(comm, rdesc);

    if (rdesc->send_atp && rdesc->nccl_usage == 1) {
      /* Keep the last isend request until ATP is out */
      return ncclSuccess;
    }
  }

  if (req->completion.count > 0) {
    /* Operations attached to this request are still outstanding */
    return ncclSuccess;
  }

  *done = 1;

  /* req->size doubles as the request type: the two NCCL_UCT_REQ_* markers
   * identify irecv and iflush, anything else is the size of an isend */
  if (req->size == NCCL_UCT_REQ_IRECV) {
    assert(&rdesc->reqs[0] == req);
    if (sizes != NULL) {
      memcpy(sizes, rdesc->sizes, rdesc->desc.count * sizeof(*sizes));
    }
  } else if (req->size == NCCL_UCT_REQ_IFLUSH) {
    assert(&rdesc->reqs[0] == req);
  } else {
    /* ->isend() request */
    assert(req->size > -1);
    if (sizes != NULL) {
      sizes[0] = req->size;
    }
  }

  /* Drop this request's reference; the last one releases the descriptor */
  if (--rdesc->nccl_usage < 1) {
    assert(rdesc->send_atp == 0);
    assert(rdesc->nccl_usage == 0);
    nccl_uct_comm_rdesc_put(rdesc);
  }

  return ncclSuccess;
}
455 | 
456 | static ncclResult_t nccl_uct_wr_close(void *close_comm) {
457 |   nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(close_comm);
458 |   nccl_uct_rdesc_t *rdesc;
459 | 
460 |   nccl_uct_comm_deinit(close_comm);
461 | 
462 |   while ((rdesc = comm->free_rdesc) != NULL) {
463 |     comm->free_rdesc = rdesc->next;
464 |     free(rdesc);
465 |   }
466 | 
467 |   assert(ucs_list_is_empty(&comm->rdesc_list));
468 |   assert(comm->rdesc_alloc == 0);
469 |   free(comm);
470 |   return ncclSuccess;
471 | }
472 | 
473 | 
474 | static ncclResult_t nccl_uct_wr_init_v9(ncclDebugLogger_t logFunction) {
475 |   return nccl_uct_wr_init(logFunction, NULL);
476 | }
477 | 
478 | static ncclResult_t nccl_uct_wr_isend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
479 |   return nccl_uct_wr_isend(sendComm, data, size, tag, mhandle, NULL, request);
480 | }
481 | 
482 | static ncclResult_t nccl_uct_wr_irecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
483 |   return nccl_uct_wr_irecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
484 | }
485 | 
486 | static ncclResult_t nccl_uct_wr_isend_v8(void *send_comm, void *data, int size,
487 |                                       int tag, void *mhandle, void **request) {
488 |   return nccl_uct_wr_isend_v9(send_comm, data, (size_t)size, tag, mhandle, request);
489 | }
490 | 
491 | static ncclResult_t nccl_uct_wr_irecv_v8(void *recv_comm, int n, void **data,
492 |                                       int *sizes, int *tags, void **mhandles,
493 |                                       void **request) {
494 |   size_t sizes_sizet[NCCL_NET_IB_MAX_RECVS];
495 |   for (int i=0; i<n; i++) sizes_sizet[i] = sizes[i];
496 |   return nccl_uct_wr_irecv_v9(recv_comm, n, data, sizes_sizet, tags, mhandles, request);
497 | }
498 | 
/*
 * Plugin descriptors with external linkage, one per supported NCCL net API
 * version (v5 to v10). Each one is produced by the matching
 * NCCL_UCT_PLUGIN_V* macro from the plugin name and the nccl_uct_wr prefix.
 * NOTE(review): the macros are defined outside this file; presumably they
 * paste the prefix onto the per-version entry points -- confirm.
 */
ncclNet_v10_t ucxUctPlugin_v10 = NCCL_UCT_PLUGIN_V10("UCX-UCT", nccl_uct_wr);
ncclNet_v9_t ucxUctPlugin_v9 = NCCL_UCT_PLUGIN_V9("UCX-UCT", nccl_uct_wr);
ncclNet_v8_t ucxUctPlugin_v8 = NCCL_UCT_PLUGIN_V8("UCX-UCT", nccl_uct_wr);
ncclNet_v7_t ucxUctPlugin_v7 = NCCL_UCT_PLUGIN_V7("UCX-UCT", nccl_uct_wr);
ncclNet_v6_t ucxUctPlugin_v6 = NCCL_UCT_PLUGIN_V6("UCX-UCT", nccl_uct_wr);
ncclNet_v5_t ucxUctPlugin_v5 = NCCL_UCT_PLUGIN_V5("UCX-UCT", nccl_uct_wr);
505 | 


--------------------------------------------------------------------------------
/src/ucx_uct_rd_plugin.c:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #include "ucx_uct_lib.h"
 10 | #include "ucx_uct_ring.h"
 11 | 
/*
 * Capacity of the per-communicator array of GET operations that are matched
 * but not posted yet. Positions are computed as (counter & mask), which only
 * wraps correctly while the size is a power of two.
 */
#define NCCL_UCT_PENDING_SIZE 128
#define NCCL_UCT_PENDING_MASK (NCCL_UCT_PENDING_SIZE - 1)
 14 | 
/*
 * Memory chunk to send or receive.
 *
 * The same layout serves two purposes in this file: isend() fills it and
 * sends it as the RTS payload, while irecv() fills it to describe one posted
 * receive buffer. Entries of both kinds are stored in the tag matching rings.
 */
typedef struct {
  int                    tag;
  int                    size;
  void                   *data;
  union {
    uct_rkey_t           rkey;
    nccl_uct_memh_t      *uct_memh;
  } u; /* isend(): u.rkey is set, irecv(): u.uct_memh is set */
  struct nccl_uct_rd_req *req; /* Owning request; for an RTS this is the
                                  sender's request, kept as a cookie */
  unsigned               index; /* irecv(): position in the receive request */
} nccl_uct_mem_t;
 27 | 
/*
 * Context for GET requests to be posted.
 *
 * Filled by nccl_uct_rd_pending_add() and consumed by
 * nccl_uct_rd_pending_drain(), which passes these values to
 * uct_ep_get_zcopy().
 */
typedef struct {
  uct_iov_t              iov;  /* Local destination: receive buffer and memh */
  uint64_t               rva;  /* Remote address taken from the RTS */
  uct_rkey_t             rkey; /* Remote key taken from the RTS */
  struct nccl_uct_rd_req *req; /* Receive request whose completion is used */
} nccl_uct_get_param_t;
 35 | 
/*
 * Communicator for client or server side.
 *
 * Embeds the base communicator as its first named member so that
 * nccl_uct_rd_comm_get() can recover it from a base pointer.
 */
typedef struct nccl_uct_rd_comm {
  /* Base communicator with endpoints setup */
  nccl_uct_comm_t        base;

  /* NCCL request free list */
  int                    req_count; /* Requests currently handed out */
  struct nccl_uct_rd_req *free_req; /* Head of the list of recycled requests */

  /* TAG matching rings */
  nccl_uct_ring_t        exp;   /* Posted receives waiting for an RTS */
  nccl_uct_ring_t        unexp; /* Received RTS waiting for a receive */

  /* GET zcopy for matched chunks, but yet to be posted */
  struct {
    unsigned             first; /* Next entry to post, masked on use */
    unsigned             last;  /* Next entry to fill, masked on use */
    nccl_uct_get_param_t param[NCCL_UCT_PENDING_SIZE];
  } pending;
} nccl_uct_rd_comm_t;
 56 | 
/*
 * Either irecv, isend or iflush NCCL request.
 *
 * Requests are allocated by nccl_uct_rd_req_alloc() and recycled through the
 * communicator free list by nccl_uct_rd_req_free().
 */
typedef struct nccl_uct_rd_req {
  uct_completion_t       completion; /* Release when count equals zero */
  int                    send_rts;   /* Request type: 1 for isend(), 0 for
                                        irecv(), -1 for iflush() */
  nccl_uct_rd_comm_t     *comm;      /* Parent communicator */
  struct nccl_uct_rd_req *next;      /* Free list node */

  int                    count;     /* isend(): 1, irecv(): from 1 to n */
  int                    rts_count; /* RTS actually received and matched */

  /* Sizes actually read to report, received from RTS */
  int                    sizes[NCCL_UCX_UCT_MAX_RECVS];

  /* Remote completed requests cookies, to send with ATS */
  struct nccl_uct_rd_req *remote_req[NCCL_UCX_UCT_MAX_RECVS];
} nccl_uct_rd_req_t;
 73 | 
 74 | static inline nccl_uct_rd_comm_t *
 75 | nccl_uct_rd_comm_get(nccl_uct_comm_t *base_comm) {
 76 |   return ucs_container_of(base_comm, nccl_uct_rd_comm_t, base);
 77 | }
 78 | 
 79 | static void nccl_uct_rd_send_ats(nccl_uct_rd_req_t *req) {
 80 |   ucs_status_t status;
 81 | 
 82 |   assert(req->send_rts == 0);
 83 |   assert(req->rts_count == req->count);
 84 |   assert(req->completion.count == 1);
 85 | 
 86 |   status = uct_ep_am_short(req->comm->base.uct_ep->ep, NCCL_UCT_AM_ATS,
 87 |                            (uint64_t)req->comm->base.remote.comm,
 88 |                            req->remote_req,
 89 |                            sizeof(*req->remote_req) * req->rts_count);
 90 |   if (status == UCS_OK) {
 91 |     req->completion.count--;
 92 |   }
 93 | }
 94 | 
 95 | static void nccl_uct_rd_pending_add(nccl_uct_rd_comm_t *comm,
 96 |                                     nccl_uct_mem_t *src, nccl_uct_mem_t *dst) {
 97 |   nccl_uct_rd_req_t *req = dst->req;
 98 |   nccl_uct_get_param_t *param;
 99 | 
100 |   assert(src->size <= dst->size);
101 |   assert(req->rts_count < NCCL_UCX_UCT_MAX_RECVS);
102 | 
103 |   req->sizes[dst->index]            = src->size;
104 |   req->remote_req[req->rts_count++] = src->req; /* src->req is a cookie */
105 | 
106 |   if (src->size == 0) {
107 |     req->completion.count--;
108 |     return;
109 |   }
110 | 
111 |   param = &comm->pending.param[comm->pending.last & NCCL_UCT_PENDING_MASK];
112 |   comm->pending.last++;
113 | 
114 |   assert((comm->pending.first & NCCL_UCT_PENDING_MASK) !=
115 |          (comm->pending.last & NCCL_UCT_PENDING_MASK));
116 | 
117 |   param->iov.buffer = dst->data;
118 |   param->iov.length = src->size;
119 |   param->iov.memh   = dst->u.uct_memh->memh;
120 |   param->iov.stride = 0;
121 |   param->iov.count  = 1;
122 |   param->rva        = (uint64_t)src->data;
123 |   param->rkey       = src->u.rkey;
124 |   param->req        = req;
125 | }
126 | 
127 | static void nccl_uct_rd_pending_drain(nccl_uct_rd_comm_t *comm) {
128 |   ucs_status_t status;
129 |   nccl_uct_get_param_t *param;
130 | 
131 |   for (; comm->pending.first != comm->pending.last; comm->pending.first++) {
132 |     param = &comm->pending.param[comm->pending.first & NCCL_UCT_PENDING_MASK];
133 | 
134 |     status = uct_ep_get_zcopy(comm->base.uct_ep->ep, &param->iov, 1, param->rva,
135 |                               param->rkey, &param->req->completion);
136 |     if (status == UCS_OK) {
137 |       param->req->completion.count--;
138 |     } else if (status != UCS_INPROGRESS) {
139 |       break;
140 |     }
141 | 
142 |     if (param->req->completion.count == 1) {
143 |       nccl_uct_rd_send_ats(param->req);
144 |     }
145 |   }
146 | }
147 | 
148 | static ucs_status_t nccl_uct_rd_ats_callback(void *arg, void *data,
149 |                                              size_t length, unsigned flags) {
150 |   nccl_uct_rd_req_t **req  = (nccl_uct_rd_req_t **)((uint8_t *)data + 8);
151 |   nccl_uct_rd_req_t **end  = (nccl_uct_rd_req_t **)((uint8_t *)data + length);
152 | 
153 |   for (; req + 1 <= end; req++) {
154 |     assert((*req)->completion.count == 1);
155 |     assert((*req)->comm == nccl_uct_rd_comm_get(*(nccl_uct_comm_t**)data));
156 | 
157 |     (*req)->completion.count = 0;
158 |   }
159 | 
160 |   assert(req == end);
161 |   return UCS_OK;
162 | }
163 | 
164 | static ucs_status_t nccl_uct_rd_rts_callback(void *arg, void *data,
165 |                                              size_t length, unsigned flags) {
166 | 
167 |   nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(*(nccl_uct_comm_t**)data);
168 |   nccl_uct_mem_t *rts      = (nccl_uct_mem_t *)((uint8_t *)data + 8);
169 |   nccl_uct_ring_t *exp;
170 |   nccl_uct_mem_t *dst;
171 |   unsigned i;
172 | 
173 |   assert(length == (sizeof(*rts) + 8));
174 | 
175 |   /* Do we already expect it? */
176 |   exp = &comm->exp;
177 |   i   = nccl_uct_ring_find(exp, rts->tag);
178 |   if (i == exp->last) {
179 |     nccl_uct_ring_append(&comm->unexp, rts->tag, rts, sizeof(*rts));
180 |   } else {
181 |     /* Receive request was already posted */
182 |     dst = nccl_uct_ring_get_entry(exp, i);
183 |     nccl_uct_rd_pending_add(comm, rts, dst);
184 |     nccl_uct_ring_consume(exp, i);
185 |   }
186 | 
187 |   return UCS_OK;
188 | }
189 | 
190 | static ncclResult_t nccl_uct_rd_iface_set(nccl_uct_iface_t *uct_iface) {
191 |   NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_RTS,
192 |                                        nccl_uct_rd_rts_callback));
193 |   NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_ATS,
194 |                                        nccl_uct_rd_ats_callback));
195 |   return ncclSuccess;
196 | }
197 | 
198 | static ncclResult_t nccl_uct_rd_comm_alloc(nccl_uct_comm_t **comm_p) {
199 |   nccl_uct_rd_comm_t *comm = calloc(1, sizeof(*comm));
200 |   if (comm != NULL) {
201 |     *comm_p = &comm->base;
202 |     return ncclSuccess;
203 |   }
204 | 
205 |   return ncclSystemError;
206 | }
207 | 
208 | static ncclResult_t nccl_uct_rd_comm_init(nccl_uct_comm_t *base_comm,
209 |                                           nccl_uct_context_t *context,
210 |                                           nccl_uct_worker_t *worker, int dev,
211 |                                           const nccl_uct_comm_t *remote_comm) {
212 |   nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(base_comm);
213 | 
214 |   comm->pending.first = 0;
215 |   comm->pending.last  = 0;
216 |   comm->req_count     = 0;
217 |   comm->free_req      = NULL;
218 | 
219 |   NCCLCHECK(nccl_uct_ring_init(&comm->exp, sizeof(nccl_uct_mem_t)));
220 |   NCCLCHECK(nccl_uct_ring_init(&comm->unexp, sizeof(nccl_uct_mem_t)));
221 | 
222 |   return nccl_uct_comm_init(&comm->base, context, worker, dev, remote_comm);
223 | }
224 | 
225 | static ncclResult_t nccl_uct_rd_init(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
226 |   NCCL_STATIC_ASSERT(NCCL_UCT_RING_SIZE >= 2 * MAX_REQUESTS,
227 |                      "Cannot handle expected/unexpected requests");
228 |   NCCL_STATIC_ASSERT(NCCL_UCT_PENDING_SIZE > MAX_REQUESTS,
229 |                      "Cannot handle enough pending requests");
230 | 
231 |   context.ops.comm_alloc = nccl_uct_rd_comm_alloc;
232 |   context.ops.comm_init  = nccl_uct_rd_comm_init;
233 |   context.ops.iface_set  = nccl_uct_rd_iface_set;
234 |   context.rkey_size      = sizeof(((nccl_uct_mem_t*)0)->u.rkey);
235 |   context.am_short_size  = sizeof(((nccl_uct_rd_req_t*)0)->remote_req);
236 |   if (sizeof(nccl_uct_mem_t) > context.am_short_size) {
237 |     context.am_short_size = sizeof(nccl_uct_mem_t);
238 |   }
239 | 
240 |   return nccl_p2p_ib_init(&context.dev_count, &context.merge_dev_count, ncclIbDevs, context.if_name,
241 |                           &context.if_addr, NULL, logFunction);
242 | }
243 | 
244 | static nccl_uct_rd_req_t *nccl_uct_rd_req_alloc(nccl_uct_rd_comm_t *comm,
245 |                                                 int count) {
246 |   nccl_uct_rd_req_t *req = comm->free_req;
247 | 
248 |   if (req == NULL) {
249 |     req = malloc(sizeof(*req));
250 |     if (req == NULL) {
251 |       return req;
252 |     }
253 |   } else {
254 |     comm->free_req = req->next;
255 |   }
256 | 
257 |   comm->req_count++;
258 |   req->comm              = comm;
259 |   req->completion.func   = nccl_uct_empty_callback;
260 |   req->completion.count  = count;
261 |   req->completion.status = UCS_OK;
262 |   return req;
263 | }
264 | 
265 | static inline void nccl_uct_rd_req_free(nccl_uct_rd_req_t *req) {
266 |   req->next           = req->comm->free_req;
267 |   req->comm->free_req = req;
268 |   req->comm->req_count--;
269 | }
270 | 
271 | static ncclResult_t nccl_uct_rd_isend(void *send_comm, void *data, size_t size,
272 |                                       int tag, void *mhandle, void* phandle, void **request) {
273 | 
274 |   nccl_uct_rd_comm_t *comm  = nccl_uct_rd_comm_get(send_comm);
275 |   nccl_uct_memh_t *uct_memh = mhandle;
276 |   nccl_uct_mem_t rts;
277 |   nccl_uct_rd_req_t *req;
278 |   ucs_status_t status;
279 | 
280 |   req = nccl_uct_rd_req_alloc(comm, 1);
281 |   if (req == NULL) {
282 |     *request = NULL;
283 |     return ncclSuccess;
284 |   }
285 | 
286 |   req->send_rts = 1;
287 |   req->count    = 1;
288 |   req->sizes[0] = size;
289 |   *request      = req;
290 | 
291 |   rts.tag    = tag;
292 |   rts.size   = size;
293 |   rts.data   = data;
294 |   rts.u.rkey = uct_memh->bundle.rkey;
295 |   rts.req    = req;
296 | 
297 |   status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_RTS,
298 |                            (uint64_t)comm->base.remote.comm, &rts, sizeof(rts));
299 |   if (status != UCS_OK) {
300 |     nccl_uct_rd_req_free(req);
301 |     *request = NULL;
302 |   }
303 | 
304 |   return ncclSuccess;
305 | }
306 | 
307 | static ncclResult_t nccl_uct_rd_irecv(void *recv_comm, int n, void **data,
308 |                                       size_t *sizes, int *tags, void **mhandles,
309 |                                       void** phandles, void **request) {
310 |   nccl_uct_rd_comm_t *comm   = nccl_uct_rd_comm_get(recv_comm);
311 |   nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandles;
312 |   nccl_uct_ring_t *unexp;
313 |   nccl_uct_rd_req_t *req;
314 |   nccl_uct_mem_t *rts, recv;
315 |   unsigned i, j;
316 | 
317 |   assert(n <= NCCL_UCX_UCT_MAX_RECVS);
318 | 
319 |   /* Create a request */
320 |   req      = nccl_uct_rd_req_alloc(comm, n + 1);
321 |   *request = req;
322 |   if (req == NULL) {
323 |     return ncclSuccess;
324 |   }
325 | 
326 |   req->send_rts  = 0;
327 |   req->count     = n;
328 |   req->rts_count = 0;
329 | 
330 |   /* Try to match or build expected list */
331 |   for (i = 0; i < n; i++) {
332 |     recv.tag        = tags[i];
333 |     recv.size       = sizes[i];
334 |     recv.data       = data[i];
335 |     recv.u.uct_memh = uct_memh[i];
336 |     recv.req        = req;
337 |     recv.index      = i;
338 | 
339 |     unexp = &comm->unexp;
340 |     j     = nccl_uct_ring_find(unexp, tags[i]);
341 |     if (j == unexp->last) {
342 |       nccl_uct_ring_append(&comm->exp, tags[i], &recv, sizeof(recv));
343 |     } else {
344 |       rts = nccl_uct_ring_get_entry(unexp, j);
345 |       nccl_uct_rd_pending_add(comm, rts, &recv);
346 |       nccl_uct_ring_consume(unexp, j);
347 |     }
348 |   }
349 | 
350 |   return ncclSuccess;
351 | }
352 | 
353 | static ncclResult_t nccl_uct_rd_iflush(void *recv_comm, int n, void **data,
354 |                                        int *sizes, void **mhandle,
355 |                                        void **request) {
356 |   ncclResult_t result        = ncclSuccess;
357 |   nccl_uct_comm_t *base_comm = recv_comm;
358 |   nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandle;
359 |   int last                   = nccl_uct_flush_index(base_comm, sizes, n);
360 |   nccl_uct_rd_req_t *req;
361 | 
362 |   *request = NULL;
363 | 
364 |   if (last != -1) {
365 |     req = nccl_uct_rd_req_alloc(nccl_uct_rd_comm_get(recv_comm), 1);
366 |     if (req != NULL) {
367 |       req->send_rts = -1;
368 |       *request      = req;
369 | 
370 |       result = nccl_uct_flush(base_comm, data[last], sizes[last],
371 |                               uct_memh[last], &req->completion, request);
372 |       if (*request == NULL) {
373 |         nccl_uct_rd_req_free(req);
374 |       }
375 |     }
376 |   }
377 | 
378 |   return result;
379 | }
380 | 
/*
 * Poll a request for completion.
 *
 * request: request previously stored in *request by isend/irecv/iflush
 * done:    set to 1 when the request has completed, 0 otherwise
 * sizes:   optional output; receives req->count sizes for isend and irecv
 *          requests, left untouched for iflush requests
 *
 * Always returns ncclSuccess. A completed request is returned to the free
 * list, so it must not be used again after *done is set to 1.
 */
static ncclResult_t nccl_uct_rd_test(void *request, int *done, int *sizes) {
  nccl_uct_rd_req_t *req = request;

  /* Progress until the worker reports that no more work was done */
  while (uct_worker_progress(req->comm->base.uct_worker->worker))
    ; /* empty */

  /* Post the GET operations queued by the matching code */
  nccl_uct_rd_pending_drain(req->comm);

  if (req->completion.count > 0) {
    /* A receive request left with one count only has its ATS to send */
    if ((req->send_rts == 0) && (req->completion.count == 1)) {
      nccl_uct_rd_send_ats(req);
    }

    if (req->completion.count > 0) {
      *done = 0;
      return ncclSuccess;
    }
  }

  /* send_rts is -1 for iflush requests, which have no size to report */
  if ((sizes != NULL) && (req->send_rts > -1)) {
    memcpy(sizes, req->sizes, req->count * sizeof(*req->sizes));
  }

  *done = 1;
  nccl_uct_rd_req_free(req);
  return ncclSuccess;
}
408 | 
409 | static ncclResult_t nccl_uct_rd_close(void *close_comm) {
410 |   nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(close_comm);
411 |   nccl_uct_rd_req_t *req;
412 | 
413 |   nccl_uct_comm_deinit(close_comm);
414 | 
415 |   while ((req = comm->free_req) != NULL) {
416 |     comm->free_req = req->next;
417 |     free(req);
418 |   }
419 | 
420 |   assert(nccl_uct_ring_is_empty(&comm->exp));
421 |   assert(nccl_uct_ring_is_empty(&comm->unexp));
422 |   assert(comm->req_count == 0);
423 |   assert(comm->pending.first == comm->pending.last);
424 | 
425 |   nccl_uct_ring_deinit(&comm->exp);
426 |   nccl_uct_ring_deinit(&comm->unexp);
427 |   free(comm);
428 |   return ncclSuccess;
429 | }
430 | 
431 | static ncclResult_t nccl_uct_rd_init_v9(ncclDebugLogger_t logFunction) {
432 |   return nccl_uct_rd_init(logFunction, NULL);
433 | }
434 | 
435 | static ncclResult_t nccl_uct_rd_isend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
436 |   return nccl_uct_rd_isend(sendComm, data, size, tag, mhandle, NULL, request);
437 | }
438 | 
439 | static ncclResult_t nccl_uct_rd_irecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
440 |   return nccl_uct_rd_irecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
441 | }
442 | 
443 | static ncclResult_t nccl_uct_rd_isend_v8(void *send_comm, void *data, int size,
444 |                                       int tag, void *mhandle, void **request) {
445 |   return nccl_uct_rd_isend_v9(send_comm, data, (size_t)size, tag, mhandle, request);
446 | }
447 | 
448 | static ncclResult_t nccl_uct_rd_irecv_v8(void *recv_comm, int n, void **data,
449 |                                       int *sizes, int *tags, void **mhandles,
450 |                                       void **request) {
451 |   size_t sizes_sizet[NCCL_NET_IB_MAX_RECVS];
452 |   for (int i=0; i<n; i++) sizes_sizet[i] = sizes[i];
453 |   return nccl_uct_rd_irecv_v9(recv_comm, n, data, sizes_sizet, tags, mhandles, request);
454 | }
455 | 
/*
 * Plugin descriptors with external linkage, one per supported NCCL net API
 * version (v5 to v10). Each one is produced by the matching
 * NCCL_UCT_PLUGIN_V* macro from the plugin name and the nccl_uct_rd prefix.
 * NOTE(review): the macros are defined outside this file; presumably they
 * paste the prefix onto the per-version entry points -- confirm.
 */
ncclNet_v10_t ucxUctRdPlugin_v10 = NCCL_UCT_PLUGIN_V10("UCX-UCT-RD", nccl_uct_rd);
ncclNet_v9_t ucxUctRdPlugin_v9 = NCCL_UCT_PLUGIN_V9("UCX-UCT-RD", nccl_uct_rd);
ncclNet_v8_t ucxUctRdPlugin_v8 = NCCL_UCT_PLUGIN_V8("UCX-UCT-RD", nccl_uct_rd);
ncclNet_v7_t ucxUctRdPlugin_v7 = NCCL_UCT_PLUGIN_V7("UCX-UCT-RD", nccl_uct_rd);
ncclNet_v6_t ucxUctRdPlugin_v6 = NCCL_UCT_PLUGIN_V6("UCX-UCT-RD", nccl_uct_rd);
ncclNet_v5_t ucxUctRdPlugin_v5 = NCCL_UCT_PLUGIN_V5("UCX-UCT-RD", nccl_uct_rd);
462 | 


--------------------------------------------------------------------------------
/src/utils.c:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
  3 |  * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: BSD-3-Clause
  5 |  *
  6 |  * See LICENSE.txt for license information
  7 |  ************************************************************************/
  8 | 
  9 | #define _GNU_SOURCE
 10 | #include <unistd.h>
 11 | #include <string.h>
 12 | #include <stdio.h>
 13 | #include <stdarg.h>
 14 | #include <ctype.h>
 15 | #include <fcntl.h>
 16 | #include <dlfcn.h>
 17 | #include <assert.h>
 18 | #include <stdbool.h>
 19 | #include "utils.h"
 20 | #include "core.h"
 21 | #include "param.h"
 22 | 
 23 | // Allocate memory to be potentially ibv_reg_mr'd. This needs to be
 24 | // allocated on separate pages as those pages will be marked DONTFORK
 25 | // and if they are shared, that could cause a crash in a child process
 26 | ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
 27 |   size_t page_size = sysconf(_SC_PAGESIZE);
 28 |   void* p;
 29 |   int size_aligned = ROUNDUP(size, page_size);
 30 |   int ret = posix_memalign(&p, page_size, size_aligned);
 31 |   if (ret != 0) return ncclSystemError;
 32 |   memset(p, 0, size);
 33 |   *ptr = p;
 34 |   return ncclSuccess;
 35 | }
 36 | 
 37 | ncclResult_t ncclRealloc(void **ptr, size_t oldNelem, size_t nelem) {
 38 |   if (nelem < oldNelem) return ncclInternalError;
 39 |   if (nelem == oldNelem) return ncclSuccess;
 40 | 
 41 |   void* oldp = *ptr;
 42 |   void* p = (void*)malloc(nelem);
 43 |   if (p == NULL) {
 44 |     WARN("Failed to malloc %ld bytes", nelem);
 45 |     return ncclSystemError;
 46 |   }
 47 |   memcpy(p, oldp, oldNelem);
 48 |   free(oldp);
 49 |   memset(p+oldNelem, 0, (nelem-oldNelem));
 50 |   *ptr = p;
 51 |   INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem, nelem, *ptr);
 52 |   return ncclSuccess;
 53 | }
 54 | 
 55 | 
 56 | int parseStringList(const char* string, struct netIf* ifList, int maxList) {
 57 |   if (!string) return 0;
 58 | 
 59 |   const char* ptr = string;
 60 | 
 61 |   int ifNum = 0;
 62 |   int ifC = 0;
 63 |   char c;
 64 |   do {
 65 |     c = *ptr;
 66 |     if (c == ':') {
 67 |       if (ifC > 0) {
 68 |         ifList[ifNum].prefix[ifC] = '\0';
 69 |         ifList[ifNum].port = atoi(ptr+1);
 70 |         ifNum++; ifC = 0;
 71 |       }
 72 |       while (c != ',' && c != '\0') c = *(++ptr);
 73 |     } else if (c == ',' || c == '\0') {
 74 |       if (ifC > 0) {
 75 |         ifList[ifNum].prefix[ifC] = '\0';
 76 |         ifList[ifNum].port = -1;
 77 |         ifNum++; ifC = 0;
 78 |       }
 79 |     } else {
 80 |       ifList[ifNum].prefix[ifC] = c;
 81 |       ifC++;
 82 |     }
 83 |     ptr++;
 84 |   } while (ifNum < maxList && c);
 85 |   return ifNum;
 86 | }
 87 | 
 88 | static int matchIf(const char* string, const char* ref, int matchExact) {
 89 |   // Make sure to include '\0' in the exact case
 90 |   int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
 91 |   return strncmp(string, ref, matchLen) == 0;
 92 | }
 93 | 
 94 | static int matchPort(const int port1, const int port2) {
 95 |   if (port1 == -1) return 1;
 96 |   if (port2 == -1) return 1;
 97 |   if (port1 == port2) return 1;
 98 |   return 0;
 99 | }
100 | 
101 | 
102 | int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact) {
103 |   // Make an exception for the case where no user list is defined
104 |   if (listSize == 0) return 1;
105 | 
106 |   for (int i=0; i<listSize; i++) {
107 |     if (matchIf(string, ifList[i].prefix, matchExact)
108 |         && matchPort(port, ifList[i].port)) {
109 |       return 1;
110 |     }
111 |   }
112 |   return 0;
113 | }
114 | 
// Return the path of the object file that contains this function, as
// reported by dladdr(), or NULL when dladdr() fails.
const char *get_plugin_lib_path()
{
  Dl_info info;

  if (dladdr((void*)&get_plugin_lib_path, &info) == 0) {
    return NULL;
  }

  return info.dli_fname;
}
125 | 
// Parameter queried through ncclParamSetThreadName(); thread naming is only
// performed when it evaluates to 1.
// NOTE(review): NCCL_PARAM is defined outside this file; presumably the
// string is the environment variable suffix and 0 the default -- confirm.
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
127 | 
128 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
129 |   // pthread_setname_np is nonstandard GNU extension
130 |   // needs the following feature test macro
131 | #ifdef _GNU_SOURCE
132 |   if (ncclParamSetThreadName() != 1) return;
133 |   char threadName[NCCL_THREAD_NAMELEN];
134 |   va_list vargs;
135 |   va_start(vargs, fmt);
136 |   vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs);
137 |   va_end(vargs);
138 |   pthread_setname_np(thread, threadName);
139 | #endif
140 | }
141 | 


--------------------------------------------------------------------------------