├── .ci
├── Jenkinsfile
├── README.md
├── build_cli.sh
├── build_nccl_rdma_sharp_plugins.sh
├── ci_functions.sh
├── config-header-check.yml
├── configure_sharp.sh
├── ibdev2netdev
├── nccl_tests
├── publish_artefacts.sh
├── pushd_functions.sh
├── run_nccl_tests.sh
├── settings.sh
├── sharp_coll_test_wrapper
└── taskset
├── .clang-format
├── .github
└── workflows
│ └── nccl-sharp-plugin.yml
├── .gitignore
├── LICENSE
├── Makefile.am
├── README.md
├── autogen.sh
├── configure.ac
├── contrib
└── buildrpm.sh
├── debian
├── changelog.in
├── compat
├── control.in
├── copyright
├── nccl-rdma-sharp-plugins.postinst.in
├── nccl-rdma-sharp-plugins.prem.in
├── rules.in
└── source
│ └── format
├── include
├── core.h
├── debug.h
├── ibvwrap.h
├── nccl.h
├── net.h
├── net_device.h
├── net_v10.h
├── net_v5.h
├── net_v6.h
├── net_v7.h
├── net_v8.h
├── net_v9.h
├── p2p_plugin.h
├── param.h
├── socket.h
├── timer.h
├── ucx_uct_lib.h
├── ucx_uct_ring.h
└── utils.h
├── m4
├── sharp.m4
└── ucx.m4
├── nccl-rdma-sharp-plugins.pc.in
├── nccl-rdma-sharp-plugins.spec.in
└── src
├── Makefile.am
├── ib_plugin.c
├── ibvwrap.c
├── p2p_plugin.c
├── param.c
├── sharp_plugin.c
├── socket.c
├── ucx_plugin.c
├── ucx_rma_plugin.c
├── ucx_uct_lib.c
├── ucx_uct_plugin.c
├── ucx_uct_rd_plugin.c
└── utils.c
/.ci/Jenkinsfile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env groovy
2 | // Declarative pipeline for nccl-rdma-sharp-plugins CI: build the plugins, configure SHARP, run the NCCL tests, then stop SHARP.
3 | // Verified with Jenkins v2.190.2
4 |
5 | // TODO:
6 | // 1. Calculate taskset/affinity for the scripts based on total number of jenkins executors
7 | // 2. NCCL/CUDA/SHARP dependencies should be parameterized
8 | // 3. HPC-X OS/MOFED support matrix should be covered (e.g. docker-based)
9 | // 4. Add signal handlers in the scripts (e.g. to correctly handle Jenkins abort by timeout situations)
10 |
11 | pipeline {
12 | agent {label "ml-test-node-gpu"} // run only on agents carrying this label
13 | // Job-level options.
14 | options {
15 | buildDiscarder(logRotator(numToKeepStr: '10')) // keep only the 10 most recent builds
16 | timeout(time: 90, unit: 'MINUTES') // abort the whole run after 90 minutes
17 | disableConcurrentBuilds() // never run two builds of this job at once
18 | }
19 | // Per-build paths. NFS_WORKSPACE_ROOT is not set in this file; it is expected to come from the agent/job environment.
20 | environment {
21 | NFS_WORKSPACE = "${NFS_WORKSPACE_ROOT}/ml-nccl-rdma-sharp-plugins-pr/${BUILD_NUMBER}"
22 | ARTEFACT_DIR = "${NFS_WORKSPACE}/artefacts"
23 | NCCL_RDMA_SHARP_PLUGINS_DIR = "${NFS_WORKSPACE}/nccl-rdma-sharp-plugins"
24 | NCCL_TESTS_DIR = "${NFS_WORKSPACE}/nccl-tests"
25 | }
26 | // Stages run in the order listed. The script stages use `set -o pipefail` so that a failing script fails the step even though its output is piped through tee into ARTEFACT_DIR.
27 | stages {
28 | stage('Preparations') {
29 | steps {
30 | echo 'Preparations...'
31 | sh 'mkdir -p ${ARTEFACT_DIR}'
32 | sh 'mkdir -p ${NFS_WORKSPACE}'
33 | }
34 | }
35 | stage('Build nccl-rdma-sharp-plugins') {
36 | steps {
37 | echo 'Building nccl-rdma-sharp-plugins...'
38 | sh """#!/bin/bash
39 | set -o pipefail
40 | ${WORKSPACE}/.ci/build_nccl_rdma_sharp_plugins.sh 2>&1 | tee ${ARTEFACT_DIR}/build_nccl_rdma_sharp_plugins.log
41 | """
42 | }
43 | }
44 | stage('Configure SHARP: startup') {
45 | steps {
46 | echo 'Configure SHARP: startup...'
47 | sh """#!/bin/bash
48 | set -o pipefail
49 | ${WORKSPACE}/.ci/configure_sharp.sh 2>&1 | tee ${ARTEFACT_DIR}/configure_sharp_startup.log
50 | """
51 | }
52 | }
53 | stage('Checkout NCCL tests') {
54 | steps {
55 | dir("${NCCL_TESTS_DIR}") { // clone nccl-tests into the per-build NFS directory
56 | git branch: 'master',
57 | url: 'https://github.com/NVIDIA/nccl-tests.git'
58 | }
59 | }
60 | }
61 | stage('Test nccl-rdma-sharp-plugins') {
62 | steps {
63 | echo 'Testing nccl-rdma-sharp-plugins...'
64 | sh """#!/bin/bash
65 | set -o pipefail
66 | ${WORKSPACE}/.ci/run_nccl_tests.sh 2>&1 | tee ${ARTEFACT_DIR}/run_nccl_test.log
67 | """
68 | }
69 | }
70 | stage('Configure SHARP: stop') { // NOTE(review): as a regular stage this is skipped when an earlier stage fails - confirm that is acceptable
71 | steps {
72 | echo 'Configure SHARP: stop...'
73 | sh """#!/bin/bash
74 | set -o pipefail
75 | ${WORKSPACE}/.ci/configure_sharp.sh stop 2>&1 | tee ${ARTEFACT_DIR}/configure_sharp_stop.log
76 | """
77 | }
78 | }
79 | }
80 | // Not needed, as there are no external contributors
81 | // post {
82 | // always {
83 | // echo 'Post-actions...'
84 | // sh '${WORKSPACE}/.ci/publish_artefacts.sh'
85 | // }
86 | // }
87 | }
88 |
--------------------------------------------------------------------------------
/.ci/README.md:
--------------------------------------------------------------------------------
1 | # nccl-rdma-sharp-plugins Continuous Integration (CI)
2 | ## Overview
3 | The nccl-rdma-sharp-plugins CI performs a sanity check on every code change. CI is started for each Pull Request (PR) and can additionally be triggered by writing the **bot:mlx:test** (or **bot:mlx:retest**) keyword in the PR comments. For users on the project whitelist, CI starts automatically; for everyone else, a project maintainer must approve the CI run by replying with the '**ok to test**' keyword.
4 | CI status and artefacts (log files) are published within the PR comments.
5 | ## Description
6 | CI includes the following steps:
7 | * Build nccl-rdma-sharp-plugins
8 | * Test nccl-rdma-sharp-plugins with [NCCL tests](https://github.com/nvidia/nccl-tests).
9 | The tests are run with [NVIDIA's NCCL](https://github.com/NVIDIA/nccl) library built within CI from the internal repository.
10 | ### Test Environment
11 | CI is run in the Mellanox lab on a 2-node cluster with the following parameters:
12 |
13 | Hardware
14 | * IB: 1x ConnectX-6 HCA (connected to Mellanox Quantum™ HDR switch)
15 | * GPU: 1x Nvidia Tesla K40m
16 |
17 | Software
18 | * Ubuntu 18.04.4
19 | * Internal stable MLNX_OFED, HPC-X and SHARP versions
--------------------------------------------------------------------------------
/.ci/build_cli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | . ./pushd_functions.sh
3 | . ./ci_functions.sh
4 | pushd /GIT
5 | case $1 in
6 | build)
7 | configure
8 | echo "Building NCCL sharp plugin"
9 | build
10 | ;;
11 | sharp)
12 | echo "Checking and configure sharp"
13 | sharp
14 | ;;
15 | test)
16 | echo "Running tests for NCCL sharp plugin"
17 | test
18 | ;;
19 | *)
20 | echo "Do nothing"
21 | ;;
22 | esac
23 |
--------------------------------------------------------------------------------
/.ci/build_nccl_rdma_sharp_plugins.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -leE
2 |
3 | SCRIPT_DIR="$(
4 | cd "$(dirname "$0")"
5 | pwd -P
6 | )"
7 | cd "${SCRIPT_DIR}"
8 | # shellcheck source=settings.sh
9 | . "${SCRIPT_DIR}/settings.sh"
10 |
11 | cd "${WORKSPACE}"
12 |
13 | if ! "${WORKSPACE}/autogen.sh"; then
14 | echo "ERROR: ${WORKSPACE}/autogen.sh failed"
15 | echo "FAIL"
16 | exit 1
17 | fi
18 |
19 | if ! "${WORKSPACE}/configure" \
20 | --prefix="${NCCL_RDMA_SHARP_PLUGINS_DIR}" \
21 | --with-cuda="${CUDA_HOME}" \
22 | --with-sharp="${HPCX_SHARP_DIR}"; then
23 | echo "ERROR: ${WORKSPACE}/configure failed"
24 | echo "FAIL"
25 | exit 1
26 | fi
27 |
28 | if ! make -j install; then
29 | echo "ERROR: 'make -j install' failed"
30 | echo "FAIL"
31 | exit 1
32 | fi
33 |
34 | if [ "$DEBUG" = "true" ]; then
35 | echo "INFO: ${NCCL_RDMA_SHARP_PLUGINS_DIR}:"
36 | # For debug purposes
37 | find "${NCCL_RDMA_SHARP_PLUGINS_DIR}" -type f
38 | fi
39 |
40 | echo "PASS"
41 |
--------------------------------------------------------------------------------
/.ci/ci_functions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Preparation a workplace & configs to CI
3 | function configure() {
4 | rm -rf "${NFS_WORKSPACE}-pr" || true
5 | rm -rf "${NFS_WORKSPACE}" || true
6 | rm -rf ./nccl-rdma-sharp-plugins/.ci/cfg/* || true
7 | cd "${NFS_WORKSPACE_ROOT}" || exit 1
8 | mkdir -p ./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf
9 | mkdir -p ./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf
10 |
11 | printf "%s\n%s\n" "${HOST1}" "${HOST2}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/hostfile
12 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharp_am.cfg
13 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharpd.cfg
14 | printf "%s\n" "${SHARP_AM_HOST}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST1}"/sharp_conf/sharp_am_node.txt
15 |
16 | printf "%s\n%s\n" "${HOST1}" "${HOST2}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/hostfile
17 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharp_am.cfg
18 | printf "log_verbosity 3\n" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharpd.cfg
19 | printf "%s\n" "${SHARP_AM_HOST}" >./nccl-rdma-sharp-plugins/.ci/cfg/"${HOST2}"/sharp_conf/sharp_am_node.txt
20 | }
21 |
22 | # Building NCCL rdma sharp plugin
23 | function build() {
24 | echo "Running build_nccl_rdma_sharp_plugins.sh..."
25 | "${WORKSPACE}"/.ci/build_nccl_rdma_sharp_plugins.sh && echo "Build SUCCESFULL !!!"
26 | }
27 |
28 | # Checking and configuring Sharp
29 | function sharp() {
30 | echo "Running configure_sharp.sh..."
31 | "${WORKSPACE}"/.ci/configure_sharp.sh && echo "Step configure_sharp SUCCESFULL !!!"
32 | }
33 |
34 | # Running of tests
35 | function test() {
36 | git clone --depth=1 https://github.com/NVIDIA/nccl-tests.git "${NFS_WORKSPACE}"/nccl-tests
37 | echo "Running run_nccl_tests.sh..."
38 | "${WORKSPACE}"/.ci/run_nccl_tests.sh && echo "Tests SUCCESFULL !!!"
39 | }
40 |
--------------------------------------------------------------------------------
/.ci/config-header-check.yml:
--------------------------------------------------------------------------------
1 | general:
2 | exclude:
3 | - "\\.git.*"
4 | - "\\.(yml|md|txt)"
5 | - "^\\.ci.*"
6 | - "\\.(m4|ac)"
7 | - "LICENSE"
8 | - "debian/copyright"
9 | - "debian/compat"
10 | - "debian/source/format"
11 |
12 | bsd:
13 | validate-spdx-license: true
14 | include:
15 | - ".*\\.(am|in|hpp|cpp|py|cc|h|c|sh)$"
16 |
--------------------------------------------------------------------------------
/.ci/configure_sharp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 | SCRIPT_DIR="$(
3 |     cd "$(dirname "$0")" || exit 1
4 |     pwd -P
5 | )"
6 | cd "${SCRIPT_DIR}" || exit 1
7 | # shellcheck source=settings.sh
8 | . "${SCRIPT_DIR}/settings.sh"
9 | # The plugin install prefix is required to build LD_LIBRARY_PATH below.
10 | if [ -z "${NCCL_RDMA_SHARP_PLUGINS_DIR}" ]; then
11 |     echo "ERROR: NCCL_RDMA_SHARP_PLUGINS_DIR is not defined"
12 |     echo "FAIL"
13 |     exit 1
14 | fi
15 |
16 | export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${NCCL_RDMA_SHARP_PLUGINS_DIR}/lib:${LD_LIBRARY_PATH}"
17 |
18 | # 1 - run sanity tests, 0 - do not run
19 | VERIFY_SHARP_ENABLE=${VERIFY_SHARP_ENABLE:-1}
20 | # Use NCCL_DIR when provided; otherwise load NCCL through the environment module.
21 | if [ -z "${NCCL_DIR}" ]; then
22 |     module load dev/nccl-nightly-stable
23 | else
24 |     export LD_LIBRARY_PATH="${NCCL_DIR}/lib:${LD_LIBRARY_PATH}"
25 | fi
26 |
27 | # Available values: start|stop|restart
28 | SHARP_MANAGER_ACTION="${1:-restart}"
29 | echo "INFO: SHARP_MANAGER_ACTION = ${SHARP_MANAGER_ACTION}"
30 |
31 | echo "INFO: NFS_WORKSPACE = ${NFS_WORKSPACE}"
32 |
33 | if [ -z "${NFS_WORKSPACE}" ]; then
34 |     echo "ERROR: NFS_WORKSPACE is not defined"
35 |     echo "FAIL"
36 |     exit 1
37 | fi
38 |
39 | if [ -z "${HPCX_SHARP_DIR}" ]; then
40 |     echo "ERROR: HPCX_SHARP_DIR is not defined"
41 |     echo "FAIL"
42 |     exit 1
43 | fi
44 | # NOTE(review): the next line overrides the HPCX_SHARP_DIR value that was just validated - confirm this is intended.
45 | HPCX_SHARP_DIR=/opt/mellanox/sharp
46 | CONFIGURE_SHARP_TMP_DIR="${NFS_WORKSPACE}/configure_sharp_$$"
47 | mkdir -p "${CONFIGURE_SHARP_TMP_DIR}"
48 | chmod o+w "${CONFIGURE_SHARP_TMP_DIR}"
49 |
50 | export SHARP_CONF="${CONFIGURE_SHARP_TMP_DIR}"
51 | export SHARP_INI_FILE="${SHARP_CONF}/sharp_manager.ini"
52 | # Seed the temporary SHARP config directory with this host's files from CFG_DIR.
53 | cp -R "${CFG_DIR}/$HOSTNAME/sharp_conf/"* "${SHARP_CONF}"
54 |
55 | if [ -f "${SHARP_CONF}/sharp_am_node.txt" ]; then
56 |     SHARP_AM_NODE=$(cat "${SHARP_CONF}/sharp_am_node.txt")
57 |     echo "INFO: SHARP_AM_NODE = ${SHARP_AM_NODE}"
58 | else
59 |     echo "ERROR: ${SHARP_CONF}/sharp_am_node.txt does not exist or not accessible"
60 |     echo "FAIL"
61 |     exit 1
62 | fi
63 |
64 | IB_DEV=$(/GIT/ibdev2netdev | awk '{ print $1 }'):1
65 | SM_GUID=$(sudo sminfo -C "${IB_DEV}" -P1 | awk '{print $7}' | cut -d',' -f1)
66 | # SM/AM node
67 | # SM_HOSTNAME=`sudo ibnetdiscover -H -C mlx5_0 -P1 | grep ${SM_GUID} | awk -F'"' '{print $2 }' | awk '{print $1}'`
68 | HOSTS=$(cat "$HOSTFILE" | xargs | tr ' ' ',')
69 |
70 | echo "INFO: IB_DEV = ${IB_DEV}"
71 | echo "INFO: SM_GUID = ${SM_GUID}"
72 | # echo "INFO: SM_HOSTNAME = ${SM_HOSTNAME}"
73 | echo "INFO: HOSTS = ${HOSTS}"
74 |
75 | rm -f "${SHARP_INI_FILE}"
76 |
77 | cat >${SHARP_INI_FILE} </dev/null"
113 | if [ $? -ne 0 ]; then
114 | echo "ERROR: wrong value of routing_engine parameter in ${OPENSM_CONFIG}"
115 | echo "Should be (example): routing_engine updn"
116 | echo "FAIL"
117 | exit 1
118 | fi
119 |
120 | ssh "${SHARP_AM_NODE}" "grep \"sharp_enabled.*2\" ${OPENSM_CONFIG} 2>/dev/null"
121 | if [ $? -ne 0 ]; then
122 | echo "ERROR: wrong value of sharp_enabled parameter in ${OPENSM_CONFIG}"
123 | echo "Should be (example): sharp_enabled 2"
124 | echo "FAIL"
125 | exit 1
126 | fi
127 |
128 | echo "INFO: check_opensm_conf on ${SHARP_AM_NODE}... DONE"
129 | }
130 |
131 | # Run one verification command and abort the whole script when it fails.
132 | #   $1 - test number used in the log messages
133 | #   $2 - message printed when the command exits non-zero
134 | #   $3 - command line; deliberately expanded unquoted so it is split into words
135 | run_sharp_test() {
136 |     local test_id="$1"
137 |     local fail_msg="$2"
138 |     local cmd_line="$3"
139 |
140 |     echo "${GH_FOLD}# Test ${test_id}..."
141 |     echo "INFO: Test ${test_id} command line:"
142 |     trim_multiple_spaces "${cmd_line}"
143 |     if ! ${cmd_line}; then
144 |         echo "${fail_msg}"
145 |         echo "FAIL"
146 |         exit 1
147 |     fi
148 |     echo "${GH_UNFOLD}"
149 |     echo "Test ${test_id}... DONE"
150 | }
151 |
152 | # Build the SHARP example collectives test in CONFIGURE_SHARP_TMP_DIR and run
153 | # eight sanity tests on the hosts from HOSTFILE/HOSTS. Any failure prints
154 | # "FAIL" and exits the script with status 1.
155 | verify_sharp() {
156 |     echo "INFO: verify_sharp..."
157 |
158 |     cp "${HPCX_SHARP_DIR}"/share/sharp/examples/mpi/coll/* "${CONFIGURE_SHARP_TMP_DIR}"
159 |     # The script's shebang does not enable `set -e`, so a failed cd is checked
160 |     # explicitly; otherwise make and the tests would run in the wrong directory.
161 |     if ! cd "${CONFIGURE_SHARP_TMP_DIR}"; then
162 |         echo "ERROR: verify_sharp cannot enter ${CONFIGURE_SHARP_TMP_DIR}"
163 |         echo "FAIL"
164 |         exit 1
165 |     fi
166 |     if ! make CUDA=1 CUDA_HOME="${CUDA_HOME}" SHARP_HOME="${HPCX_SHARP_DIR}"; then
167 |         echo "ERROR: verify_sharp make failed"
168 |         echo "FAIL"
169 |         exit 1
170 |     fi
171 |
172 |     cp "${WORKSPACE}/.ci/sharp_coll_test_wrapper" ./
173 |     ITERS=100
174 |     SKIP=20
175 |     NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
176 |
177 |     # -mca coll_hcoll_enable 0 - disable HCOLL
178 |     MPIRUN_COMMON_OPTIONS="\
179 |         -np $NP \
180 |         -H $HOSTS \
181 |         --map-by node \
182 |         -x LD_LIBRARY_PATH \
183 |         --allow-run-as-root \
184 |         -mca oob_tcp_if_exclude eth0 \
185 |         "
186 |
187 |     # TODO change to SHARP_COLL_SAT_THRESHOLD=1 (32 - W/A for SHARP issue)
188 |     MPIRUN_SHARP_OPTIONS="\
189 |         -x SHARP_COLL_LOG_LEVEL=3 \
190 |         -x ENABLE_SHARP_COLL=1 \
191 |         -x SHARP_COLL_SAT_THRESHOLD=32 \
192 |         -x SHARP_COLL_ENABLE_SAT=1 \
193 |         "
194 |
195 |     echo "Environment for the reproducer:"
196 |     echo "export PATH=$PATH"
197 |     echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
198 |     echo "export OPAL_PREFIX=${OPAL_PREFIX}"
199 |
200 |     # Command prefix shared by tests 1-6 (sharp_coll_test_wrapper in perf mode).
201 |     local coll_perf="mpirun \
202 |         ${MPIRUN_COMMON_OPTIONS} \
203 |         ${MPIRUN_SHARP_OPTIONS} \
204 |         ${CONFIGURE_SHARP_TMP_DIR}/sharp_coll_test_wrapper \
205 |         --iters $ITERS \
206 |         --skip $SKIP \
207 |         --mode perf"
208 |
209 |     # Test 1 (from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README):
210 |     # Run allreduce barrier perf test on 2 hosts using port mlx5_0
211 |     CMD="${coll_perf} --collectives allreduce,barrier"
212 |     run_sharp_test 1 "ERROR: verify_sharp Test 1 failed" "$CMD"
213 |
214 |     # Test 2 (from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README):
215 |     # Run allreduce perf test on 2 hosts using port mlx5_0 with CUDA buffers
216 |     CMD="${coll_perf} --collectives allreduce -M cuda"
217 |     run_sharp_test 2 "ERROR: verify_sharp Test 2 failed" "$CMD"
218 |
219 |     # Test 3 (from ${HPCX_SHARP_DIR}/share/sharp/examples/mpi/coll/README):
220 |     # Run allreduce perf test on 2 hosts using port mlx5_0 with Streaming aggregation from 4B to 512MB
221 |     CMD="${coll_perf} --collectives allreduce -s 4:536870912"
222 |     run_sharp_test 3 "ERROR: verify_sharp Test 3 failed" "$CMD"
223 |
224 |     # Test 4:
225 |     # Run iallreduce perf test on 2 hosts using port mlx5_0
226 |     CMD="${coll_perf} --collectives iallreduce -N 128"
227 |     run_sharp_test 4 "ERROR: verify_sharp Test 4 failed" "$CMD"
228 |
229 |     # Test 5:
230 |     # Run iallreduce perf test on 2 hosts using port mlx5_0 with CUDA buffers
231 |     CMD="${coll_perf} --collectives iallreduce -N 128 -M cuda"
232 |     run_sharp_test 5 "ERROR: verify_sharp Test 5 failed" "$CMD"
233 |
234 |     # Test 6:
235 |     # Run iallreduce perf test on 2 hosts using port mlx5_0 with message sizes 4:131072 (-s)
236 |     CMD="${coll_perf} --collectives iallreduce -N 128 -s 4:131072"
237 |     run_sharp_test 6 "ERROR: verify_sharp Test 6 failed" "$CMD"
238 |
239 |     # Pieces shared by tests 7 and 8 (osu_allreduce through HCOLL); the
240 |     # argument order of the original command lines is preserved.
241 |     local osu_mca="\
242 |         -mca btl_openib_warn_default_gid_prefix 0 \
243 |         -mca rmaps_dist_device ${IB_DEV} \
244 |         -mca rmaps_base_mapping_policy dist:span"
245 |     local osu_env="\
246 |         -x MXM_LOG_LEVEL=ERROR \
247 |         -x HCOLL_ML_DISABLE_REDUCE=1 \
248 |         -x LD_LIBRARY_PATH \
249 |         -x HCOLL_ENABLE_SHARP=2 \
250 |         -x SHARP_COLL_LOG_LEVEL=3 \
251 |         -x SHARP_COLL_GROUP_RESOURCE_POLICY=1 \
252 |         -x SHARP_COLL_MAX_PAYLOAD_SIZE=256 \
253 |         -x HCOLL_SHARP_UPROGRESS_NUM_POLLS=999 \
254 |         -x HCOLL_BCOL_P2P_ALLREDUCE_SHARP_MAX=4096 \
255 |         -x SHARP_COLL_PIPELINE_DEPTH=32 \
256 |         -x SHARP_COLL_JOB_QUOTA_OSTS=32 \
257 |         -x SHARP_COLL_JOB_QUOTA_MAX_GROUPS=4 \
258 |         -x SHARP_COLL_JOB_QUOTA_PAYLOAD_PER_OST=256"
259 |     local osu_app="\
260 |         ${WORKSPACE}/.ci/taskset -c 1 \
261 |         numactl --membind=0 \
262 |         $HPCX_OSU_DIR/osu_allreduce \
263 |         -i 100 \
264 |         -x 100 \
265 |         -f \
266 |         -m 4096:4096"
267 |
268 |     # Test 7 (from the SHARP deployment guide): Without SAT
269 |     CMD="$OMPI_HOME/bin/mpirun \
270 |         ${MPIRUN_COMMON_OPTIONS} \
271 |         --bind-to core \
272 |         ${osu_mca} \
273 |         ${osu_env} \
274 |         ${osu_app}"
275 |     run_sharp_test 7 "ERROR: Test 7 (without SAT) failed, check the log file" "$CMD"
276 |
277 |     # Test 8 (from the SHARP deployment guide): With SAT
278 |     CMD="$OMPI_HOME/bin/mpirun \
279 |         ${MPIRUN_COMMON_OPTIONS} \
280 |         ${osu_mca} \
281 |         -x MXM_ASYNC_INTERVAL=1800s \
282 |         ${osu_env} \
283 |         -x SHARP_COLL_ENABLE_SAT=1 \
284 |         ${osu_app}"
285 |     run_sharp_test 8 "ERROR: Test 8 (with SAT) failed, check the log file" "$CMD"
286 |
287 |     echo "INFO: verify_sharp... DONE"
288 | }
391 |
392 | # The OpenSM checks are skipped when SHARP is only being stopped.
393 | if [ "${SHARP_MANAGER_ACTION}" != "stop" ]; then
394 |     check_opensm_status
395 |     check_opensm_conf
396 | fi
397 |
398 | if ! sudo PDSH_RCMD_TYPE=ssh SHARP_INI_FILE="${SHARP_INI_FILE}" SHARP_CONF="${SHARP_CONF}" "${HPCX_SHARP_DIR}/sbin/sharp_manager.sh" "${SHARP_MANAGER_ACTION}" -l "$HOSTS" -s "${SHARP_AM_NODE}"; then
399 |     echo "ERROR: sharp_manager.sh failed, check the log file"
400 |     echo "FAIL"
401 |     exit 1
402 | fi
403 |
404 | if [ "${SHARP_MANAGER_ACTION}" != "stop" ] && [ "${VERIFY_SHARP_ENABLE}" -eq 1 ]; then
405 |     verify_sharp
406 | fi
407 | # Open up permissions with sudo first, then remove the temporary directory (paths quoted so they cannot be word-split).
408 | sudo chmod -R 777 "${CONFIGURE_SHARP_TMP_DIR}"
409 | rm -rf "${CONFIGURE_SHARP_TMP_DIR}"
410 |
411 | echo "PASS"
412 |
--------------------------------------------------------------------------------
/.ci/ibdev2netdev:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # ibdev2netdev doesn't work correctly inside a container. This wrapper is a workaround:
3 | # it prints one ibdev2netdev-style line derived from the last /dev/infiniband/umad* entry.
4 | shopt -s nullglob
5 | umad_devs=(/dev/infiniband/umad*)
6 | if [ "${#umad_devs[@]}" -eq 0 ]; then
7 |     # As before, a missing device is an error (previously reported by ls under -e).
8 |     echo "ERROR: no /dev/infiniband/umad* devices found" >&2
9 |     exit 1
10 | fi
11 | # Use the whole suffix after "umad" (not just the last character) so that
12 | # multi-digit device indices such as umad10 are handled.
13 | last_dev="${umad_devs[${#umad_devs[@]} - 1]}"
14 | N="${last_dev##*/umad}"
15 | if [ -e "/dev/infiniband/umad${N}" ]; then
16 |     # Keep the index out of the format string.
17 |     printf 'mlx5_%s port 1 ====> ib0\n' "${N}"
18 | fi
19 |
--------------------------------------------------------------------------------
/.ci/nccl_tests:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Wrapper to add correct parameters to the main app
3 | IB_DEV=$(ibdev2netdev | awk '{ print $1 }'):1
4 | ETH_DEV=$(ibdev2netdev | awk '{ print $5 }')
5 |
6 | export HCOLL_MAIN_IB=${IB_DEV}
7 | export NCCL_IB_HCA=${IB_DEV}
8 | export UCX_NET_DEVICES=${IB_DEV}
9 | export NCCL_SOCKET_IFNAME=${ETH_DEV}
10 | exec "${@}"
11 |
--------------------------------------------------------------------------------
/.ci/publish_artefacts.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -leE
2 | # Lists ARTEFACT_DIR and runs publish_artefacts_to_gist.py with the UPSTREAM_*
3 | # variables exported (each defaults to its non-prefixed counterpart).
4 |
5 | SCRIPT_DIR="$(cd "$(dirname "$0")"; pwd -P)"
6 | cd "${SCRIPT_DIR}"
7 | # shellcheck source=settings.sh
8 | . "${SCRIPT_DIR}/settings.sh"
9 |
10 | echo 'Publish artefacts...'
11 |
12 | # Assign the fallback only when the UPSTREAM_* value is unset or empty.
13 | : "${UPSTREAM_JOB_NAME:=${JOB_NAME}}"
14 | : "${UPSTREAM_BUILD_NUMBER:=${BUILD_NUMBER}}"
15 | : "${UPSTREAM_ghprbGhRepository:=${ghprbGhRepository}}"
16 | : "${UPSTREAM_ghprbPullId:=${ghprbPullId}}"
17 | export UPSTREAM_JOB_NAME UPSTREAM_BUILD_NUMBER UPSTREAM_ghprbGhRepository UPSTREAM_ghprbPullId
18 |
19 | ls -al "${ARTEFACT_DIR}"
20 |
21 | publish_artefacts_to_gist.py
22 |
23 | echo 'Publish artefacts... DONE'
24 |
--------------------------------------------------------------------------------
/.ci/pushd_functions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Quiet replacements for pushd/popd: same behaviour, with stdout discarded.
3 | function pushd {
4 |     command pushd "$@" 1>/dev/null
5 | }
6 |
7 | function popd {
8 |     command popd "$@" 1>/dev/null
9 | }
10 |
--------------------------------------------------------------------------------
/.ci/run_nccl_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -leE
2 | SCRIPT_DIR="$(
3 | cd "$(dirname "$0")"
4 | pwd -P
5 | )"
6 | cd "${SCRIPT_DIR}"
7 | # shellcheck source=settings.sh
8 | . "${SCRIPT_DIR}"/settings.sh
9 | # Set to 1 as soon as any test fails; checked at the end of the script.
10 | GLOBAL_TEST_STATUS=0
11 | # Use NCCL_DIR when provided; otherwise load NCCL through the environment module.
12 | if [ -z "${NCCL_DIR}" ]; then
13 | module load dev/nccl-nightly-stable
14 | else
15 | export LD_LIBRARY_PATH="${NCCL_DIR}/lib:${LD_LIBRARY_PATH}"
16 | fi
17 | # Both directories checked below are mandatory.
18 | if [ -z "${NCCL_RDMA_SHARP_PLUGINS_DIR}" ]; then
19 | echo "ERROR: NCCL_RDMA_SHARP_PLUGINS_DIR is not defined"
20 | echo "FAIL"
21 | exit 1
22 | fi
23 |
24 | if [ -z "${NCCL_TESTS_DIR}" ]; then
25 | echo "ERROR: NCCL_TESTS_DIR is not defined"
26 | echo "FAIL"
27 | exit 1
28 | fi
29 | # NP is passed to mpirun as -np in MPIRUN_OPTIONS_COMMON below.
30 | NP=2
31 | IB_DEV=$(ibdev2netdev | awk '{ print $1 }'):1 # NOTE(review): not referenced again in the visible part of this script - confirm it is still needed
32 | # UCX_MEMTYPE_CACHE=n - to avoid warnings "memtype_cache.c:83 UCX ERROR failed to insert region 0x1a1e890 [0x7f8d00000000..0x7f8d30000000]: Element already exists"
33 | MPIRUN_OPTIONS_COMMON="\
34 | -x LD_LIBRARY_PATH \
35 | -x NCCL_DEBUG=INFO \
36 | -x NCCL_DEBUG_SUBSYS=INIT \
37 | -x UCX_MEMTYPE_CACHE=n \
38 | -x HCOLL_ENABLE_SHARP=0 \
39 | -x HCOLL_ENABLE_MCAST_ALL=0 \
40 | -mca pml ucx \
41 | -mca coll_hcoll_enable 1 \
42 | --map-by node \
43 | --bind-to none \
44 | --hostfile ${HOSTFILE} \
45 | -np $NP \
46 | --report-bindings \
47 | --allow-run-as-root \
48 | -mca oob_tcp_if_exclude eth0 \
49 | "
50 |
51 | # Application options (combined into NCCL_TEST_PARAMS and appended to every test command)
52 | ITER=100
53 | WARMUP_ITER=100
54 | MSG_SIZE_MIN="8"
55 | MSG_SIZE_MAX="4M"
56 | NCCL_TEST_EXE=("all_reduce_perf" "all_gather_perf" "broadcast_perf" "reduce_perf" "reduce_scatter_perf" "alltoall_perf")
57 | NCCL_TEST_PARAMS=" -b ${MSG_SIZE_MIN} -e ${MSG_SIZE_MAX} -f 2 -g 1 -c 1 -z 1 -n $ITER -w $WARMUP_ITER -p 0 "
58 | ENABLE_SAT=${ENABLE_SAT:-1} # value passed as SHARP_COLL_ENABLE_SAT when SHARP is enabled; defaults to 1
59 | echo "INFO: ENABLE_SAT = ${ENABLE_SAT}"
60 | # Print a separator line made of '#' characters.
61 | function echo_hash_line {
62 |     printf '%s\n' "###############################################################################"
63 | }
64 |
65 | echo "CUDA_HOME: ${CUDA_HOME}"
66 | echo "NCCL_DIR: ${NCCL_DIR}"
67 | echo "NCCL_RDMA_SHARP_PLUGINS_DIR: ${NCCL_RDMA_SHARP_PLUGINS_DIR}"
68 | echo "MPI_HOME: ${MPI_HOME}"
69 |
70 | # Build NCCL-TESTS
71 | cd "${NCCL_TESTS_DIR}"
72 | make -j clean
73 | # Rebuild after the clean, with MPI support enabled (MPI=1).
74 | make -j CUDA_HOME="${CUDA_HOME}" NCCL_HOME="${NCCL_DIR}" MPI=1 MPI_HOME="${MPI_HOME}"
75 | # Put the CUDA and plugin install libraries first in the library search path for the test runs.
76 | export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${NCCL_RDMA_SHARP_PLUGINS_DIR}/lib:${LD_LIBRARY_PATH}"
77 | # Print $1 with every run of spaces squeezed into a single space.
78 | trim_multiple_spaces() {
79 |     echo "$1" | tr -s ' '
80 | }
81 |
82 | # USAGE: all_reduce_perf
83 | # [-t,--nthreads ]
84 | # [-g,--ngpus ]
85 | # [-b,--minbytes ]
86 | # [-e,--maxbytes ]
87 | # [-i,--stepbytes ]
88 | # [-f,--stepfactor ]
89 | # [-n,--iters ]
90 | # [-m,--agg_iters ]
91 | # [-w,--warmup_iters ]
92 | # [-p,--parallel_init <0/1>]
93 | # [-c,--check <0/1>]
94 | # [-o,--op ]
95 | # [-d,--datatype ]
96 | # [-r,--root ]
97 | # [-z,--blocking <0/1>]
98 | # [-h,--help]
99 |
100 | ###############################################################################
101 | # Run NCCL-TESTS (MPI)
102 | ###############################################################################
103 |
104 | i=1
105 |
106 | for TEST_EXE in ${NCCL_TEST_EXE[@]}; do
107 | #===================
108 | # NCCL_PLUGIN_P2P
109 | #===================
110 | # Enable ucx_rma tests once this is resolved: https://redmine.mellanox.com/issues/3037941
111 | # for P2P_LAYER in ucx ucx_rma ib
112 | for P2P_LAYER in ib ucx ucx_uct ucx_uct_read; do
113 | MPIRUN_OPTIONS_PLUGIN_P2P_LAYER="-x NCCL_PLUGIN_P2P=${P2P_LAYER}"
114 |
115 | #===================
116 | # NCCL_PROTO
117 | #===================
118 | for NCCL_PROTO in Simple LL DEFAULT; do
119 | if [ "${NCCL_PROTO}" = "DEFAULT" ]; then
120 | MPIRUN_OPTIONS_NCCL_PROTO=""
121 | else
122 | MPIRUN_OPTIONS_NCCL_PROTO="-x NCCL_PROTO=${NCCL_PROTO}"
123 | fi
124 |
125 | #===================
126 | # NCCL_ALGO
127 | #===================
128 | for NCCL_ALGO in CollNet Tree Ring DEFAULT; do
129 | if [ "${NCCL_ALGO}" = "CollNet" ] && [ "${TEST_EXE}" != "all_reduce_perf" ]; then
130 | # test sharp plugin only with all_reduce_perf
131 | continue
132 | fi
133 |
134 | if [ "${NCCL_ALGO}" = "DEFAULT" ]; then
135 | MPIRUN_OPTIONS_NCCL_ALGO=""
136 | else
137 | MPIRUN_OPTIONS_NCCL_ALGO="-x NCCL_ALGO=${NCCL_ALGO}"
138 | fi
139 |
140 | if [ "${NCCL_ALGO}" = "CollNet" ]; then
141 | MPIRUN_OPTIONS_NCCL_ALGO="-x NCCL_COLLNET_ENABLE=1"
142 | fi
143 |
144 | #===================
145 | # SHARP_ENABLE
146 | #===================
147 | for SHARP_ENABLE in 0 1; do
148 | if { [ "${NCCL_ALGO}" = "Tree" ] || [ "${NCCL_ALGO}" = "Ring" ]; } && [ "$SHARP_ENABLE" = "1" ]; then
149 | # skip sharp enable 1 for tree and ring algorithms
150 | continue
151 | fi
152 | if [ "${SHARP_ENABLE}" = "0" ]; then
153 | MPIRUN_OPTIONS_SHARP=""
154 | else
155 | MPIRUN_OPTIONS_SHARP="\
156 | -x SHARP_COLL_LOG_LEVEL=3 \
157 | -x SHARP_COLL_ENABLE_SAT=${ENABLE_SAT} \
158 | "
159 | fi
160 |
161 | #===================
162 | # NCCL_NET_GDR_LEVEL
163 | #===================
164 | # for NCCL_NET_GDR_LEVEL in 0 1 2 3 4 5 DEFAULT
165 | for NCCL_NET_GDR_LEVEL in DEFAULT; do
166 | if [ "${NCCL_NET_GDR_LEVEL}" = "DEFAULT" ]; then
167 | MPIRUN_OPTIONS_GDR_LEVEL=""
168 | else
169 | MPIRUN_OPTIONS_GDR_LEVEL="-x NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL}"
170 | fi
171 |
172 | #===================
173 | # NCCL_NET_GDR_READ
174 | #===================
175 | # for NCCL_NET_GDR_READ in 0 1 DEFAULT
176 | for NCCL_NET_GDR_READ in DEFAULT; do
177 | if [ "${NCCL_NET_GDR_READ}" = "DEFAULT" ]; then
178 | MPIRUN_OPTIONS_GDR_READ=""
179 | else
180 | MPIRUN_OPTIONS_GDR_READ="-x NCCL_NET_GDR_READ=${NCCL_NET_GDR_READ}"
181 | fi
182 |
183 | echo_hash_line
184 | echo "${GH_FOLD}{# Test $i...}"
185 | echo_hash_line
186 |
187 | echo "INFO: TEST = ${TEST_EXE}"
188 | echo "INFO: P2P_LAYER = ${P2P_LAYER}"
189 | echo "INFO: NCCL_PROTO = ${NCCL_PROTO}"
190 | echo "INFO: NCCL_ALGO = ${NCCL_ALGO}"
191 | echo "INFO: SHARP_ENABLE = ${SHARP_ENABLE}"
192 | echo "INFO: NCCL_NET_GDR_LEVEL = ${NCCL_NET_GDR_LEVEL}"
193 | echo "INFO: NCCL_NET_GDR_READ = ${NCCL_NET_GDR_READ}"
194 |
195 | CMD="mpirun \
196 | ${MPIRUN_OPTIONS_COMMON} \
197 | ${MPIRUN_OPTIONS_NCCL_PROTO} \
198 | ${MPIRUN_OPTIONS_NCCL_ALGO} \
199 | ${MPIRUN_OPTIONS_SHARP} \
200 | ${MPIRUN_OPTIONS_GDR_LEVEL} \
201 | ${MPIRUN_OPTIONS_GDR_READ} \
202 | ${MPIRUN_OPTIONS_PLUGIN_P2P_LAYER} \
203 | ${WORKSPACE}/.ci/nccl_tests ${NCCL_TESTS_DIR}/build/${TEST_EXE} ${NCCL_TEST_PARAMS}"
204 | echo "# Test $i reproducer:"
205 | echo "export PATH=${PATH}"
206 | echo ""
207 | echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
208 | echo ""
209 | echo "export OPAL_PREFIX=${OPAL_PREFIX}"
210 | echo ""
211 | trim_multiple_spaces "$CMD"
212 | if ! $CMD; then
213 | echo "${GH_UNFOLD}"
214 | echo "# Test $i... failed"
215 | GLOBAL_TEST_STATUS=1
216 | else
217 | echo "${GH_UNFOLD}"
218 | echo "# Test $i... passed"
219 | fi
220 |
221 | i=$((i + 1))
222 | done
223 | done
224 | done
225 | done
226 | done
227 | done
228 | done
229 |
230 | ###############################################################################
231 | if [ ${GLOBAL_TEST_STATUS} -ne 0 ]; then
232 | echo "ERROR: some tests failed, check the log file"
233 | echo "FAIL"
234 | exit 1
235 | else
236 | echo "All tests PASSED"
237 | fi
238 |
239 | echo "PASS"
240 |
--------------------------------------------------------------------------------
/.ci/settings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -leE
2 | # Shared CI settings, sourced by the other .ci scripts. Loads HPC-X, resolves
3 | # WORKSPACE and the derived paths, and fails when HOSTFILE or HPCX_DIR is missing.
4 |
5 | # Formatting for GitHub Actions fold/unfold
6 | GH=${GH:-0}
7 | if [ "${GH}" -eq 1 ]; then
8 |     GH_FOLD="::group::"
9 |     GH_UNFOLD="::endgroup::"
10 | fi
11 | # PLUGINS
12 | echo "INFO: DEBUG = $DEBUG"
13 | # NOTE(review): DEBUG is forced to false right after being printed, so DEBUG=true
14 | # from the environment never enables tracing - confirm this is intended.
15 | DEBUG=false
16 | if [ "$DEBUG" = "true" ]; then
17 |     set -x
18 | fi
19 |
20 | # W/A for SHARP
21 | # CUDA 10.2 is the latest available version we would like to test, CUDA 10.1 is needed for SHARP
22 | # (because HPC-X is built with CUDA 10.1).
23 | # CUDA 10.2 has priority in the env PATH/LD_LIBRARY_PATH.
24 |
25 | # TODO remove use HPC-X which is already inside the image
26 |
27 | #module load /hpc/local/etc/modulefiles/dev/cuda-latest
28 | HPCX_UBUNTU_INSTALL_DIR=${HPCX_UBUNTU_INSTALL_DIR:-/hpc/noarch/HPCX/unpacked/hpcx-v2.17-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64/}
29 | module load "${HPCX_UBUNTU_INSTALL_DIR}"/modulefiles/hpcx-ompi
30 | # . "${HPCX_UBUNTU_INSTALL_DIR}/hpcx-init.sh"
31 | # hpcx_load
32 |
33 | # It is needed to disable nccl_rdma_sharp_plugin libs from HPC-X
34 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH//nccl_rdma_sharp_plugin/nccl_rdma_sharp_pluginX}"
35 | export LD_LIBRARY_PATH
36 | CUDA_HOME=/usr/local/cuda
37 | #export UCX_NET_DEVICES=$(ibdev2netdev | awk '{print $1}'):1
38 |
39 | # Resolve WORKSPACE before anything is derived from it: when the caller did not
40 | # set it, NCCL_RDMA_SHARP_PLUGINS_DIR would otherwise default to "/_install".
41 | TOP_DIR="$(git rev-parse --show-toplevel)"
42 | echo "INFO: TOP_DIR = ${TOP_DIR}"
43 | WORKSPACE="${WORKSPACE:-${TOP_DIR}}"
44 |
45 | export NCCL_RDMA_SHARP_PLUGINS_DIR="${NCCL_RDMA_SHARP_PLUGINS_DIR:-${WORKSPACE}/_install}"
46 | echo "INFO: NCCL_RDMA_SHARP_PLUGINS_DIR = ${NCCL_RDMA_SHARP_PLUGINS_DIR}"
47 |
48 | echo "INFO: CUDA_VER = ${CUDA_VER}"
49 | echo "INFO: CUDA_HOME = ${CUDA_HOME}"
50 | echo "INFO: HPCX_SHARP_DIR = ${HPCX_SHARP_DIR}"
51 | echo "INFO: HPCX_DIR = ${HPCX_DIR}"
52 | echo "INFO: WORKSPACE = ${WORKSPACE}"
53 |
54 | HOSTNAME=$(hostname -s)
55 | echo "INFO: HOSTNAME = $HOSTNAME"
56 |
57 | # Per-host CI configuration lives under .ci/cfg/<short hostname>/.
58 | CFG_DIR="${WORKSPACE}/.ci/cfg"
59 | HOSTFILE=${CFG_DIR}/$HOSTNAME/hostfile
60 |
61 | if [ ! -f "${HOSTFILE}" ]; then
62 |     echo "ERROR: ${HOSTFILE} doesn't exist or not accessible"
63 |     echo "FAIL"
64 |     exit 1
65 | fi
66 |
67 | if [ ! -d "${HPCX_DIR}" ]; then
68 |     echo "ERROR: ${HPCX_DIR} does not exist or not accessible"
69 |     echo "FAIL"
70 |     exit 1
71 | fi
72 |
--------------------------------------------------------------------------------
/.ci/sharp_coll_test_wrapper:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Wrapper: passes "<first column of ibdev2netdev output>:1" as -d to ./sharp_coll_test and forwards all other arguments unchanged
3 | TEST_DEV="$(ibdev2netdev | awk '{ print $1 }'):1"
4 | echo "${UCX_NET_DEVICES}"
5 | ./sharp_coll_test -d "${TEST_DEV}" "${@}"
6 |
--------------------------------------------------------------------------------
/.ci/taskset:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | # Exports the ibdev2netdev-derived "<device>:1" as MXM_RDMA_PORTS and HCOLL_MAIN_IB, then runs /usr/bin/taskset with the caller's arguments
3 | ib_dev="$(ibdev2netdev | awk '{ print $1 }')"
4 | TEST_DEV="${ib_dev}:1"
5 | export MXM_RDMA_PORTS="${TEST_DEV}" HCOLL_MAIN_IB="${TEST_DEV}"
6 | /usr/bin/taskset "${@}"
7 |
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | IndentWidth: 2
2 | AlignEscapedNewlines: Indent
3 | AlignConsecutiveAssignments: true
4 | AlignConsecutiveDeclarations: false
5 | AlignConsecutiveStructMembers: true
6 | AlignConsecutiveMacros: true
7 | AlignDeclarationByPointer: true
8 | AlignAfterOpenBracket: true
9 | AlignOperands: true
10 |
--------------------------------------------------------------------------------
/.github/workflows/nccl-sharp-plugin.yml:
--------------------------------------------------------------------------------
1 | name: NCCL Sharp plugin CI
2 | on:
3 | workflow_dispatch:
4 | inputs:
5 | mainhost:
6 | description: 'Choose one of hosts to run:'
7 | required: true
8 | type: choice
9 | default: 'host01'
10 | options:
11 | - host01
12 | - host02
13 | push:
14 | branches: ['*']
15 | pull_request:
16 | branches: ['*']
17 | jobs:
18 | deployment:
19 | runs-on: [self-hosted, linux, x64]
20 | steps:
21 | - uses: actions/checkout@v3
22 | - name: Deployment infrastructure
23 | run: /start deploy
24 | build:
25 | needs: [deployment]
26 | runs-on: [self-hosted, linux, x64]
27 | steps:
28 | - name: Building NCCL RDMA sharp plugin
29 | run: /start build
30 | sharp_config:
31 | needs: [deployment, build]
32 | runs-on: [self-hosted, linux, x64]
33 | steps:
34 | - name: Configuring and checking Sharp
35 | run: /start sharp
36 | testing:
37 | needs: [sharp_config]
38 | runs-on: [self-hosted, linux, x64]
39 | steps:
40 | - name: Running tests
41 | run: /start test
42 | clean:
43 | if: ${{ always() }}
44 | needs: [testing]
45 | runs-on: [self-hosted, linux, x64]
46 | steps:
47 | - name: Cleaning
48 | run: /start clean
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .gitignore
2 | .project
3 | .cproject
4 | .settings
5 | test/test
6 | Makefile
7 | Makefile.in
8 | aclocal.m4
9 | compile
10 | config.guess
11 | config.h.in
12 | config.h.in~
13 | config.sub
14 | m4/libtool.m4
15 | m4/ltoptions.m4
16 | m4/ltsugar.m4
17 | m4/ltversion.m4
18 | m4/lt~obsolete.m4
19 | config/aux
20 | configure
21 | install-sh
22 | ltmain.sh
23 | missing
24 | config.h
25 | config.log
26 | config.status
27 | libtool
28 | stamp-h1
29 | src/sharp/api/version.h
30 | autom4te.cache
31 | depcomp
32 | .libs
33 | *.la
34 | .deps
35 | .dirstamp
36 | *.lo
37 | *.o
38 | build-*
39 | sharp*tar.gz
40 | rpm-dist
41 | cov_build*
42 | debian/changelog
43 | debian/control
44 | debian/rules
45 | debian/sharp.postinst
46 | debian/nccl-rdma-sharp-plugins.postinst
47 | debian/nccl-rdma-sharp-plugins.prem
48 | sharp.spec
49 | sharp.pc
50 | doc/doxygen-doc
51 | doc/uml/uct.pdf
52 | test-driver
53 | install
54 | src/api/version.h
55 | tags
56 | valgrind*xml
57 | *.tap
58 | jenkins/*
59 | sharp-*
60 | config.cache
61 | nccl-rdma-sharp-plugins.pc
62 | nccl-rdma-sharp-plugins.spec
63 |
64 | # Prerequisites
65 | *.d
66 |
67 | # Compiled Object files
68 | *.slo
69 | *.lo
70 | *.o
71 | *.obj
72 |
73 | # Precompiled Headers
74 | *.gch
75 | *.pch
76 |
77 | # Compiled Dynamic libraries
78 | *.so
79 | *.dylib
80 | *.dll
81 |
82 | # Fortran module files
83 | *.mod
84 | *.smod
85 |
86 | # Compiled Static libraries
87 | *.lai
88 | *.la
89 | *.a
90 | *.lib
91 |
92 | # Executables
93 | *.exe
94 | *.out
95 | *.app
96 |
97 | .idea
98 |
99 | *.orig
100 | *.bak
101 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions
5 | are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright
10 | notice, this list of conditions and the following disclaimer in the
11 | documentation and/or other materials provided with the distribution.
12 | 3. Neither the name of the copyright holder nor the names of its
13 | contributors may be used to endorse or promote products derived from
14 | this software without specific prior written permission.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # See file LICENSE for terms.
7 | #
8 |
9 | SUBDIRS = src
10 |
11 | EXTRA_DIST =
12 | EXTRA_DIST += autogen.sh
13 | EXTRA_DIST += include
14 | EXTRA_DIST += debian
15 | EXTRA_DIST += nccl-rdma-sharp-plugins.spec
16 |
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # nccl-rdma-sharp-plugins
2 |
3 | The nccl-rdma-sharp plugin enables RDMA and switch-based collectives (SHARP)
4 | with [NVIDIA's NCCL](https://github.com/NVIDIA/nccl) library.
5 |
6 | ## Overview
7 |
8 | ## Requirements
9 |
10 | * MOFED
11 | * CUDA
12 | * SHARP
13 | * NCCL
14 | * GPUDirectRDMA plugin
15 |
16 | ## Build Instructions
17 |
18 | ### build system requirements
19 |
20 | * CUDA
21 | * SHARP
22 | * MOFED
23 |
24 | Plugin uses GNU autotools for its build system. You can build it as follows:
25 |
26 |
27 | ```
28 | $ ./autogen.sh
29 | $ ./configure
30 | $ make
31 | $ make install
32 | ```
33 |
34 | The following configure flags allow building against dependencies installed in non-standard locations:
35 |
36 |
37 | ```
38 | --with-verbs=PATH Path to non-standard libibverbs installation
39 | --with-sharp=PATH Path to non-standard SHARP installation
40 | --with-cuda=PATH Path to non-standard CUDA installation
41 | ```
42 |
43 |
44 |
--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 | # SPDX-License-Identifier: BSD-3-Clause
6 | #
7 | # Redistribution and use in source and binary forms, with or without
8 | # modification, are permitted provided that the following conditions are met:
9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its
18 | # contributors may be used to endorse or promote products derived from
19 | # this software without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 |
33 | rm -rf autom4te.cache
34 | autoreconf -ivf || exit 1
35 | rm -rf autom4te.cache
36 |
--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | #
4 | # See file LICENSE for terms.
5 | #
6 | AC_PREREQ([2.63])
7 |
8 | AC_COPYRIGHT([Copyright (c) 2019, NVIDIA CORPORATION & AFFILIATES. All rights reserved.])
9 |
10 | define([nccl_rdma_sharp_plugins_ver_major], 2)
11 | define([nccl_rdma_sharp_plugins_ver_minor], 7)
12 |
13 | AC_INIT([nccl-rdma-sharp-plugins], [nccl_rdma_sharp_plugins_ver_major.nccl_rdma_sharp_plugins_ver_minor], [support@mellanox.com], [],[http://github.com/Mellanox/nccl-rdma-sharp-plugins])
14 |
15 | AM_INIT_AUTOMAKE([1.10 foreign tar-ustar subdir-objects])
16 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
17 | AM_MAINTAINER_MODE
18 | AC_CONFIG_MACRO_DIR([m4])
19 |
20 | AC_USE_SYSTEM_EXTENSIONS
21 | AC_GNU_SOURCE
22 | AC_CONFIG_HEADERS([config.h])
23 |
24 | RPM_RELEASE=1
25 | MAJOR_VERSION=nccl_rdma_sharp_plugins_ver_major
26 | MINOR_VERSION=nccl_rdma_sharp_plugins_ver_minor
27 | VERSION=$MAJOR_VERSION.$MINOR_VERSION
28 |
29 | AC_SUBST(RPM_RELEASE)
30 | AC_SUBST(VERSION)
31 | AC_SUBST(MAJOR_VERSION)
32 | AC_SUBST(MINOR_VERSION)
33 | AC_SUBST([BUILD_DATE], [$(date +'%b/%d/%Y')])
34 | AC_SUBST([BUILD_TIME], [$(date +'%H:%M:%S')])
35 |
36 | # Checks for programs.
37 | AC_GNU_SOURCE
38 | AC_PROG_CC
39 | AC_PROG_CC_STDC
40 | AC_PROG_CXX
41 | AM_PROG_AS
42 | AC_PROG_SED
43 | AC_PROG_INSTALL
44 | AC_PROG_LIBTOOL
45 | AC_HEADER_STDC
46 | LT_LIB_M
47 |
48 | AC_ARG_ENABLE([debug],AS_HELP_STRING([--enable-debug], [Enable extra debugging code (default is NO).]),
49 | [], [enable_debug=no])
50 |
51 | if test $enable_debug = yes; then
52 | AC_DEFINE([ENABLE_DEBUG], [1], [Enable debugging code])
53 | CFLAGS="$CFLAGS -O0 -g3 -Wall -Werror"
54 | else
55 | CFLAGS="$CFLAGS -O3 -DNDEBUG -Wall -Werror"
56 | fi
57 |
58 | # check for cuda; without --with-cuda fall back to /usr/local/cuda and search both its lib64 and lib directories
59 | AC_ARG_WITH([cuda],
60 | [AC_HELP_STRING([--with-cuda=PATH],
61 | [Path to non-standard CUDA installation])],
62 | [AS_IF([test -d "$withval/lib64"], [cuda_libdir="lib64"], [cuda_libdir="lib"])
63 | CFLAGS="-I$withval/include $CFLAGS"
64 | LDFLAGS="-L$withval/$cuda_libdir $LDFLAGS"],
65 | [CFLAGS="-I/usr/local/cuda/include $CFLAGS"
66 | LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib $LDFLAGS"])
67 |
68 | AC_CHECK_HEADER( [cuda_runtime.h], [], [AC_MSG_FAILURE([CUDA runtime header files not found])])
69 | AC_CHECK_LIB([cudart], [cudaMalloc], [], [AC_MSG_FAILURE([CUDA runtime libs not found])])
70 |
71 | #check for verbs
72 | AC_ARG_WITH([verbs],
73 | [AC_HELP_STRING([--with-verbs(=DIR)],
74 | [Build Infiniband support, adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])],
75 | [CFLAGS="-I$with_verbs/include $CFLAGS"
76 | LDFLAGS="-L$with_verbs/lib64 -L$with_verbs/lib -libverbs $LDFLAGS"],
77 | [CFLAGS="-I/usr/include $CFLAGS"
78 | LDFLAGS="-L/usr/lib64 -L/usr/lib -libverbs $LDFLAGS"])
79 |
80 | AC_CHECK_HEADER( [infiniband/verbs.h], [],[AC_MSG_FAILURE([ibverbs header files not found])])
81 | AC_CHECK_LIB([ibverbs], [ibv_get_device_list], [],[AC_MSG_FAILURE([libibverbs not found]);])
82 | # Probe optional libibverbs declarations; the prologue must include the verbs header or every probe reports "not declared"
83 | AC_CHECK_DECLS([IBV_ACCESS_RELAXED_ORDERING, IBV_QPF_GRH_REQUIRED, ibv_reg_dmabuf_mr, ibv_query_ece, ibv_set_ece], [], [],
84 | [[#include <infiniband/verbs.h>]])
85 |
86 | # check for ucx
87 | AM_CONDITIONAL([HAVE_UCX_PLUGIN], [false])
88 | m4_include([m4/ucx.m4])
89 | CHECK_UCX
90 | AC_MSG_RESULT([UCX support: $ucx_happy])
91 |
92 | # check for sharp
93 | AM_CONDITIONAL([HAVE_SHARP_PLUGIN], [false])
94 | m4_include([m4/sharp.m4])
95 | CHECK_SHARP
96 | AC_MSG_RESULT([SHARP support: $sharp_happy])
97 |
98 | #check for required headers
99 | AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h], [],
100 | [AC_MSG_FAILURE([unable to find required headers])])
101 |
102 | AC_CONFIG_FILES([Makefile
103 | src/Makefile
104 | nccl-rdma-sharp-plugins.spec
105 | nccl-rdma-sharp-plugins.pc
106 | debian/changelog
107 | debian/control
108 | debian/nccl-rdma-sharp-plugins.postinst
109 | debian/nccl-rdma-sharp-plugins.prem
110 | debian/rules
111 | ])
112 | AC_OUTPUT
113 |
114 | echo "NCCL-RDMA-SHARP Plugin has been configured."
115 |
--------------------------------------------------------------------------------
/contrib/buildrpm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eE
2 | #
3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
4 | # Copyright (c) 2001-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 | # SPDX-License-Identifier: BSD-3-Clause
6 | # See file LICENSE for terms.
7 | #
8 |
9 | PACKAGE=nccl-rdma-sharp-plugins
10 | WS=$PWD
11 | rpmspec=${PACKAGE}.spec
12 | rpmmacros="--define='_rpmdir ${WS}/rpm-dist' --define='_srcrpmdir ${WS}/rpm-dist' --define='_sourcedir ${WS}' --define='_specdir ${WS}' --define='_builddir ${WS}'"
13 | rpmopts="--nodeps --buildroot='${WS}/_rpm'"
14 |
15 |
16 |
17 | opt_tarball=0
18 | opt_srcrpm=0
19 | opt_binrpm=0
20 |
21 | while test "$1" != ""; do
22 | case $1 in
23 | --tarball|-t) opt_tarball=1 ;;
24 | --srcrpm|-s) opt_srcrpm=1 ;;
25 | --binrpm|-b) opt_binrpm=1 ;;
26 | *)
27 | cat <
35 |
36 | -- Mellanox Ltd. Wed, 11 Sep 2013 15:24:22 +0300
37 |
--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
1 | 8
2 |
--------------------------------------------------------------------------------
/debian/control.in:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # Redistribution and use in source and binary forms, with or without
7 | # modification, are permitted provided that the following conditions are met:
8 | #
9 | # 1. Redistributions of source code must retain the above copyright notice, this
10 | # list of conditions and the following disclaimer.
11 | #
12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
13 | # this list of conditions and the following disclaimer in the documentation
14 | # and/or other materials provided with the distribution.
15 | #
16 | # 3. Neither the name of the copyright holder nor the names of its
17 | # contributors may be used to endorse or promote products derived from
18 | # this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | #
31 |
32 | Source: @PACKAGE@
33 | Section: libs
34 | Priority: extra
35 | Maintainer: support@mellanox.com
36 | Build-Depends: libibverbs-dev
37 | Standards-Version: @MAJOR_VERSION@.@MINOR_VERSION@
38 | Homepage: http://www.mellanox.com
39 |
40 | Package: @PACKAGE@
41 | Section: libs
42 | Depends: ${shlibs:Depends}, ${misc:Depends}
43 | Architecture: any
44 | Description: RDMA and SHARP plugin for NCCL
45 | Plugin enabled RDMA and switch collectives(SHARP) in NCCL
46 |
--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
2 | Upstream-Name: NCCL-RDMA-SHARP plugins
3 | Source: http://www.mellanox.com
4 |
5 | Files: *
6 | Copyright (c) 2015-2019, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7 | License: BSD
8 | Redistribution and use in source and binary forms, with or without
9 | modification, are permitted provided that the following conditions
10 | are met:
11 |
12 | 1. Redistributions of source code must retain the above copyright
13 | notice, this list of conditions and the following disclaimer.
14 | 2. Redistributions in binary form must reproduce the above copyright
15 | notice, this list of conditions and the following disclaimer in the
16 | documentation and/or other materials provided with the distribution.
17 | 3. Neither the name of the copyright holder nor the names of its
18 | contributors may be used to endorse or promote products derived from
19 | this software without specific prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 |
--------------------------------------------------------------------------------
/debian/nccl-rdma-sharp-plugins.postinst.in:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 | # SPDX-License-Identifier: BSD-3-Clause
6 | #
7 | # Redistribution and use in source and binary forms, with or without
8 | # modification, are permitted provided that the following conditions are met:
9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its
18 | # contributors may be used to endorse or promote products derived from
19 | # this software without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 |
33 | set -e
34 | if [ "@prefix@" != "/usr" ]; then  # with prefix=/usr source and destination are the same file; install would fail under set -e
35 | install -m 755 "@prefix@/lib/pkgconfig/nccl-rdma-sharp-plugins.pc" /usr/lib/pkgconfig/nccl-rdma-sharp-plugins.pc
36 | fi
37 |
38 |
--------------------------------------------------------------------------------
/debian/nccl-rdma-sharp-plugins.prem.in:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
4 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 | # SPDX-License-Identifier: BSD-3-Clause
6 | #
7 | # Redistribution and use in source and binary forms, with or without
8 | # modification, are permitted provided that the following conditions are met:
9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its
18 | # contributors may be used to endorse or promote products derived from
19 | # this software without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 |
33 | PCF=/usr/lib/pkgconfig/nccl-rdma-sharp-plugins.pc
34 |
35 | if [ -f $PCF ];then
36 | rm -f $PCF
37 | fi
38 |
--------------------------------------------------------------------------------
/debian/rules.in:
--------------------------------------------------------------------------------
1 | #!/usr/bin/make -f
2 | # -*- makefile -*-
3 | #
4 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
5 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
6 | # SPDX-License-Identifier: BSD-3-Clause
7 | #
8 | # Redistribution and use in source and binary forms, with or without
9 | # modification, are permitted provided that the following conditions are met:
10 | #
11 | # 1. Redistributions of source code must retain the above copyright notice, this
12 | # list of conditions and the following disclaimer.
13 | #
14 | # 2. Redistributions in binary form must reproduce the above copyright notice,
15 | # this list of conditions and the following disclaimer in the documentation
16 | # and/or other materials provided with the distribution.
17 | #
18 | # 3. Neither the name of the copyright holder nor the names of its
19 | # contributors may be used to endorse or promote products derived from
20 | # this software without specific prior written permission.
21 | #
22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 | #
33 |
34 | # Sample debian/rules that uses debhelper.
35 | # This file was originally written by Joey Hess and Craig Small.
36 | # As a special exception, when this file is copied by dh-make into a
37 | # dh-make output file, you may use that output file without restriction.
38 | # This special exception was added by Craig Small in version 0.37 of dh-make.
39 |
40 | # Uncomment this to turn on verbose mode.
41 | #export DH_VERBOSE=1
42 |
43 | %:
44 | dh $@
45 |
46 | override_dh_auto_configure:
47 | contrib/configure-release
48 | chmod +x debian/rules
49 |
50 | override_dh_shlibdeps:
51 | dh_shlibdeps --dpkg-shlibdeps-params=--ignore-missing-info
52 |
53 | override_dh_auto_clean:
54 |
--------------------------------------------------------------------------------
/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (native)
2 |
--------------------------------------------------------------------------------
/include/core.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_CORE_H_
10 | #define NCCL_CORE_H_
11 |
12 | #include "nccl.h"
13 | #include "debug.h"
14 |
15 | #include
16 | #include
17 |
18 | #define MIN(a, b) ((a)<(b)?(a):(b)) /* smaller of a and b; the selected argument is evaluated twice */
19 | #define MAX(a, b) ((a)>(b)?(a):(b)) /* larger of a and b; the selected argument is evaluated twice */
20 |
21 | #define DIVUP(x, y) /* (x + y - 1) / y: ceiling division, assuming integer operands with x >= 0 and y > 0 */ \
22 | (((x)+(y)-1)/(y))
23 | #define ROUNDUP(x, y) /* DIVUP(x, y) * y: x rounded up to a multiple of y; y is evaluated more than once */ \
24 | (DIVUP((x), (y))*(y))
25 |
26 | // Check CUDA calls: on any result other than cudaSuccess, WARN and return ncclUnhandledCudaError from the enclosing function
27 | #define CUDACHECK(cmd) do { \
28 | cudaError_t err = cmd; \
29 | if( err != cudaSuccess ) { \
30 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
31 | return ncclUnhandledCudaError; \
32 | } \
33 | } while (0)
34 |
35 | #define CUDACHECKGOTO(cmd, RES, label) do { /* on CUDA failure: WARN, set RES and jump to label */ \
36 | cudaError_t err = cmd; \
37 | if( err != cudaSuccess ) { \
38 | WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
39 | RES = ncclUnhandledCudaError; \
40 | goto label; \
41 | } \
42 | } while (0)
43 |
44 | // Report failure but clear error and continue
45 | #define CUDACHECKIGNORE(cmd) do { \
46 | cudaError_t err = cmd; \
47 | if( err != cudaSuccess ) { \
48 | INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
49 | (void) cudaGetLastError(); \
50 | } \
51 | } while(false)
52 |
53 | #include <errno.h>
54 | // Check system calls: run statement through SYSCHECKSYNC; if it still yields -1, WARN with strerror(errno) and return ncclSystemError
55 | #define SYSCHECK(statement, name) do { \
56 | int retval; \
57 | SYSCHECKSYNC((statement), name, retval); \
58 | if (retval == -1) { \
59 | WARN("Call to " name " failed: %s", strerror(errno)); \
60 | return ncclSystemError; \
61 | } \
62 | } while (0)
63 |
64 | #define SYSCHECKSYNC(statement, name, retval) do { /* re-evaluate statement while it fails with EINTR/EWOULDBLOCK/EAGAIN */ \
65 | retval = (statement); \
66 | if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
67 | INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
68 | } else { \
69 | break; /* success or a non-retryable error: the result is left in retval */ \
70 | } \
71 | } while (1)
72 |
73 | #define SYSCHECKGOTO(statement, name, RES, label) do { /* like SYSCHECK, but on failure sets RES and jumps to label instead of returning */ \
74 | int retval; \
75 | SYSCHECKSYNC((statement), name, retval); \
76 | if (retval == -1) { \
77 | WARN("Call to " name " failed: %s", strerror(errno)); \
78 | RES = ncclSystemError; \
79 | goto label; \
80 | } \
81 | } while (0)
82 |
83 | // Pthread calls don't set errno and never return EINTR, so the returned code is tested directly and no retry is attempted.
84 | #define PTHREADCHECK(statement, name) do { /* non-zero result: WARN with strerror(result) and return ncclSystemError */ \
85 | int retval = (statement); \
86 | if (retval != 0) { \
87 | WARN("Call to " name " failed: %s", strerror(retval)); \
88 | return ncclSystemError; \
89 | } \
90 | } while (0)
91 |
92 | #define PTHREADCHECKGOTO(statement, name, RES, label) do { /* non-zero result: WARN, set RES = ncclSystemError, jump to label */ \
93 | int retval = (statement); \
94 | if (retval != 0) { \
95 | WARN("Call to " name " failed: %s", strerror(retval)); \
96 | RES = ncclSystemError; \
97 | goto label; \
98 | } \
99 | } while (0)
100 |
101 |
102 | #define NEQCHECK(statement, value) do { /* fail the caller unless statement == value */ \
103 | if ((statement) != (value)) { \
104 | /* Log file:line, the error code and strerror(errno) */ \
105 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
106 | return ncclSystemError; \
107 | } \
108 | } while (0)
109 | // Require (statement) == (value): otherwise set RES to ncclSystemError, log the call site at INFO level and jump to label. Both arguments are parenthesized so that operator expressions passed as value compare as a whole.
110 | #define NEQCHECKGOTO(statement, value, RES, label) do { \
111 | if ((statement) != (value)) { \
112 | /* Log where the check failed; the strerror(errno) text is only meaningful when the checked call sets errno */ \
113 | RES = ncclSystemError; \
114 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
115 | goto label; \
116 | } \
117 | } while (0)
118 | // Require (statement) != (value): if they are equal, log the call site at INFO level and return ncclSystemError from the enclosing function. Both arguments are parenthesized so that operator expressions passed as value compare as a whole.
119 | #define EQCHECK(statement, value) do { \
120 | if ((statement) == (value)) { \
121 | /* Log where the check failed; the strerror(errno) text is only meaningful when the checked call sets errno */ \
122 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
123 | return ncclSystemError; \
124 | } \
125 | } while (0)
126 | // Require (statement) != (value): if they are equal, set RES to ncclSystemError, log the call site at INFO level and jump to label. Both arguments are parenthesized so that operator expressions passed as value compare as a whole.
127 | #define EQCHECKGOTO(statement, value, RES, label) do { \
128 | if ((statement) == (value)) { \
129 | /* Log where the check failed; the strerror(errno) text is only meaningful when the checked call sets errno */ \
130 | RES = ncclSystemError; \
131 | INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
132 | goto label; \
133 | } \
134 | } while (0)
135 |
136 | // Propagate errors up: return the call's result from the enclosing function unless it is ncclSuccess or ncclInProgress.
137 | #define NCCLCHECK(call) do { \
138 | ncclResult_t RES = call; \
139 | if (RES != ncclSuccess && RES != ncclInProgress) { \
140 | /* Nothing is logged here; the result code is handed back unchanged */ \
141 | return RES; \
142 | } \
143 | } while (0)
144 | // Store the call's result in RES and jump to label unless it is ncclSuccess or ncclInProgress. RES is assigned in every case, including success.
145 | #define NCCLCHECKGOTO(call, RES, label) do { \
146 | RES = call; \
147 | if (RES != ncclSuccess && RES != ncclInProgress) { \
148 | /* Nothing is logged here; RES already holds the failing result */ \
149 | goto label; \
150 | } \
151 | } while (0)
152 | // Repeat call until cond is true. In each iteration a result other than ncclSuccess/ncclInProgress returns ncclInternalError (the original code is not propagated), and a non-zero *abortFlagPtr returns ncclSystemError through NEQCHECK. Pass a NULL abortFlagPtr to skip the abort check.
153 | #define NCCLWAIT(call, cond, abortFlagPtr) do { \
154 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); /* volatile: the flag is re-read from memory on every iteration */ \
155 | ncclResult_t RES = call; \
156 | if (RES != ncclSuccess && RES != ncclInProgress) { \
157 | return ncclInternalError; \
158 | } \
159 | if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
160 | } while (!(cond))
161 | // Repeat call until cond is true, storing each result in RES. A result other than ncclSuccess/ncclInProgress jumps to label with RES holding that result; a non-zero *abortFlagPtr jumps to label with RES set to ncclSystemError (via NEQCHECKGOTO). Pass a NULL abortFlagPtr to skip the abort check.
162 | #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
163 | volatile uint32_t* tmpAbortFlag = (abortFlagPtr); /* volatile: the flag is re-read from memory on every iteration */ \
164 | RES = call; \
165 | if (RES != ncclSuccess && RES != ncclInProgress) { \
166 | goto label; \
167 | } \
168 | if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
169 | } while (!(cond))
170 | // Store the result of a in args->ret; if it is neither ncclSuccess nor ncclInProgress, log it at INFO level and return args from the enclosing function. args->ret is written in every case, including success.
171 | #define NCCLCHECKTHREAD(a, args) do { \
172 | if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
173 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
174 | return args; \
175 | } \
176 | } while(0)
177 | // Evaluate CUDA call a; on failure set args->ret to ncclUnhandledCudaError, log it at INFO level and return args from the enclosing function. Unlike NCCLCHECKTHREAD, this macro takes no args parameter: a variable named args must be in scope at the call site.
178 | #define CUDACHECKTHREAD(a) do { \
179 | if ((a) != cudaSuccess) { \
180 | args->ret = ncclUnhandledCudaError; /* assign before logging so the message reports this failure rather than a stale value */ \
181 | INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
182 | return args; \
183 | } \
184 | } while(0)
185 |
186 | #endif
187 |
--------------------------------------------------------------------------------
/include/debug.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_DEBUG_H_
10 | #define NCCL_DEBUG_H_
11 |
12 | #include "core.h"
13 |
14 | #include <stdio.h>
15 |
16 | #include <sys/syscall.h>
17 | #include <limits.h>
18 | #include <string.h>
19 | #include <pthread.h>
20 | #include "net.h"
21 |
22 | // Conform to pthread and NVTX standard
23 | #define NCCL_THREAD_NAMELEN 16
24 | // Declared here, defined elsewhere in the plugin.
25 | extern pthread_mutex_t ncclDebugLock;
26 | // Logger callback that every WARN/INFO/TRACE below goes through; NOTE(review): presumably the logFunction handed to the plugin's init() — confirm.
27 | extern ncclDebugLogger_t pluginLogFunction;
28 | // WARN always logs with NCCL_ALL and passes __FILE__ as the location string; INFO and TRACE take subsystem FLAGS and pass __func__ instead.
29 | #define WARN(...) pluginLogFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
30 | #define INFO(FLAGS, ...) pluginLogFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
31 | // TRACE expands to nothing unless ENABLE_TRACE is defined, so its arguments are not evaluated in normal builds.
32 | #ifdef ENABLE_TRACE
33 | #define TRACE(FLAGS, ...) pluginLogFunction(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
34 | #else
35 | #define TRACE(...)
36 | #endif
37 | // Debug helpers defined outside this header. NOTE(review): fmt looks like a printf-style format for the thread name — confirm against the definition.
38 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
39 | // Declared with (void) so this is a real prototype: in C, empty parentheses leave the parameter list unspecified and disable argument checking.
40 | void ncclResetDebugInit(void);
41 |
42 | #endif
43 |
--------------------------------------------------------------------------------
/include/ibvwrap.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
3 | * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved.
4 | * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
5 | * Copyright (c) 2005 PathScale, Inc. All rights reserved.
6 | *
7 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
8 | * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
9 | * SPDX-License-Identifier: BSD-3-Clause
10 | *
11 | * See LICENSE.txt for license information
12 | ************************************************************************/
13 |
14 | #ifndef NCCL_IBVWRAP_H_
15 | #define NCCL_IBVWRAP_H_
16 | #include "config.h"
17 | #include "core.h"
18 | #include "utils.h"
19 | #include <arpa/inet.h>
20 | #include <netinet/in.h>
21 | #include <infiniband/verbs.h>
22 | // Fallbacks for verbs headers that lack these flags: define them as 0 so OR-ing them into a flag mask has no effect. NOTE(review): the HAVE_DECL_* macros presumably come from config.h — confirm.
23 | #if !HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING
24 | # define IBV_ACCESS_RELAXED_ORDERING 0
25 | #endif
26 | #if !HAVE_DECL_IBV_QPF_GRH_REQUIRED
27 | # define IBV_QPF_GRH_REQUIRED 0
28 | #endif
29 | // Local definition of struct ibv_ece for builds where HAVE_DECL_IBV_SET_ECE is not set, so the ECE wrapper prototypes further down still have a complete type to refer to.
30 | #if !HAVE_DECL_IBV_SET_ECE
31 | struct ibv_ece {
32 | /*
33 | * Unique identifier of the provider vendor on the network.
34 | * Providers set their IEEE OUI here to distinguish
35 | * themselves in a non-homogeneous network.
36 | */
37 | uint32_t vendor_id;
38 | /*
39 | * Provider-specific attributes that are supported or
40 | * need to be enabled by ECE users.
41 | */
42 | uint32_t options;
43 | uint32_t comp_mask;
44 | };
45 | #endif
46 | // Wrappers around verbs calls, defined outside this header. Most report status through ncclResult_t and hand any created object back through a leading ret out-parameter.
47 | ncclResult_t wrap_ibv_fork_init(void);
48 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
49 | ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
50 | const char *wrap_ibv_get_device_name(struct ibv_device *device);
51 | ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
52 | ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
53 | ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
54 | ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
55 | ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
56 | ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
57 | ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
58 | ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
59 | ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
60 | ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
61 | ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
62 | struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); // "direct" variant: returns the MR pointer itself rather than an ncclResult_t
63 | ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
64 | /* DMA-BUF support: the memory is identified by a file descriptor (fd) plus offset instead of an address */
65 | ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
66 | struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); // "direct" variant: returns the MR pointer itself rather than an ncclResult_t
67 | ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
68 | ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
69 | ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
70 | ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
71 | ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
72 | static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { // Poll cq for up to num_entries completions into wc; on success *num_done receives the count (may be 0).
73 | int done = cq->context->ops.poll_cq(cq, num_entries, wc); /* calls the poll_cq op stored in the CQ's context directly; returns the number of work completions (possibly 0) on success, a negative number otherwise */
74 | if (done < 0) {
75 | WARN("Call to ibv_poll_cq() returned %d", done);
76 | return ncclSystemError; // *num_done is left untouched on failure
77 | }
78 | *num_done = done;
79 | return ncclSuccess;
80 | }
81 | ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); // queue-pair wrappers, defined outside this header
82 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
83 | ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
84 | ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); // NOTE(review): *supported presumably reports whether ECE is available in this build/runtime — confirm in the definition
85 | ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); // NOTE(review): same *supported convention as wrap_ibv_query_ece — confirm
86 | ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
87 | ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);
88 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); // the event name is handed back through *ret
89 |
90 | // Converts a GID into a readable string written to gidStr (capacity strLen bytes). On success, returns a non-null pointer to gidStr.
91 | // NULL is returned if there was an error, with errno set by inet_ntop to indicate the error.
92 | // errno = ENOSPC if the converted string would exceed strLen.
93 | static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) {
94 | // A GID is a 16-byte handle; to convert it to a readable form, we format it with inet_ntop
95 | // as if it were an IPv6 address (AF_INET6). The assert below guards that size assumption at compile time.
96 | NCCL_STATIC_ASSERT(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr");
97 | return inet_ntop(AF_INET6, gid->raw, gidStr, strLen);
98 | }
99 |
100 | #endif //End include guard
101 |
--------------------------------------------------------------------------------
/include/net.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 |
7 | #ifndef NCCL_NET_H_
8 | #define NCCL_NET_H_
9 |
10 | #include <stdint.h>
11 | #include <stdlib.h>
12 |
13 | #define NCCL_NET_HANDLE_MAXSIZE 128 // Upper bound, in bytes, on the connection handle filled in by listen()
14 | // Maximum value NCCL can accept for the maxP2pBytes and maxCollBytes net properties: 2^40 bytes (1 TiB). The L suffix makes the final multiply happen in long, so this assumes a 64-bit long.
15 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
16 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
17 |
18 | // Pointer-type bit flags; they are OR-ed together in the ptrSupport property and passed individually as the type argument of regMr.
19 | #define NCCL_PTR_HOST 0x1
20 | #define NCCL_PTR_CUDA 0x2
21 | #define NCCL_PTR_DMABUF 0x4
22 |
23 | // Maximum number of requests per comm object
24 | #define NCCL_NET_MAX_REQUESTS 8
25 | // Log levels and subsystem masks for the logger callback. The subsystem values are single bits; NCCL_ALL (~0) has every bit set.
26 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
27 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
28 | // Callback types passed to the plugin's init(): a printf-style logger and (v10 only) a profiler hook.
29 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
30 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
31 |
32 | #include "net_v10.h"
33 | #include "net_v9.h"
34 | #include "net_v8.h"
35 | #include "net_v7.h"
36 | #include "net_v6.h"
37 | #include "net_v5.h"
38 |
39 | #endif // end include guard
40 |
--------------------------------------------------------------------------------
/include/net_device.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NET_DEVICE_H_
10 | #define NET_DEVICE_H_
11 |
12 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0
13 | #define NCCL_NET_MTU_SIZE 4096
14 |
15 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
16 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
17 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
18 | // Network offload type, reported in the properties struct and in the device handle below.
19 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
20 | // Device handle exchanged through the sendDevComm/recvDevComm arguments of connect()/accept().
21 | typedef struct {
22 | ncclNetDeviceType netDeviceType; // Network offload type
23 | int netDeviceVersion; // Version number for network offload
24 | void* handle; // NOTE(review): presumably an opaque plugin-owned object of `size` bytes — confirm
25 | size_t size;
26 | int needsProxyProgress;
27 | } ncclNetDeviceHandle_v7_t;
28 | // Later API versions reuse the v7 layout unchanged; the unversioned name tracks v10.
29 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
30 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
31 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
32 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
33 |
34 | #endif
35 |
--------------------------------------------------------------------------------
/include/net_v10.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 | #ifndef NCCL_NET_V10_H_
7 | #define NCCL_NET_V10_H_
8 |
9 | #include "net_device.h"
10 | // Properties handed to makeVDevice() to describe a virtual NIC: ndevs entries of devs[] are used, up to NCCL_NET_MAX_DEVS_PER_NIC_V10.
11 | #define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
12 | #define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
13 | typedef struct {
14 | int ndevs; // number of valid entries in devs[]
15 | int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; // NOTE(review): presumably indices of the underlying devices — confirm
16 | } ncclNetVDeviceProps_v10_t;
17 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
18 | // Per-connection configuration passed to connect() in the v10 API.
19 | #define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
20 | typedef struct {
21 | // Plugin-specific TC (traffic class) value. NOTE(review): NCCL_NET_TRAFFIC_CLASS_UNDEF presumably means "not set" — confirm
22 | int trafficClass;
23 | } ncclNetCommConfig_v10_t;
24 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
25 | // Device properties filled in by getProperties() in the v10 API.
26 | typedef struct {
27 | char* name; // Used mostly for logging.
28 | char* pciPath; // Path to the PCI device in /sys.
29 | uint64_t guid; // Unique identifier for the NIC chip. Important for
30 | // cards with multiple PCI functions (Physical or virtual).
31 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
32 | int regIsGlobal; // regMr is not tied to a particular comm
33 | int forceFlush; // Force a flush on receives
34 | int speed; // Port speed in Mbps.
35 | int port; // Port number.
36 | float latency; // Network latency
37 | int maxComms; // Maximum number of comms we can create
38 | int maxRecvs; // Maximum number of grouped receives.
39 | ncclNetDeviceType netDeviceType; // Network offload type
40 | int netDeviceVersion; // Version number for network offload
41 | ncclNetVDeviceProps_v10_t vProps; // Virtual-NIC description (same type makeVDevice() takes)
42 | size_t maxP2pBytes; // Max transfer size for point-to-point operations (NCCL accepts at most NCCL_MAX_NET_SIZE_BYTES)
43 | size_t maxCollBytes; // Max transfer size for collective operations (NCCL accepts at most NCCL_MAX_NET_SIZE_BYTES)
44 | } ncclNetProperties_v10_t;
45 |
46 | typedef ncclNetProperties_v10_t ncclNetProperties_t;
47 | // v10 point-to-point network plugin interface: a table of function pointers, each returning ncclResult_t.
48 | typedef struct {
49 | // Name of the network (mainly for logs)
50 | const char* name;
51 | // Initialize the network. Receives the logger and profiler callbacks.
52 | ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
53 | // Return the number of adapters.
54 | ncclResult_t (*devices)(int* ndev);
55 | // Get various device properties.
56 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
57 | // Create a receiving object and provide a handle to connect to it. The
58 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
59 | // between ranks to create a connection.
60 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
61 | // Connect to a handle and return a sending comm object for that peer.
62 | // This call must not block for the connection to be established, and instead
63 | // should return successfully with sendComm == NULL with the expectation that
64 | // it will be called again until sendComm != NULL.
65 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection. config carries the per-connection settings (ncclNetCommConfig_v10_t).
66 | ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
67 | // Finalize connection establishment after remote peer has called connect.
68 | // This call must not block for the connection to be established, and instead
69 | // should return successfully with recvComm == NULL with the expectation that
70 | // it will be called again until recvComm != NULL.
71 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
72 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
73 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
74 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
75 | ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
76 | /* DMA-BUF support: like regMr, with the memory additionally identified by a file descriptor (fd) and offset */
77 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
78 | ncclResult_t (*deregMr)(void* comm, void* mhandle);
79 | // Asynchronous send to a peer.
80 | // May return request == NULL if the call cannot be performed (or would block). NOTE(review): phandle is presumably a profiler handle — confirm
81 | ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
82 | // Asynchronous recv from a peer. Receives n buffers as one grouped request.
83 | // May return request == NULL if the call cannot be performed (or would block). NOTE(review): phandles are presumably profiler handles — confirm
84 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
85 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
86 | // visible to the GPU
87 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
88 | // Test whether a request is complete. If sizes is not NULL, it returns the
89 | // number of bytes sent/received.
90 | ncclResult_t (*test)(void* request, int* done, int* sizes);
91 | // Close and free send/recv comm objects
92 | ncclResult_t (*closeSend)(void* sendComm);
93 | ncclResult_t (*closeRecv)(void* recvComm);
94 | ncclResult_t (*closeListen)(void* listenComm);
95 |
96 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code
97 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
98 |
99 | // Notify the plugin that a recv has completed by the device
100 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
101 |
102 | // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
103 | // what index this new vNIC exists at (returned through d)
104 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
105 | } ncclNet_v10_t;
106 | // One buffer part (memory handle, address, size) as passed in the recvParts/sendParts arrays of iallgather/ireducescatter.
107 | typedef struct {
108 | void* mhandle;
109 | void* address;
110 | size_t size;
111 | } ncclNetSGE_v10_t;
112 | // v10 collective network plugin interface: a table of function pointers, each returning ncclResult_t.
113 | typedef struct {
114 | // Name of the collective network (mainly for logs)
115 | const char* name;
116 | // Initialize the collective network.
117 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
118 | // Return the number of adapters capable of doing collective operations.
119 | // If ndev returns 0, all other functions might be set to NULL.
120 | ncclResult_t (*devices)(int* ndev);
121 | // Get various device properties.
122 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
123 | // Create a receiving object and provide a handle to connect to it. The
124 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
125 | // between ranks to create connections.
126 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
127 | // Create a group for collective operations. handles have been created
128 | // using listen() above. rank indicates caller's rank in the collective network.
129 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
130 | // Returns whether a reduction operation on a data type is supported.
131 | // 1 for supported, 0 otherwise.
132 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
133 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
134 | ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
135 | /* DMA-BUF support: like regMr, with the memory additionally identified by a file descriptor (fd) and offset */
136 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
137 | ncclResult_t (*deregMr)(void* collComm, void* mhandle);
138 | // Performs an asynchronous allreduce operation on the collective group.
139 | // May return request == NULL if the call cannot be performed (or would block).
140 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
141 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
142 | ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts,
143 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
144 | void* sendMhandle, void** request); // NOTE(review): asynchronous allgather into nRecvParts scatter-gather entries; window semantics are not documented here — confirm with the NCCL plugin docs
145 | ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData,
146 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
147 | ncclDataType_t dataType, ncclRedOp_t redOp,
148 | void* recvMhandle, void** request); // NOTE(review): asynchronous reduce-scatter from nSendParts scatter-gather entries; window semantics are not documented here — confirm with the NCCL plugin docs
149 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
150 | // visible to the GPU
151 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
152 | // Test whether a request is complete. If size is not NULL, it returns the
153 | // number of bytes sent/received.
154 | ncclResult_t (*test)(void* request, int* done, int* size);
155 | // Close and free collective comm objects
156 | ncclResult_t (*closeColl)(void* collComm);
157 | ncclResult_t (*closeListen)(void* listenComm);
158 | // Create a virtual NIC given the specified properties, which can be accessed at device index d
159 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
160 | } ncclCollNet_v10_t;
161 |
162 | typedef ncclCollNet_v10_t ncclCollNet_t;
163 |
164 | #endif // end include guard
165 |
--------------------------------------------------------------------------------
/include/net_v5.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 |
7 | #ifndef NCCL_NET_V5_H_
8 | #define NCCL_NET_V5_H_
9 | // v5 reuses the v6 properties layout, so ncclNetProperties_v6_t must already be declared when this header is read (net.h includes net_v6.h before net_v5.h).
10 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
11 | typedef struct { // v5 point-to-point network plugin interface; sizes are int and there is no DMA-BUF registration entry
12 | // Name of the network (mainly for logs)
13 | const char* name;
14 | // Initialize the network.
15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
16 | // Return the number of adapters.
17 | ncclResult_t (*devices)(int* ndev);
18 | // Get various device properties.
19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
20 | // Create a receiving object and provide a handle to connect to it. The
21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
22 | // between ranks to create a connection.
23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
24 | // Connect to a handle and return a sending comm object for that peer.
25 | // This call must not block for the connection to be established, and instead
26 | // should return successfully with sendComm == NULL with the expectation that
27 | // it will be called again until sendComm != NULL.
28 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
29 | // Finalize connection establishment after remote peer has called connect.
30 | // This call must not block for the connection to be established, and instead
31 | // should return successfully with recvComm == NULL with the expectation that
32 | // it will be called again until recvComm != NULL.
33 | ncclResult_t (*accept)(void* listenComm, void** recvComm);
34 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
35 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
36 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
37 | ncclResult_t (*deregMr)(void* comm, void* mhandle);
38 | // Asynchronous send to a peer.
39 | // May return request == NULL if the call cannot be performed (or would block)
40 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
41 | // Asynchronous recv from a peer. Receives n buffers as one grouped request.
42 | // May return request == NULL if the call cannot be performed (or would block)
43 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
44 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
45 | // visible to the GPU
46 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
47 | // Test whether a request is complete. If sizes is not NULL, it returns the
48 | // number of bytes sent/received.
49 | ncclResult_t (*test)(void* request, int* done, int* sizes);
50 | // Close and free send/recv comm objects
51 | ncclResult_t (*closeSend)(void* sendComm);
52 | ncclResult_t (*closeRecv)(void* recvComm);
53 | ncclResult_t (*closeListen)(void* listenComm);
54 | } ncclNet_v5_t;
55 |
56 | // v5 collective network plugin interface: a table of function pointers, each returning ncclResult_t. Only allreduce is offered as a collective.
57 | typedef struct {
58 | // Name of the collective network (mainly for logs)
59 | const char* name;
60 | // Initialize the collective network.
61 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
62 | // Return the number of adapters capable of doing collective operations.
63 | // If ndev returns 0, all other functions might be set to NULL.
64 | ncclResult_t (*devices)(int* ndev);
65 | // Get various device properties.
66 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
67 | // Create a receiving object and provide a handle to connect to it. The
68 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
69 | // between ranks to create connections.
70 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
71 | // Create a group for collective operations. handles have been created
72 | // using listen() above. rank indicates caller's rank in the collective network.
73 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
74 | // Returns whether a reduction operation on a data type is supported.
75 | // 1 for supported, 0 otherwise.
76 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
77 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
78 | ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
79 | ncclResult_t (*deregMr)(void* collComm, void* mhandle);
80 | // Performs an asynchronous allreduce operation on the collective group.
81 | // May return request == NULL if the call cannot be performed (or would block).
82 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
83 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
84 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
85 | // visible to the GPU
86 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
87 | // Test whether a request is complete. If size is not NULL, it returns the
88 | // number of bytes sent/received.
89 | ncclResult_t (*test)(void* request, int* done, int* size);
90 | // Close and free collective comm objects
91 | ncclResult_t (*closeColl)(void* collComm);
92 | ncclResult_t (*closeListen)(void* listenComm);
93 | } ncclCollNet_v5_t;
94 |
95 | #endif
96 |
--------------------------------------------------------------------------------
/include/net_v6.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 |
7 | #ifndef NCCL_NET_V6_H_
8 | #define NCCL_NET_V6_H_
9 | // Device properties filled in by getProperties() in the v6 API (also used as the v5 properties type).
10 | typedef struct {
11 | char* name; // Used mostly for logging.
12 | char* pciPath; // Path to the PCI device in /sys.
13 | uint64_t guid; // Unique identifier for the NIC chip. Important for
14 | // cards with multiple PCI functions (Physical or virtual).
15 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
16 | int speed; // Port speed in Mbps.
17 | int port; // Port number.
18 | float latency; // Network latency
19 | int maxComms; // Maximum number of comms we can create
20 | int maxRecvs; // Maximum number of grouped receives.
21 | }ncclNetProperties_v6_t;
22 |
23 | typedef struct {
24 | // Name of the network (mainly for logs)
25 | const char* name;
26 | // Initialize the network.
27 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
28 | // Return the number of adapters.
29 | ncclResult_t (*devices)(int* ndev);
30 | // Get various device properties.
31 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
32 | // Create a receiving object and provide a handle to connect to it. The
33 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
34 | // between ranks to create a connection.
35 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
36 | // Connect to a handle and return a sending comm object for that peer.
37 | // This call must not block for the connection to be established, and instead
38 | // should return successfully with sendComm == NULL with the expectation that
39 | // it will be called again until sendComm != NULL.
40 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
41 | // Finalize connection establishment after remote peer has called connect.
42 | // This call must not block for the connection to be established, and instead
43 | // should return successfully with recvComm == NULL with the expectation that
44 | // it will be called again until recvComm != NULL.
45 | ncclResult_t (*accept)(void* listenComm, void** recvComm);
46 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
47 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
48 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
49 | /* DMA-BUF support */
50 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
51 | ncclResult_t (*deregMr)(void* comm, void* mhandle);
52 | // Asynchronous send to a peer.
53 | // May return request == NULL if the call cannot be performed (or would block)
54 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
55 | // Asynchronous recv from a peer.
56 | // May return request == NULL if the call cannot be performed (or would block)
57 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
58 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
59 | // visible to the GPU
60 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
61 | // Test whether a request is complete. If sizes is not NULL, it returns the
62 | // number of bytes sent/received.
63 | ncclResult_t (*test)(void* request, int* done, int* sizes);
64 | // Close and free send/recv comm objects
65 | ncclResult_t (*closeSend)(void* sendComm);
66 | ncclResult_t (*closeRecv)(void* recvComm);
67 | ncclResult_t (*closeListen)(void* listenComm);
68 | } ncclNet_v6_t;
69 |
70 | typedef struct {
71 | // Name of the collective network (mainly for logs)
72 | const char* name;
73 | // Initialize the collective network.
74 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
75 | // Return the number of adapters capable of doing collective operations.
76 | // If ndev returns 0, all other functions might be set to NULL.
77 | ncclResult_t (*devices)(int* ndev);
78 | // Get various device properties.
79 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
80 | // Create a receiving object and provide a handle to connect to it. The
81 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
82 | // between ranks to create connections.
83 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
84 | // Create a group for collective operations. handles have been created
85 | // using listen() above. rank indicates caller's rank in the collective network.
86 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
87 | // Returns whether a reduction operation on a data type is supported.
88 | // 1 for supported, 0 otherwise.
89 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
90 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
91 | ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
92 | /* DMA-BUF support */
93 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
94 | ncclResult_t (*deregMr)(void* collComm, void* mhandle);
95 | // Performs an asynchronous allreduce operation on the collective group.
96 | // May return request == NULL if the call cannot be performed (or would block).
97 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
98 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
99 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
100 | // visible to the GPU
101 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
102 | // Test whether a request is complete. If size is not NULL, it returns the
103 | // number of bytes sent/received.
104 | ncclResult_t (*test)(void* request, int* done, int* size);
105 | // Close and free collective comm objects
106 | ncclResult_t (*closeColl)(void* collComm);
107 | ncclResult_t (*closeListen)(void* listenComm);
108 | } ncclCollNet_v6_t;
109 |
110 | #endif // end include guard
111 |
--------------------------------------------------------------------------------
/include/net_v7.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 |
7 | #ifndef NCCL_NET_V7_H_
8 | #define NCCL_NET_V7_H_
9 |
10 | #include "net_device.h"
11 |
12 | typedef struct {
13 | char* name; // Used mostly for logging.
14 | char* pciPath; // Path to the PCI device in /sys.
15 | uint64_t guid; // Unique identifier for the NIC chip. Important for
16 | // cards with multiple PCI functions (Physical or virtual).
17 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
18 | int speed; // Port speed in Mbps.
19 | int port; // Port number.
20 | float latency; // Network latency
21 | int maxComms; // Maximum number of comms we can create
22 | int maxRecvs; // Maximum number of grouped receives.
23 | ncclNetDeviceType netDeviceType; // Network offload type
24 | int netDeviceVersion; // Version number for network offload
25 | } ncclNetProperties_v7_t;
26 |
27 | typedef struct {
28 | // Name of the network (mainly for logs)
29 | const char* name;
30 | // Initialize the network.
31 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
32 | // Return the number of adapters.
33 | ncclResult_t (*devices)(int* ndev);
34 | // Get various device properties.
35 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
36 | // Create a receiving object and provide a handle to connect to it. The
37 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
38 | // between ranks to create a connection.
39 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
40 | // Connect to a handle and return a sending comm object for that peer.
41 | // This call must not block for the connection to be established, and instead
42 | // should return successfully with sendComm == NULL with the expectation that
43 | // it will be called again until sendComm != NULL.
44 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
45 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
46 | // Finalize connection establishment after remote peer has called connect.
47 | // This call must not block for the connection to be established, and instead
48 | // should return successfully with recvComm == NULL with the expectation that
49 | // it will be called again until recvComm != NULL.
50 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
51 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
52 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
53 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
54 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
55 | /* DMA-BUF support */
56 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
57 | ncclResult_t (*deregMr)(void* comm, void* mhandle);
58 | // Asynchronous send to a peer.
59 | // May return request == NULL if the call cannot be performed (or would block)
60 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
61 | // Asynchronous recv from a peer.
62 | // May return request == NULL if the call cannot be performed (or would block)
63 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
64 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
65 | // visible to the GPU
66 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
67 | // Test whether a request is complete. If sizes is not NULL, it returns the
68 | // number of bytes sent/received.
69 | ncclResult_t (*test)(void* request, int* done, int* sizes);
70 | // Close and free send/recv comm objects
71 | ncclResult_t (*closeSend)(void* sendComm);
72 | ncclResult_t (*closeRecv)(void* recvComm);
73 | ncclResult_t (*closeListen)(void* listenComm);
74 |
75 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code
76 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
77 |
78 | // Notify the plugin that a recv has completed by the device
79 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
80 | } ncclNet_v7_t;
81 |
82 | // v7 struct for backwards compatibility
83 | typedef struct {
84 | // Name of the collective network (mainly for logs)
85 | const char* name;
86 | // Initialize the collective network.
87 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
88 | // Return the number of adapters capable of doing collective operations.
89 | // If ndev returns 0, all other functions might be set to NULL.
90 | ncclResult_t (*devices)(int* ndev);
91 | // Get various device properties.
92 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
93 | // Create a receiving object and provide a handle to connect to it. The
94 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
95 | // between ranks to create connections.
96 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
97 | // Create a group for collective operations. handles have been created
98 | // using listen() above. rank indicates caller's rank in the collective network.
99 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
100 | // Returns whether a reduction operation on a data type is supported.
101 | // 1 for supported, 0 otherwise.
102 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
103 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
104 | ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
105 | /* DMA-BUF support */
106 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
107 | ncclResult_t (*deregMr)(void* collComm, void* mhandle);
108 | // Performs an asynchronous allreduce operation on the collective group.
109 | // May return request == NULL if the call cannot be performed (or would block).
110 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
111 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
112 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
113 | // visible to the GPU
114 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
115 | // Test whether a request is complete. If size is not NULL, it returns the
116 | // number of bytes sent/received.
117 | ncclResult_t (*test)(void* request, int* done, int* size);
118 | // Close and free collective comm objects
119 | ncclResult_t (*closeColl)(void* collComm);
120 | ncclResult_t (*closeListen)(void* listenComm);
121 | } ncclCollNet_v7_t;
122 |
123 | #endif // end include guard
124 |
--------------------------------------------------------------------------------
/include/net_v8.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 |
7 | #ifndef NCCL_NET_V8_H_
8 | #define NCCL_NET_V8_H_
9 | #include "net_device.h"
10 |
11 | typedef struct {
12 | char* name; // Used mostly for logging.
13 | char* pciPath; // Path to the PCI device in /sys.
14 | uint64_t guid; // Unique identifier for the NIC chip. Important for
15 | // cards with multiple PCI functions (Physical or virtual).
16 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
17 | int regIsGlobal; // regMr is not tied to a particular comm
18 | int speed; // Port speed in Mbps.
19 | int port; // Port number.
20 | float latency; // Network latency
21 | int maxComms; // Maximum number of comms we can create
22 | int maxRecvs; // Maximum number of grouped receives.
23 | ncclNetDeviceType netDeviceType; // Network offload type
24 | int netDeviceVersion; // Version number for network offload
25 | } ncclNetProperties_v8_t;
26 |
27 | typedef struct {
28 | // Name of the network (mainly for logs)
29 | const char* name;
30 | // Initialize the network.
31 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
32 | // Return the number of adapters.
33 | ncclResult_t (*devices)(int* ndev);
34 | // Get various device properties.
35 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
36 | // Create a receiving object and provide a handle to connect to it. The
37 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
38 | // between ranks to create a connection.
39 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
40 | // Connect to a handle and return a sending comm object for that peer.
41 | // This call must not block for the connection to be established, and instead
42 | // should return successfully with sendComm == NULL with the expectation that
43 | // it will be called again until sendComm != NULL.
44 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
45 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
46 | // Finalize connection establishment after remote peer has called connect.
47 | // This call must not block for the connection to be established, and instead
48 | // should return successfully with recvComm == NULL with the expectation that
49 | // it will be called again until recvComm != NULL.
50 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
51 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
52 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
53 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
54 | ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
55 | /* DMA-BUF support */
56 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
57 | ncclResult_t (*deregMr)(void* comm, void* mhandle);
58 | // Asynchronous send to a peer.
59 | // May return request == NULL if the call cannot be performed (or would block)
60 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
61 | // Asynchronous recv from a peer.
62 | // May return request == NULL if the call cannot be performed (or would block)
63 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
64 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
65 | // visible to the GPU
66 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
67 | // Test whether a request is complete. If sizes is not NULL, it returns the
68 | // number of bytes sent/received.
69 | ncclResult_t (*test)(void* request, int* done, int* sizes);
70 | // Close and free send/recv comm objects
71 | ncclResult_t (*closeSend)(void* sendComm);
72 | ncclResult_t (*closeRecv)(void* recvComm);
73 | ncclResult_t (*closeListen)(void* listenComm);
74 |
75 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code
76 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
77 |
78 | // Notify the plugin that a recv has completed by the device
79 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
80 | } ncclNet_v8_t;
81 |
82 |
83 | typedef struct { // Element type of the recvParts/sendParts arrays taken by iallgather/ireducescatter in ncclCollNet_v8_t
84 | void* mhandle; // presumably the handle obtained from regMr for this region -- TODO confirm
85 | void* address; // presumably the start of the region -- TODO confirm
86 | uint32_t size; // 32-bit field, whereas the collectives that take this type use size_t byte counts
87 | } ncclNetSGE_v8_t;
88 |
89 | typedef struct {
90 | // Name of the collective network (mainly for logs)
91 | const char* name;
92 | // Initialize the collective network.
93 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
94 | // Return the number of adapters capable of doing collective operations.
95 | // If ndev returns 0, all other functions might be set to NULL.
96 | ncclResult_t (*devices)(int* ndev);
97 | // Get various device properties.
98 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
99 | // Create a receiving object and provide a handle to connect to it. The
100 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
101 | // between ranks to create connections.
102 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
103 | // Create a group for collective operations. handles have been created
104 | // using listen() above. rank indicates caller's rank in the collective network.
105 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
106 | // Returns whether a reduction operation on a data type is supported.
107 | // 1 for supported, 0 otherwise.
108 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
109 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
110 | ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
111 | /* DMA-BUF support */
112 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
113 | ncclResult_t (*deregMr)(void* collComm, void* mhandle);
114 | // Performs an asynchronous allreduce operation on the collective group.
115 | // May return request == NULL if the call cannot be performed (or would block).
116 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
117 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
118 | ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
119 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
120 | void* sendMhandle, void** request);
121 | ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
122 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
123 | ncclDataType_t dataType, ncclRedOp_t redOp,
124 | void* recvMhandle, void** request);
125 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
126 | // visible to the GPU
127 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
128 | // Test whether a request is complete. If size is not NULL, it returns the
129 | // number of bytes sent/received.
130 | ncclResult_t (*test)(void* request, int* done, int* size);
131 | // Close and free collective comm objects
132 | ncclResult_t (*closeColl)(void* collComm);
133 | ncclResult_t (*closeListen)(void* listenComm);
134 | } ncclCollNet_v8_t;
135 |
136 |
137 | #endif // end include guard
138 |
--------------------------------------------------------------------------------
/include/net_v9.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | */
6 |
7 | #ifndef NCCL_NET_V9_H_
8 | #define NCCL_NET_V9_H_
9 | #include "net_device.h"
10 |
11 | // Max number of ncclNet objects which can live in the same process
12 | #define NCCL_NET_MAX_PLUGINS 3
13 |
14 | #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
15 |
16 | typedef struct { // Embedded in ncclNetProperties_v9_t as vProps
17 | int ndevs; // presumably the number of valid entries in devs -- TODO confirm
18 | int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; // Holds at most NCCL_NET_MAX_DEVS_PER_NIC_V9 entries
19 | } ncclNetVDeviceProps_v9_t;
20 |
21 | typedef struct {
22 | char* name; // Used mostly for logging.
23 | char* pciPath; // Path to the PCI device in /sys.
24 | uint64_t guid; // Unique identifier for the NIC chip. Important for
25 | // cards with multiple PCI functions (Physical or virtual).
26 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
27 | int regIsGlobal; // regMr is not tied to a particular comm
28 | int forceFlush; // Force a flush on receives
29 | int speed; // Port speed in Mbps.
30 | int port; // Port number.
31 | float latency; // Network latency
32 | int maxComms; // Maximum number of comms we can create
33 | int maxRecvs; // Maximum number of grouped receives.
34 | ncclNetDeviceType netDeviceType; // Network offload type
35 | int netDeviceVersion; // Version number for network offload
36 | ncclNetVDeviceProps_v9_t vProps;
37 | size_t maxP2pBytes; // Max transfer size for point-to-point operations
38 | size_t maxCollBytes; // Max transfer size for collective operations
39 | } ncclNetProperties_v9_t;
40 |
41 | typedef struct { // v9 network interface: a name plus a table of function pointers
42 | // Name of the network (mainly for logs)
43 | const char* name;
44 | // Initialize the network.
45 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
46 | // Return the number of adapters.
47 | ncclResult_t (*devices)(int* ndev);
48 | // Get various device properties.
49 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
50 | // Create a receiving object and provide a handle to connect to it. The
51 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
52 | // between ranks to create a connection.
53 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
54 | // Connect to a handle and return a sending comm object for that peer.
55 | // This call must not block for the connection to be established, and instead
56 | // should return successfully with sendComm == NULL with the expectation that
57 | // it will be called again until sendComm != NULL.
58 | // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection. The handle type here is still ncclNetDeviceHandle_v8_t.
59 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
60 | // Finalize connection establishment after remote peer has called connect.
61 | // This call must not block for the connection to be established, and instead
62 | // should return successfully with recvComm == NULL with the expectation that
63 | // it will be called again until recvComm != NULL.
64 | // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection. The handle type here is still ncclNetDeviceHandle_v8_t.
65 | ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
66 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
67 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
68 | ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
69 | /* DMA-BUF support */
70 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
71 | ncclResult_t (*deregMr)(void* comm, void* mhandle);
72 | // Asynchronous send to a peer.
73 | // May return request == NULL if the call cannot be performed (or would block)
74 | ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
75 | // Asynchronous recv from a peer.
76 | // May return request == NULL if the call cannot be performed (or would block)
77 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
78 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
79 | // visible to the GPU
80 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
81 | // Test whether a request is complete. If sizes is not NULL, it returns the
82 | // number of bytes sent/received.
83 | ncclResult_t (*test)(void* request, int* done, int* sizes);
84 | // Close and free send/recv comm objects
85 | ncclResult_t (*closeSend)(void* sendComm);
86 | ncclResult_t (*closeRecv)(void* recvComm);
87 | ncclResult_t (*closeListen)(void* listenComm);
88 |
89 | // Copy the given mhandle to a dptr in a format usable by this plugin's device code
90 | ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
91 |
92 | // Notify the plugin that a recv has completed by the device
93 | ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
94 |
95 | // Create a virtual NIC given the specified properties, which can be accessed at device index d
96 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); // NOTE(review): takes the unversioned ncclNetVDeviceProps_t, not the ncclNetVDeviceProps_v9_t declared in this header -- confirm intended
97 | } ncclNet_v9_t;
98 |
99 | typedef struct { // v9 collective network interface: a name plus a table of function pointers
100 | // Name of the collective network (mainly for logs)
101 | const char* name;
102 | // Initialize the collective network.
103 | ncclResult_t (*init)(ncclDebugLogger_t logFunction);
104 | // Return the number of adapters capable of doing collective operations.
105 | // If ndev returns 0, all other functions might be set to NULL.
106 | ncclResult_t (*devices)(int* ndev);
107 | // Get various device properties.
108 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
109 | // Create a receiving object and provide a handle to connect to it. The
110 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
111 | // between ranks to create connections.
112 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
113 | // Create a group for collective operations. handles have been created
114 | // using listen() above. rank indicates caller's rank in the collective network.
115 | ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
116 | // Returns whether a reduction operation on a data type is supported.
117 | // 1 for supported, 0 otherwise.
118 | ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
119 | // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
120 | ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
121 | /* DMA-BUF support */
122 | ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
123 | ncclResult_t (*deregMr)(void* collComm, void* mhandle);
124 | // Performs an asynchronous allreduce operation on the collective group.
125 | // May return request == NULL if the call cannot be performed (or would block).
126 | ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
127 | ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
128 | ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, // NOTE(review): SGE type is ncclNetSGE_v10_t inside this v9 struct -- confirm intended
129 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
130 | void* sendMhandle, void** request);
131 | ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, // Same ncclNetSGE_v10_t element type as iallgather
132 | size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
133 | ncclDataType_t dataType, ncclRedOp_t redOp,
134 | void* recvMhandle, void** request);
135 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
136 | // visible to the GPU
137 | ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
138 | // Test whether a request is complete. If size is not NULL, it returns the
139 | // number of bytes sent/received.
140 | ncclResult_t (*test)(void* request, int* done, int* size);
141 | // Close and free collective comm objects
142 | ncclResult_t (*closeColl)(void* collComm);
143 | ncclResult_t (*closeListen)(void* listenComm);
144 |
145 | // Create a virtual NIC given the specified properties, which can be accessed at device index d
146 | ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); // NOTE(review): takes the unversioned ncclNetVDeviceProps_t, not the ncclNetVDeviceProps_v9_t declared in this header -- confirm intended
147 | } ncclCollNet_v9_t;
148 |
149 | #endif // end include guard
150 |
--------------------------------------------------------------------------------
/include/p2p_plugin.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_P2P_PLUGIN_H_
10 | #define NCCL_P2P_PLUGIN_H_
11 |
12 | #include
13 | #include
14 | #include
15 |
16 | #include "nccl.h"
17 | #include "net.h"
18 | #include "ibvwrap.h"
19 | #include "param.h"
20 | #include "socket.h"
21 | #include "utils.h"
22 |
23 | #define MAXNAMESIZE 64
24 | #define NCCL_NET_IB_MAX_RECVS 8
25 | // We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive
26 | #define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS)
27 | //static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion");
28 | #define IB_DEVICE_SYSFS_FMT "/sys/class/infiniband/%s/device/%s"
29 |
30 | #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED"))
31 |
32 | typedef enum nccl_p2p_plugin {
33 | NCCL_P2P_IB,
34 | NCCL_P2P_UCX,
35 | NCCL_P2P_UCX_RMA,
36 | NCCL_P2P_UCX_UCT,
37 | NCCL_P2P_UCX_UCT_RD,
38 | NCCL_P2P_LAST
39 | } nccl_p2p_plugin_t;
40 |
41 | struct ncclIbMr {
42 | uintptr_t addr;
43 | size_t pages;
44 | int refs;
45 | struct ibv_mr *mr;
46 | };
47 |
48 | struct ncclIbMrCache {
49 | struct ncclIbMr *slots;
50 | int capacity, population;
51 | };
52 |
53 | #define NCCL_IB_MAX_DEVS_PER_NIC 4 // Per-NIC device limit; also used to size the merged device name buffer
54 | #define MAX_MERGED_DEV_NAME ((MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC) // One name per device plus one extra character per device; fully parenthesized so the sum stays intact inside larger expressions
55 | typedef struct ncclIbMergedDev {
56 | ncclNetVDeviceProps_t vProps;
57 | int speed;
58 | char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
59 | } __attribute__((aligned(64))) ncclIbMergedDev;
60 |
61 | struct ncclIbStats {
62 | int fatalErrorCount;
63 | };
64 |
65 | struct ncclIbRequest {
66 | struct ncclIbNetCommBase* base;
67 | int type;
68 | struct ncclSocket* sock;
69 | int events[NCCL_IB_MAX_DEVS_PER_NIC];
70 | struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC];
71 | int nreqs;
72 | union {
73 | struct {
74 | int size;
75 | void* data;
76 | uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC];
77 | int offset;
78 | } send;
79 | struct {
80 | int* sizes;
81 | } recv;
82 | };
83 | };
84 |
85 | // Retain local RoCE address for error logging
86 | struct ncclIbGidInfo {
87 | uint8_t link_layer;
88 | union ibv_gid localGid;
89 | int32_t localGidIndex;
90 | };
91 |
92 | typedef struct ncclIbNetCommDevBase {
93 | int ibDevN;
94 | struct ibv_pd* pd;
95 | struct ibv_cq* cq;
96 | uint64_t pad[2];
97 | struct ncclIbGidInfo gidInfo;
98 | } ncclIbNetCommDevBase;
99 |
100 | typedef struct ncclIbDev {
101 | pthread_mutex_t lock;
102 | int device;
103 | uint64_t guid;
104 | uint8_t portNum;
105 | uint8_t link;
106 | uint8_t isSharpDev;
107 | int speed;
108 | struct ibv_context* context;
109 | int pdRefs;
110 | struct ibv_pd* pd;
111 | char devName[MAXNAMESIZE];
112 | char *pciPath;
113 | char* virtualPciPath;
114 | int realPort;
115 | int maxQp;
116 | float latency;
117 | struct ncclIbMrCache mrCache;
118 | int ar; // ADAPTIVE_ROUTING
119 | struct ibv_port_attr portAttr;
120 | struct ncclIbStats stats;
121 | int dmaBufSupported;
122 | } __attribute__((aligned(64))) ncclIbDev;
123 |
124 |
125 | #define MAX_IB_DEVS 32 // Number of entries in ncclIbDevs
126 | #define MAX_IB_VDEVS (MAX_IB_DEVS*8) // Number of entries in ncclIbMergedDevs; parenthesized so it expands safely next to /, % or other operators
127 | extern struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
128 | extern struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
129 | /* Detect whether GDR can work on a given NIC with the current CUDA device
130 | * Returns :
131 | * ncclSuccess : GDR works
132 | * ncclSystemError : no module or module loaded but not supported by GPU */
133 | ncclResult_t nccl_p2p_gdr_support();
134 |
135 | ncclResult_t nccl_p2p_dmabuf_support(int dev);
136 |
137 | ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port);
138 |
139 | ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int ncclNMergedIbDevs, int dev, ncclNetProperties_t* props);
140 |
141 | ncclResult_t nccl_p2p_ib_init(int *nDevs, int *nmDevs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr,
142 | pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction);
143 |
144 | /* Convert value returned by ibv_query_port to actual link width */
145 | int nccl_p2p_ib_width(int width);
146 |
147 | /* Convert value returned by ibv_query_port to actual link speed */
148 | int nccl_p2p_ib_speed(int speed);
149 |
150 | int64_t ncclParamSharpMaxComms();
151 |
152 | int64_t ncclParamIbMergeVfs();
153 |
154 | int64_t ncclParamIbMergeNics();
155 |
156 | int ncclIbRelaxedOrderingCapable(void);
157 |
158 | nccl_p2p_plugin_t nccl_p2p_get_plugin_type();
159 |
160 | ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat);
161 |
162 | ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props, int nDevs, int *nmDevs);
163 |
164 | #endif
165 |
--------------------------------------------------------------------------------
/include/param.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2017-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_PARAM_H_
10 | #define NCCL_PARAM_H_
11 |
12 | #include <stdint.h>
13 |
14 | const char* userHomeDir();
15 | void setEnvFile(const char* fileName);
16 | void initEnv();
17 | const char *ncclGetEnv(const char *name);
18 |
19 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
20 |
21 | #define NCCL_PARAM(name, env, deftVal) \
22 | int64_t ncclParam##name() { \
23 | NCCL_STATIC_ASSERT(deftVal != INT64_MIN, "default value cannot be the uninitialized value."); \
24 | static int64_t cache = INT64_MIN; \
25 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == INT64_MIN, false)) { \
26 | ncclLoadParam("NCCL_" env, deftVal, INT64_MIN, &cache); \
27 | } \
28 | return cache; \
29 | }
30 |
31 | #endif
32 |
--------------------------------------------------------------------------------
/include/socket.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_SOCKET_H_
10 | #define NCCL_SOCKET_H_
11 |
12 | #include "nccl.h"
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include "stdbool.h"
20 | #include "utils.h"
21 |
22 | #define MAX_IFS 16
23 | #define MAX_IF_NAME_SIZE 16
24 | #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
25 | #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
26 |
27 | /* Common socket address storage structure for IPv4/IPv6 */
28 | union ncclSocketAddress {
29 | struct sockaddr sa;
30 | struct sockaddr_in sin;
31 | struct sockaddr_in6 sin6;
32 | };
33 |
34 | enum ncclSocketState {
35 | ncclSocketStateNone = 0,
36 | ncclSocketStateInitialized = 1,
37 | ncclSocketStateAccepting = 2,
38 | ncclSocketStateAccepted = 3,
39 | ncclSocketStateConnecting = 4,
40 | ncclSocketStateConnectPolling = 5,
41 | ncclSocketStateConnected = 6,
42 | ncclSocketStateReady = 7,
43 | ncclSocketStateTerminating = 8,
44 | ncclSocketStateClosed = 9,
45 | ncclSocketStateError = 10,
46 | ncclSocketStateNum = 11
47 |
48 | };
49 |
50 | enum ncclSocketType {
51 | ncclSocketTypeUnknown = 0,
52 | ncclSocketTypeBootstrap = 1,
53 | ncclSocketTypeProxy = 2,
54 | ncclSocketTypeNetIb = 4,
55 | ncclSocketTypeRasNetwork = 5
56 | };
57 |
58 | struct ncclSocket {
59 | int fd;
60 | int acceptFd;
61 | int errorRetries;
62 | union ncclSocketAddress addr;
63 | volatile uint32_t* abortFlag;
64 | int asyncFlag;
65 | enum ncclSocketState state;
66 | int salen;
67 | uint64_t magic;
68 | enum ncclSocketType type;
69 | int customRetry;
70 | int finalizeCounter; // Used to keep track of initial handshake for async sockets.
71 | char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets.
72 | };
73 |
74 | const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm);
75 | ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
76 | int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
77 | int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
78 |
79 | // Initialize a socket
80 | ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry);
81 | // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
82 | ncclResult_t ncclSocketListen(struct ncclSocket* sock);
83 | ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
84 | // Connect to sock->addr. sock->fd is set after a successful call.
85 | ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
86 | // Return socket connection state.
87 | ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
88 | // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
89 | ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
90 | ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
91 | ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
92 |
93 | #define NCCL_SOCKET_SEND 0
94 | #define NCCL_SOCKET_RECV 1
95 |
96 | ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed);
97 | ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
98 | ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
99 | ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
100 | ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
101 | ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait);
102 | #endif
103 |
--------------------------------------------------------------------------------
/include/timer.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_TIMER_H_
10 | #define NCCL_TIMER_H_
11 | #if ENABLE_TIMER
12 | #include
13 | #include
14 | #include
15 | static double freq = -1;
16 | static void calibrate() {
17 | struct timeval tv;
18 | gettimeofday(&tv, NULL);
19 | uint64_t timeCycles = __rdtsc();
20 | double time = - tv.tv_sec*1E6 - tv.tv_usec;
21 | uint64_t total = 0ULL;
22 | for (int i=0; i<10000; i++) total += __rdtsc();
23 | gettimeofday(&tv, NULL);
24 | timeCycles = __rdtsc() - timeCycles;
25 | time += tv.tv_sec*1E6 + tv.tv_usec;
26 | freq = timeCycles/time;
27 | }
28 | static inline double gettime() {
29 | if (freq == -1) calibrate();
30 | return __rdtsc()/freq;
31 | }
32 | static uint64_t counts[8];
33 | static double times[8];
34 | static double startTimes[8];
35 | #define TIME_START(index) do { \
36 | counts[index]++; \
37 | startTimes[index] = gettime(); \
38 | } while (0)
39 |
40 | #define TIME_STOP(index) do { \
41 | times[index] += gettime() - startTimes[index]; \
42 | } while (0)
43 |
44 | #define TIME_CANCEL(index) do { \
45 | counts[index]--; \
46 | } while (0)
47 |
48 | #define TIME_PRINT(name) do { /* print count, total and mean per used slot, then reset the slot */ \
49 | printf("%s stats", name); \
50 | for (int i=0; i<8; i++) { \
51 | if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); \
52 | counts[i] = 0; times[i] = 0; /* reset both so the next report's mean is not skewed by stale time */ \
53 | } \
54 | printf("\n"); \
55 | } while (0)
56 | #else
57 | #define TIME_START(index) do {} while(0)
58 | #define TIME_STOP(index) do {} while(0)
59 | #define TIME_CANCEL(index) do {} while(0)
60 | #define TIME_PRINT(name)
61 | #endif
62 | #endif
63 |
--------------------------------------------------------------------------------
/include/ucx_uct_lib.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_UCX_UCT_LIB_H_
10 | #define NCCL_UCX_UCT_LIB_H_
11 |
12 | #include
13 | #include
14 | #include
15 |
16 | #include "p2p_plugin.h"
17 | #include "socket.h"
18 |
19 | #include
20 |
21 | #define NCCL_UCX_UCT_MAX_RECVS NCCL_NET_IB_MAX_RECVS
22 | #define NCCL_UCT_LISTEN_HANDLE_MAGIC 0x43cf19ed91abdb85
23 | #define NCCL_UCT_REG_ALIGN 4096
24 |
25 | typedef enum {
26 | NCCL_UCT_AM_RTR = 14, /* Use particular values */
27 | NCCL_UCT_AM_ATP = 15,
28 | NCCL_UCT_AM_RTS = 16,
29 | NCCL_UCT_AM_ATS = 17
30 | } nccl_uct_am_type_t;
31 |
32 | typedef enum {
33 | NCCL_UCT_START = 0,
34 | NCCL_UCT_CONNECT,
35 | NCCL_UCT_ACCEPT,
36 | NCCL_UCT_RECEIVE_REMOTE, /* Acceptor receives ep addr/remote communicator */
37 | NCCL_UCT_RECEIVE_ADDR,
38 | NCCL_UCT_RX_READY,
39 | NCCL_UCT_DONE
40 | } nccl_uct_state_t;
41 |
42 | /* UCT EP address to exchange and connect to */
43 | typedef struct {
44 | uint8_t dev_addr_size;
45 | uint8_t ep_addr_size;
46 | uint8_t data[64];
47 | } nccl_uct_ep_addr_t;
48 |
49 | typedef struct {
50 | uct_iface_h iface;
51 | uct_md_h md;
52 | uct_component_h comp;
53 | void *addr;
54 | size_t addr_size;
55 | void *dev_addr;
56 | size_t dev_addr_size;
57 | size_t ep_addr_size;
58 | size_t rkey_packed_size;
59 |
60 | size_t am_max_short;
61 | size_t min_get_zcopy;
62 | } nccl_uct_iface_t;
63 |
64 | struct nccl_uct_context;
65 |
66 | typedef struct nccl_uct_worker {
67 | struct nccl_uct_worker *next;
68 | struct {
69 | pthread_t thread;
70 | int dev;
71 | } id;
72 |
73 | int count;
74 | ucs_async_context_t *async;
75 | uct_worker_h worker;
76 | nccl_uct_iface_t *uct_iface;
77 | struct nccl_uct_context *context;
78 | } nccl_uct_worker_t;
79 |
80 | typedef struct {
81 | uct_ep_h ep;
82 | uct_ep_addr_t *addr;
83 | size_t addr_size;
84 | nccl_uct_iface_t *uct_iface;
85 | uint8_t data[];
86 | } nccl_uct_ep_t;
87 |
88 | /* All the remote addresses for the communicator */
89 | typedef struct nccl_uct_comm_addr {
90 | nccl_uct_ep_addr_t rma;
91 | /* TODO: Add multi-QP here */
92 | } nccl_uct_comm_addr_t;
93 |
94 | /* Either Receiver or Sender communicator, connected to one peer */
95 | typedef struct nccl_uct_comm {
96 | struct ncclSocket sock;
97 | struct nccl_uct_context *context;
98 | int dev;
99 |
100 | nccl_uct_worker_t *uct_worker;
101 | nccl_uct_iface_t *uct_iface;
102 | nccl_uct_ep_t *uct_ep;
103 |
104 | struct nccl_uct_comm_remote {
105 | nccl_uct_comm_addr_t addr; /* Remote addresses */
106 | const struct nccl_uct_comm *comm; /* Cookie received in connect */
107 | } remote;
108 |
109 | /* Local GET on current device */
110 | struct {
111 | int enabled;
112 | nccl_uct_ep_t *uct_ep; /* Locally read from HCA */
113 | nccl_uct_ep_addr_t addr;
114 |
115 | uint8_t *mem; /* Dummy memory to read into */
116 | uct_mem_h memh;
117 | } gpu_flush;
118 | } nccl_uct_comm_t;
119 |
120 | /* State tracking used while connecting/accepting only */
121 | typedef struct {
122 | nccl_uct_state_t state;
123 | nccl_uct_comm_t *comm; /* current communicator being created */
124 | int offset; /* for Socket reading */
125 | int ready; /* accept must complete after connect */
126 | } nccl_uct_stage_t;
127 |
128 | /* Memory registration handle in NCCL UCT plugin returned by ->regMR() */
129 | typedef struct {
130 | uct_mem_h memh;
131 | nccl_uct_comm_t *comm;
132 | uct_rkey_bundle_t bundle;
133 | uint8_t rkey[];
134 | } nccl_uct_memh_t;
135 |
136 | /* On-the-wire handle passed OOB by NCCL from listener to connector */
137 | typedef struct {
138 | uint64_t magic;
139 | struct {
140 | union ncclSocketAddress addr;
141 | uint32_t id;
142 | } listener;
143 | nccl_uct_comm_t *comm; /* Created communicator in accept */
144 | nccl_uct_stage_t stage; /* Used by connector */
145 | } nccl_uct_listen_handle_t;
146 |
147 | /* Communicator while listening to remote ranks */
148 | typedef struct {
149 | struct ncclSocket sock;
150 | struct nccl_uct_context *context;
151 | int dev;
152 | uint32_t id;
153 | nccl_uct_worker_t *uct_worker;
154 | nccl_uct_comm_t *comm;
155 |
156 | /* Used by acceptor */
157 | nccl_uct_stage_t stage;
158 | } nccl_uct_listen_comm_t;
159 |
160 | /* Global state of the plugin */
161 | typedef struct nccl_uct_context {
162 | /* Transport to use */
163 | const char *tl_name;
164 |
165 | /* IB devices available */
166 | int dev_count;
167 | int merge_dev_count;
168 |
169 | /* Used by common code to set up communicators */
170 | struct nccl_uct_ops {
171 | ncclResult_t (*comm_alloc)(nccl_uct_comm_t **comm);
172 | ncclResult_t (*comm_init)(nccl_uct_comm_t *comm,
173 | struct nccl_uct_context *context,
174 | nccl_uct_worker_t *worker, int dev,
175 | const nccl_uct_comm_t *remote_comm);
176 | ncclResult_t (*iface_set)(nccl_uct_iface_t *uct_iface);
177 | } ops;
178 |
179 | /* Max sizes needed */
180 | size_t am_short_size;
181 | size_t rkey_size;
182 |
183 | /* OOB socket for accepting/connecting */
184 | char if_name[MAX_IF_NAME_SIZE];
185 | union ncclSocketAddress if_addr;
186 |
187 | /* Number of listeners created */
188 | uint32_t listener_count;
189 |
190 | /* List of created workers */
191 | nccl_uct_worker_t *worker_list;
192 | } nccl_uct_context_t;
193 |
194 | #define UCXCHECK(statement, failure_action, message, ...) \
195 | do { \
196 | ucs_status_t _status = statement; \
197 | if (_status != UCS_OK) { \
198 | WARN("Failed: " message ": %s", ##__VA_ARGS__, \
199 | ucs_status_string(_status)); \
200 | failure_action; \
201 | } \
202 | } while (0)
203 |
204 | extern nccl_uct_context_t context;
205 |
206 | /* Library functions */
207 | ncclResult_t nccl_uct_iface_set_handler(nccl_uct_iface_t *uct_iface, int id,
208 | uct_am_callback_t callback);
209 | ncclResult_t nccl_uct_devices(int *ndev);
210 | ncclResult_t nccl_uct_comm_init(nccl_uct_comm_t *comm,
211 | nccl_uct_context_t *context,
212 | nccl_uct_worker_t *worker, int dev,
213 | const nccl_uct_comm_t *remote_comm);
214 | void nccl_uct_comm_deinit(nccl_uct_comm_t *comm);
215 | int nccl_uct_flush_index(nccl_uct_comm_t *base, int *sizes, int n);
216 | ncclResult_t nccl_uct_flush(nccl_uct_comm_t *base_comm, void *data, int size,
217 | nccl_uct_memh_t *uct_memh,
218 | uct_completion_t *completion, void **request);
219 | void nccl_uct_empty_callback(uct_completion_t *comp);
220 |
221 | /* NCCL common plugin callbacks */
222 | ncclResult_t nccl_uct_listen(int dev, void *listen_handle, void **listen_comm);
223 | ncclResult_t nccl_uct_accept(void *listen_comm, void **recv_comm,
224 | ncclNetDeviceHandle_v7_t **recvDevComm);
225 | ncclResult_t nccl_uct_connect(int dev, ncclNetCommConfig_t* config, void *listen_handle, void **send_comm,
226 | ncclNetDeviceHandle_t **sendDevComm);
227 | ncclResult_t nccl_uct_close_listen(void *listen_comm);
228 | ncclResult_t nccl_uct_reg_mr_dmabuf(void *reg_comm, void *data, size_t size,
229 | int type, uint64_t offset, int fd,
230 | void **mhandle);
231 | ncclResult_t nccl_uct_reg_mr(void *reg_comm, void *data, size_t size, int type,
232 | void **mhandle);
233 | ncclResult_t nccl_uct_dereg_mr(void *dereg_comm, void *mhandle);
234 |
235 | /* Compatibility callback */
236 | ncclResult_t nccl_uct_get_properties_v9(int dev,
237 | ncclNetProperties_v9_t *props_v9);
238 | ncclResult_t nccl_uct_get_properties_v8(int dev,
239 | ncclNetProperties_v8_t *props_v8);
240 | ncclResult_t nccl_uct_get_properties_v7(int dev,
241 | ncclNetProperties_v7_t *props_v7);
242 | ncclResult_t nccl_uct_reg_mr_v7(void *comm, void *data, int size, int type,
243 | void **mhandle);
244 | ncclResult_t nccl_uct_get_properties_v6(int dev,
245 | ncclNetProperties_v6_t *props_v6);
246 | ncclResult_t nccl_uct_connect_v9(int dev, void *listen_handle, void **send_comm,
247 | ncclNetDeviceHandle_t **sendDevComm);
248 | ncclResult_t nccl_uct_connect_v6(int dev, void *handle, void **send_comm);
249 | ncclResult_t nccl_uct_accept_v6(void *listen_comm, void **recv_comm);
250 | ncclResult_t nccl_uct_get_properties(int dev, ncclNetProperties_t *props);
251 |
252 |
253 | #define NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_func, get_properties_func, \
254 | connect_func, accept_func, reg_mr_func, \
255 | isend_func, irecv_func) \
256 | { \
257 | .name = plugin_name, \
258 | .init = prefix##_##init_func, \
259 | .devices = nccl_uct_devices, \
260 | .getProperties = get_properties_func, \
261 | .listen = nccl_uct_listen, \
262 | .connect = connect_func, \
263 | .accept = accept_func, \
264 | .regMr = reg_mr_func, \
265 | .regMrDmaBuf = nccl_uct_reg_mr_dmabuf, \
266 | .deregMr = nccl_uct_dereg_mr, \
267 | .isend = prefix##_##isend_func, \
268 | .irecv = prefix##_##irecv_func, \
269 | .iflush = prefix##_iflush, \
270 | .test = prefix##_test, \
271 | .closeSend = prefix##_close, \
272 | .closeRecv = prefix##_close, \
273 | .closeListen = nccl_uct_close_listen \
274 | }
275 |
276 | #define NCCL_UCT_PLUGIN_V10(plugin_name, prefix) \
277 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init, nccl_uct_get_properties, \
278 | nccl_uct_connect, nccl_uct_accept, nccl_uct_reg_mr, \
279 | isend, irecv)
280 |
281 | #define NCCL_UCT_PLUGIN_V9(plugin_name, prefix) \
282 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v9, \
283 | nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr, \
284 | isend_v9, irecv_v9)
285 |
286 | #define NCCL_UCT_PLUGIN_V8(plugin_name, prefix) \
287 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v8, \
288 | nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr, \
289 | isend_v8, irecv_v8)
290 |
291 | #define NCCL_UCT_PLUGIN_V7(plugin_name, prefix) \
292 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v7, \
293 | nccl_uct_connect_v9, nccl_uct_accept, nccl_uct_reg_mr_v7, \
294 | isend_v8, irecv_v8)
295 |
296 | #define NCCL_UCT_PLUGIN_V6(plugin_name, prefix) \
297 | NCCL_UCT_PLUGIN_BASE(plugin_name, prefix, init_v9, nccl_uct_get_properties_v6, \
298 | nccl_uct_connect_v6, nccl_uct_accept_v6, \
299 | nccl_uct_reg_mr_v7, isend_v8, irecv_v8)
300 |
301 | #define NCCL_UCT_PLUGIN_V5(plugin_name, prefix) \
302 | { \
303 | .name = plugin_name, \
304 | .init = prefix##_init_v9, \
305 | .devices = nccl_uct_devices, \
306 | .getProperties = nccl_uct_get_properties_v6, \
307 | .listen = nccl_uct_listen, \
308 | .connect = nccl_uct_connect_v6, \
309 | .accept = nccl_uct_accept_v6, \
310 | .regMr = nccl_uct_reg_mr_v7, \
311 | .deregMr = nccl_uct_dereg_mr, \
312 | .isend = prefix##_isend_v8, \
313 | .irecv = prefix##_irecv_v8, \
314 | .iflush = prefix##_iflush, \
315 | .test = prefix##_test, \
316 | .closeSend = prefix##_close, \
317 | .closeRecv = prefix##_close, \
318 | .closeListen = nccl_uct_close_listen \
319 | }
320 |
321 | #endif /* NCCL_UCX_UCT_LIB_H_ */
322 |
--------------------------------------------------------------------------------
/include/ucx_uct_ring.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_UCX_UCT_RING_H_
10 | #define NCCL_UCX_UCT_RING_H_
11 |
12 | #include "nccl.h"
13 | #include
14 |
15 | #define NCCL_UCT_RING_SIZE (1 << 7)
16 | #define NCCL_UCT_RING_MASK (NCCL_UCT_RING_SIZE - 1)
17 |
18 | typedef struct nccl_uct_ring {
19 | unsigned first;
20 | unsigned last;
21 | unsigned size;
22 | unsigned entry_size;
23 | int tag[NCCL_UCT_RING_SIZE];
24 | void *entry;
25 | } nccl_uct_ring_t;
26 |
27 | static inline ncclResult_t nccl_uct_ring_init(nccl_uct_ring_t *ring,
28 | unsigned entry_size) {
29 | int i;
30 | /* Start with an empty ring and storage for NCCL_UCT_RING_SIZE entries of entry_size bytes */
31 | ring->first = 0;
32 | ring->last = 0;
33 | ring->entry_size = entry_size;
34 | ring->entry = malloc((size_t)entry_size * NCCL_UCT_RING_SIZE); /* size_t product avoids unsigned wrap */
35 | if (ring->entry == NULL) {
36 | /* Allocation failed: there is nothing to release */
37 | return ncclSystemError;
38 | }
39 | /* INT_MAX is the tag value that marks a slot as free */
40 | for (i = 0; i < NCCL_UCT_RING_SIZE; i++) {
41 | ring->tag[i] = INT_MAX;
42 | }
43 | return ncclSuccess;
44 | }
45 |
46 | static inline void nccl_uct_ring_deinit(nccl_uct_ring_t *ring) {
47 | free(ring->entry);
48 | }
49 |
50 | static inline void *nccl_uct_ring_get_entry(nccl_uct_ring_t *ring, unsigned i) {
51 | return (uint8_t*)ring->entry + (ring->entry_size * (i & NCCL_UCT_RING_MASK));
52 | }
53 |
54 | static inline void nccl_uct_ring_append(nccl_uct_ring_t *ring, int tag,
55 | void *data, size_t len) {
56 | int j = ring->last & NCCL_UCT_RING_MASK;
57 |
58 | ring->last++;
59 |
60 | assert((ring->last & NCCL_UCT_RING_MASK) !=
61 | (ring->first & NCCL_UCT_RING_MASK));
62 | assert(ring->tag[j] == INT_MAX);
63 | assert(len == ring->entry_size);
64 |
65 | ring->tag[j] = tag;
66 | memcpy(nccl_uct_ring_get_entry(ring, j), data, len);
67 | }
68 |
69 | static inline int nccl_uct_ring_is_empty(const nccl_uct_ring_t *ring) {
70 | return ring->first == ring->last;
71 | }
72 |
73 | static inline void nccl_uct_ring_consume(nccl_uct_ring_t *ring, unsigned i) {
74 | unsigned j = i & NCCL_UCT_RING_MASK;
75 |
76 | assert(ring->tag[j] != INT_MAX);
77 | ring->tag[j] = INT_MAX;
78 |
79 | /* Cleanup upon tag hit */
80 | if (i == ring->first) {
81 | for (; i != ring->last; i++) {
82 | j = i & NCCL_UCT_RING_MASK;
83 | if (ring->tag[j] != INT_MAX) {
84 | break;
85 | }
86 | ring->first = i + 1;
87 | }
88 | }
89 | }
90 |
91 | static inline unsigned nccl_uct_ring_find(nccl_uct_ring_t *ring, int tag) {
92 | unsigned i;
93 |
94 | assert(tag != INT_MAX);
95 |
96 | for (i = ring->first; i != ring->last; i++) {
97 | if (ring->tag[i & NCCL_UCT_RING_MASK] == tag) {
98 | return i;
99 | }
100 | }
101 |
102 | return ring->last;
103 | }
104 |
105 | #endif /* NCCL_UCX_UCT_RING_H_ */
106 |
--------------------------------------------------------------------------------
/include/utils.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #ifndef NCCL_UTILS_H_
10 | #define NCCL_UTILS_H_
11 |
12 | #include "nccl.h"
13 | #include
14 |
15 | #define NCCL_STATIC_ASSERT(_cond, _msg) \
16 | switch(0) {case 0:case (_cond):;}
17 |
18 | ncclResult_t ncclIbMalloc(void** ptr, size_t size);
19 | ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size);
20 | ncclResult_t getHostName(char* hostname, int maxlen);
21 | uint64_t getHostHash();
22 | uint64_t getPidHash();
23 |
24 | struct netIf {
25 | char prefix[64];
26 | int port;
27 | };
28 |
29 | int parseStringList(const char* string, struct netIf* ifList, int maxList);
30 | int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact);
31 | const char *get_plugin_lib_path();
32 |
33 | #endif
34 |
--------------------------------------------------------------------------------
/m4/sharp.m4:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2001-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # See file LICENSE for terms.
4 | #
5 |
6 | AC_DEFUN([CHECK_SHARP],[
7 |
8 | AS_IF([test "x$sharp_checked" != "xyes"],[
9 |
10 | sharp_happy="no"
11 |
12 | AC_ARG_WITH([sharp],
13 | [AS_HELP_STRING([--with-sharp=(DIR)], [Enable the use of SHARP (default is guess).])],
14 | [], [with_sharp=guess])
15 |
16 | AS_IF([test "x$with_sharp" != "xno"],
17 | [
18 | save_CPPFLAGS="$CPPFLAGS"
19 | save_CFLAGS="$CFLAGS"
20 | save_LDFLAGS="$LDFLAGS"
21 |
22 | AS_IF([test ! -z "$with_sharp" -a "x$with_sharp" != "xyes" -a "x$with_sharp" != "xguess"],
23 | [
24 | check_sharp_dir="$with_sharp"
25 | check_sharp_libdir="$with_sharp/lib"
26 | CPPFLAGS="-I$with_sharp/include $save_CPPFLAGS"
27 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
28 | ])
29 |
30 | AS_IF([test "x$check_sharp_dir" = "x" -a "x$HPCX_SHARP_DIR" != "x"],
31 | [
32 | check_sharp_dir="$HPCX_SHARP_DIR"
33 | check_sharp_libdir="$HPCX_SHARP_DIR/lib"
34 | CPPFLAGS="-I$check_sharp_dir/include $save_CPPFLAGS"
35 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
36 | ])
37 |
38 | AS_IF([test "x$check_sharp_dir" = "x" -a -d "/opt/mellanox/sharp/"],
39 | [
40 | check_sharp_dir="/opt/mellanox/sharp/"
41 | check_sharp_libdir="/opt/mellanox/sharp/lib"
42 | CPPFLAGS="-I$check_sharp_dir/include $save_CPPFLAGS"
43 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
44 | ])
45 |
46 |
47 | AS_IF([test ! -z "$with_sharp_libdir" -a "x$with_sharp_libdir" != "xyes"],
48 | [
49 | check_sharp_libdir="$with_sharp_libdir"
50 | LDFLAGS="-L$check_sharp_libdir $save_LDFLAGS"
51 | ])
52 |
53 | AC_CHECK_HEADERS([sharp/api/sharp_coll.h],
54 | [
55 | AC_CHECK_LIB([sharp_coll], [sharp_coll_init],
56 | [
57 | sharp_happy="yes"
58 | ],
59 | [
60 | sharp_happy="no"
61 | ])
62 | ],
63 | [
64 | sharp_happy="no"
65 | ])
66 |
67 | AS_IF([test "x$sharp_happy" = "xyes"],
68 | [
69 | AS_IF([test "x$check_sharp_dir" != "x"],
70 | [
71 | AC_MSG_RESULT([SHARP dir: $check_sharp_dir])
72 | AC_SUBST(SHARP_CPPFLAGS, "-I$check_sharp_dir/include/")
73 | ])
74 |
75 | AS_IF([test "x$check_sharp_libdir" != "x"],
76 | [
77 | AC_SUBST(SHARP_LDFLAGS, "-L$check_sharp_libdir")
78 | ])
79 |
80 | AC_SUBST(SHARP_LIBADD, "-lsharp_coll")
81 | AC_CHECK_DECLS([SHARP_DTYPE_BFLOAT16], [AC_DEFINE([HAVE_SHARP_DTYPE_BFLOAT16_UINT8_INT8], 1,
82 | [SHARP v3 datatypes : bfloat16, uint8, int8])], [],
83 | [[#include <sharp/api/sharp_coll.h>]])
84 | AC_CHECK_DECLS([sharp_coll_reg_mr_v2], [], [], [[#include <sharp/api/sharp_coll.h>]])
85 |
86 | ],
87 | [
88 | AS_IF([test "x$with_sharp" != "xguess"],
89 | [
90 | AC_MSG_ERROR([SHARP support is requested but SHARP packages cannot be found])
91 | ],
92 | [
93 | AC_MSG_WARN([SHARP not found])
94 | ])
95 | ])
96 |
97 | CFLAGS="$save_CFLAGS"
98 | CPPFLAGS="$save_CPPFLAGS"
99 | LDFLAGS="$save_LDFLAGS"
100 |
101 | ],
102 | [
103 | AC_MSG_WARN([SHARP was explicitly disabled])
104 | ])
105 |
106 | sharp_checked=yes
107 | AM_CONDITIONAL([HAVE_SHARP_PLUGIN], [test "x$sharp_happy" != xno])
108 | ])
109 |
110 | ])
111 |
--------------------------------------------------------------------------------
/m4/ucx.m4:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2001-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # See file LICENSE for terms.
4 | #
5 |
6 | AC_DEFUN([CHECK_UCX],[
7 |
8 | AS_IF([test "x$ucx_checked" != "xyes"],[
9 |
10 | ucx_happy="no"
11 |
12 | AC_ARG_WITH([ucx],
13 | [AS_HELP_STRING([--with-ucx=(DIR)], [Enable the use of UCX (default is guess).])],
14 | [], [with_ucx=guess])
15 |
16 | AS_IF([test "x$with_ucx" != "xno"],
17 | [
18 | save_CPPFLAGS="$CPPFLAGS"
19 | save_CFLAGS="$CFLAGS"
20 | save_LDFLAGS="$LDFLAGS"
21 |
22 | AS_IF([test ! -z "$with_ucx" -a "x$with_ucx" != "xyes" -a "x$with_ucx" != "xguess"],
23 | [
24 | check_ucx_dir="$with_ucx"
25 | check_ucx_libdir="$with_ucx/lib"
26 | CPPFLAGS="-I$with_ucx/include $save_CPPFLAGS"
27 | LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS"
28 | ])
29 |
30 | AS_IF([test "x$check_ucx_dir" = "x" -a "x$HPCX_UCX_DIR" != "x"],
31 | [
32 | check_ucx_dir="$HPCX_UCX_DIR"
33 | check_ucx_libdir="$HPCX_UCX_DIR/lib"
34 | CPPFLAGS="-I$check_ucx_dir/include $save_CPPFLAGS"
35 | LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS"
36 | ])
37 |
38 | AS_IF([test ! -z "$with_ucx_libdir" -a "x$with_ucx_libdir" != "xyes"],
39 | [
40 | check_ucx_libdir="$with_ucx_libdir"
41 | LDFLAGS="-L$check_ucx_libdir $save_LDFLAGS"
42 | ])
43 |
44 | AC_CHECK_HEADERS([ucp/api/ucp.h],
45 | [
46 | AC_CHECK_LIB([ucp], [ucp_tag_send_nb],
47 | [
48 | ucx_happy="yes"
49 | ],
50 | [
51 | ucx_happy="no"
52 | ], [-luct -lucm -lucs])
53 | ],
54 | [
55 | ucx_happy="no"
56 | ])
57 |
58 | AS_IF([test "x$ucx_happy" = "xyes"],
59 | [
60 | AS_IF([test "x$check_ucx_dir" != "x"],
61 | [
62 | AC_MSG_RESULT([UCX dir: $check_ucx_dir])
63 | AC_SUBST(UCX_CPPFLAGS, "-I$check_ucx_dir/include/")
64 | ])
65 |
66 | AS_IF([test "x$check_ucx_libdir" != "x"],
67 | [
68 | AC_SUBST(UCX_LDFLAGS, "-L$check_ucx_libdir")
69 | ])
70 |
71 | AC_SUBST(UCX_LIBADD, "-lucp -lucs -lucm -luct")
72 | ],
73 | [
74 | AS_IF([test "x$with_ucx" != "xguess"],
75 | [
76 | AC_MSG_ERROR([UCX support is requested but UCX packages cannot be found])
77 | ],
78 | [
79 | AC_MSG_WARN([UCX not found])
80 | ])
81 | ])
82 |
83 | CFLAGS="$save_CFLAGS"
84 | CPPFLAGS="$save_CPPFLAGS"
85 | LDFLAGS="$save_LDFLAGS"
86 |
87 | ],
88 | [
89 | AC_MSG_WARN([UCX was explicitly disabled])
90 | ])
91 |
92 | ucx_checked=yes
93 | AM_CONDITIONAL([HAVE_UCX_PLUGIN], [test "x$ucx_happy" != xno])
94 | ])
95 |
96 | ])
97 |
--------------------------------------------------------------------------------
/nccl-rdma-sharp-plugins.pc.in:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # Redistribution and use in source and binary forms, with or without
7 | # modification, are permitted provided that the following conditions are met:
8 | #
9 | # 1. Redistributions of source code must retain the above copyright notice, this
10 | # list of conditions and the following disclaimer.
11 | #
12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
13 | # this list of conditions and the following disclaimer in the documentation
14 | # and/or other materials provided with the distribution.
15 | #
16 | # 3. Neither the name of the copyright holder nor the names of its
17 | # contributors may be used to endorse or promote products derived from
18 | # this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | #
31 |
32 | prefix = @prefix@
33 | exec_prefix = @exec_prefix@
34 | libdir = @libdir@
35 |
36 | Name: @PACKAGE@
37 | URL: @PACKAGE_URL@
38 | Description: RDMA and SHARP plugins for NCCL Collective library
39 | Version: @MAJOR_VERSION@.@MINOR_VERSION@
40 | Libs: -L${libdir} -lnccl-net
41 |
42 |
--------------------------------------------------------------------------------
/nccl-rdma-sharp-plugins.spec.in:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # Redistribution and use in source and binary forms, with or without
7 | # modification, are permitted provided that the following conditions are met:
8 | #
9 | # 1. Redistributions of source code must retain the above copyright notice, this
10 | # list of conditions and the following disclaimer.
11 | #
12 | # 2. Redistributions in binary form must reproduce the above copyright notice,
13 | # this list of conditions and the following disclaimer in the documentation
14 | # and/or other materials provided with the distribution.
15 | #
16 | # 3. Neither the name of the copyright holder nor the names of its
17 | # contributors may be used to endorse or promote products derived from
18 | # this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | #
31 |
32 | %global rel @RPM_RELEASE@
33 | %global version @VERSION@
34 | %global pkgname @PACKAGE@
35 | %global prefix @prefix@
36 | %global __check_files %{nil}
37 | %global _libdir %{prefix}/lib
38 | %{!?configure_opts: %global configure_opts %{nil}}
39 | %global debug_package %{nil}
40 | %bcond_with valgrind
41 | %global _binary_filedigest_algorithm 1
42 | %global _source_filedigest_algorithm 1
43 |
44 | %global lt_release @LT_RELEASE@
45 | %global lt_version @LT_CURRENT@.@LT_REVISION@.@LT_AGE@
46 |
47 | Name: %{pkgname}
48 | Summary: RDMA and SHARP plugins for NCCL
49 | Version: %{version}
50 | Release: %{rel}
51 |
52 | License: BSD-3-Clause
53 | Group: Applications
54 | Source: %{pkgname}-%{version}.tar.gz
55 | Requires: libibverbs
56 | %if 0%{?suse_version} < 1100
57 | BuildRequires: gcc-c++ libibverbs-devel binutils
58 | %else
59 | BuildRequires: gcc-c++ libibverbs-devel binutils-devel
60 | %endif
61 | %if %{with valgrind}
62 | BuildRequires: valgrind-devel
63 | %endif
64 |
65 | BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
66 | URL: http://www.mellanox.com
67 | Prefix: %{prefix}
68 | Provides: nccl-rdma-sharp-plugins
69 | Vendor: mellanox
70 |
71 |
72 | %description
73 | Provides RDMA and SHARP plugins for NCCL Collective library
74 |
75 | %prep
76 | rm -rf $RPM_BUILD_ROOT
77 |
78 | %setup -q
79 |
80 | %build
81 | ./configure
82 | make %{?_smp_mflags}
83 |
84 | %install
85 |
86 | rm -rf "$RPM_BUILD_ROOT"
87 |
88 | # Strip out some dependencies
89 | cat > find-requires.sh <<'EOF'
90 | exec %{__find_requires} "$@" | egrep -v '^perl'
91 | EOF
92 | chmod +x find-requires.sh
93 | %global _use_internal_dependency_generator 0
94 | %global __find_requires %{_builddir}/%{buildsubdir}/find-requires.sh
95 |
96 | make DESTDIR="$RPM_BUILD_ROOT" install
97 | mkdir -p $RPM_BUILD_ROOT/etc/ld.so.conf.d/
98 | echo %{_libdir} > $RPM_BUILD_ROOT/etc/ld.so.conf.d/nccl-rdma-sharp-plugins.conf
99 | mkdir -p $RPM_BUILD_ROOT/usr/lib64/pkgconfig
100 | cp nccl-rdma-sharp-plugins.pc $RPM_BUILD_ROOT/usr/lib64/pkgconfig
101 |
102 | %clean
103 | # We may be in the directory that we're about to remove, so cd out of
104 | # there before we remove it
105 | cd /tmp
106 |
107 | # Remove installed driver after rpm build finished
108 | chmod -R o+w $RPM_BUILD_DIR/%{name}-%{version}
109 | rm -rf $RPM_BUILD_DIR/%{name}-%{version}
110 |
111 | test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
112 |
113 |
114 | %files
115 | %defattr(-, root, root)
116 | %{prefix}
117 | /etc/ld.so.conf.d/nccl-rdma-sharp-plugins.conf
118 | /usr/lib64/pkgconfig/nccl-rdma-sharp-plugins.pc
119 |
120 |
121 | # Your application file list goes here
122 | # %{prefix}/lib/lib*.so*
123 | #%doc COPYRIGHT ChangeLog README AUTHORS NEWS
124 | #%doc doc/*
125 |
126 | # If you install a library
127 | %post
128 | /sbin/ldconfig || exit 1
129 |
130 | # If you install a library
131 | %postun
132 | /sbin/ldconfig
133 | exit 0
134 |
135 |
--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | # Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # See file LICENSE for terms.
6 | #
7 |
8 | lib_LTLIBRARIES = libnccl-net.la
9 |
10 | libnccl_net_la_CPPFLAGS = -I$(top_srcdir)/include
11 | libnccl_net_la_CFLAGS = $(CFLAGS) -DGNU_SOURCE
12 | libnccl_net_la_LIBADD = -lcudart_static
13 | libnccl_net_la_LDFLAGS = $(LDFLAGS)
14 |
15 | libnccl_net_la_SOURCES = \
16 | ibvwrap.c \
17 | utils.c \
18 | param.c \
19 | socket.c \
20 | p2p_plugin.c \
21 | ib_plugin.c
22 |
23 | if HAVE_UCX_PLUGIN
24 | libnccl_net_la_CPPFLAGS += -DHAVE_UCX_PLUGIN $(UCX_CPPFLAGS)
25 | libnccl_net_la_LIBADD += $(UCX_LIBADD)
26 | libnccl_net_la_LDFLAGS += $(UCX_LDFLAGS)
27 | libnccl_net_la_SOURCES += \
28 | ucx_plugin.c \
29 | ucx_rma_plugin.c \
30 | ucx_uct_lib.c \
31 | ucx_uct_plugin.c \
32 | ucx_uct_rd_plugin.c
33 | endif
34 |
35 | if HAVE_SHARP_PLUGIN
36 | libnccl_net_la_CPPFLAGS += -DHAVE_SHARP_PLUGIN $(SHARP_CPPFLAGS)
37 | libnccl_net_la_LIBADD += $(SHARP_LIBADD)
38 | libnccl_net_la_LDFLAGS += $(SHARP_LDFLAGS)
39 | libnccl_net_la_SOURCES += sharp_plugin.c
40 | endif
41 |
# Provide the alternative plugin name as a symlink next to the installed
# library. $(DESTDIR) must be honoured so that staged installs (e.g. the RPM
# build, which runs `make DESTDIR=... install`) create the link inside the
# staging tree rather than in the live $(libdir).
install-exec-hook:
	cd $(DESTDIR)$(libdir) && ln -sf libnccl-net.so libnccl-net-ibext.so
44 |
--------------------------------------------------------------------------------
/src/ibvwrap.c:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2015-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #include
10 | #include
11 |
12 | #include "ibvwrap.h"
13 | #include "utils.h"
14 | #include "nccl.h"
15 | #include "param.h"
16 |
/*
 * Result-checking helpers used as the body of the wrap_ibv_* functions below.
 * Every macro expands to several statements and ends with a `return`, so it
 * is only usable as the final statement of a function returning ncclResult_t.
 */

/* Assign `call` to `retval`. If it equals `error_retval`, warn with
 * strerror(errno) and return ncclSystemError; otherwise return ncclSuccess. */
#define IBV_PTR_CHECK_ERRNO(call, retval, error_retval, name) \
  retval = call; \
  if (retval == error_retval) { \
    WARN("Call to " name " failed with error %s", strerror(errno)); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* Same as IBV_PTR_CHECK_ERRNO but the warning does not include errno. */
#define IBV_PTR_CHECK(call, retval, error_retval, name) \
  retval = call; \
  if (retval == error_retval) { \
    WARN("Call to " name " failed"); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* For calls whose return value is itself an errno code and which may be
 * unsupported: ENOTSUP/EOPNOTSUPP sets *supported = 0 and returns ncclSuccess;
 * any other value different from `success_retval` is logged and returns
 * ncclSystemError (with *supported = 1); success sets *supported = 1. */
#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(call, success_retval, name, supported) \
  int ret = call; \
  if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
    INFO(NCCL_NET, "Call to " name " not supported"); \
    *supported = 0; \
    return ncclSuccess; \
  } else if (ret != success_retval) { \
    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    *supported = 1; \
    return ncclSystemError; \
  } \
  *supported = 1; \
  return ncclSuccess;

/* For calls whose return value is itself an errno code: anything other than
 * `success_retval` is logged via strerror(ret) and returns ncclSystemError. */
#define IBV_INT_CHECK_RET_ERRNO(call, success_retval, name) \
  int ret = call; \
  if (ret != success_retval) { \
    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* For calls that signal failure with one specific value (`error_retval`). */
#define IBV_INT_CHECK(call, error_retval, name) \
  int ret = call; \
  if (ret == error_retval) { \
    WARN("Call to " name " failed"); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* Evaluate `call`, ignore any result and return ncclSuccess. */
#define IBV_PASSTHRU(call) \
  call; \
  return ncclSuccess;
66 |
/*
 * Tunables read by wrap_ibv_modify_qp() through the generated
 * ncclParamIbMQpRetry*() accessors:
 *  - IbMQpRetryAll:     when non-zero, every non-zero return code is retried
 *                       (see IBV_MQP_RETRY_ERRNO_ALL), default 0
 *  - IbMQpRetryCnt:     number of retries after the first attempt, default 34
 *  - IbMQpRetryTimeout: base sleep between attempts in milliseconds; it is
 *                       multiplied by the attempt number, default 100
 * NOTE(review): the environment variable prefix applied by NCCL_PARAM is
 * defined in param.h and is not visible here.
 */
NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0);
NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34);
NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds

/* True when `e` equals `code` or its negation. */
#define IBV_ERR_EQ(e, code) (e == code || e == (-code))
/* Return codes retried by default: only ETIMEDOUT (either sign). */
#define IBV_MQP_RETRY_ERRNO(e) (IBV_ERR_EQ(e, ETIMEDOUT))
/* Retry predicate honouring the IbMQpRetryAll parameter. */
#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e))
74 |
/* Calls ibv_fork_init(); a return value of -1 is reported as ncclSystemError. */
ncclResult_t wrap_ibv_fork_init() {
  IBV_INT_CHECK(ibv_fork_init(), -1, "ibv_fork_init");
}
78 |
79 | ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
80 | *ret = ibv_get_device_list(num_devices);
81 | if (*ret == NULL) *num_devices = 0;
82 | return ncclSuccess;
83 | }
84 |
/* Releases a list obtained from ibv_get_device_list(); always returns ncclSuccess. */
ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
  IBV_PASSTHRU(ibv_free_device_list(list));
}
88 |
/* Direct pass-through to ibv_get_device_name(); the result is returned unchanged. */
const char *wrap_ibv_get_device_name(struct ibv_device *device) {
  return ibv_get_device_name(device);
}
92 |
/*
 * Opens `device` and stores the context in *ret.
 * Returns ncclSystemError when ibv_open_device() yields NULL, ncclSuccess
 * otherwise.
 */
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) {
  IBV_PTR_CHECK(ibv_open_device(device), *ret, NULL, "ibv_open_device");
}
96 |
/* ibv_close_device() returns 0 on success and -1 on failure; -1 maps to ncclSystemError. */
ncclResult_t wrap_ibv_close_device(struct ibv_context *context) {
  IBV_INT_CHECK(ibv_close_device(context), -1, "ibv_close_device");
}
100 |
/* ibv_get_async_event() returns 0 on success and -1 on error; -1 maps to ncclSystemError. */
ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) {
  IBV_INT_CHECK(ibv_get_async_event(context, event), -1, "ibv_get_async_event");
}
104 |
/* Acknowledges an async event via ibv_ack_async_event(); always returns ncclSuccess. */
ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
  IBV_PASSTHRU(ibv_ack_async_event(event));
}
108 |
/*
 * ibv_query_device() returns 0 on success, or the value of errno on failure
 * (which indicates the failure reason); non-zero maps to ncclSystemError.
 */
ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) {
  IBV_INT_CHECK_RET_ERRNO(ibv_query_device(context, device_attr), 0, "ibv_query_device");
}
112 |
/*
 * ibv_query_port() returns 0 on success, or the value of errno on failure
 * (which indicates the failure reason); non-zero maps to ncclSystemError.
 */
ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) {
  IBV_INT_CHECK_RET_ERRNO(ibv_query_port(context, port_num, port_attr), 0, "ibv_query_port");
}
116 |
/* Queries GID `index` of `port_num` into *gid; a non-zero return is logged and maps to ncclSystemError. */
ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
  IBV_INT_CHECK_RET_ERRNO(ibv_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
}
120 |
/* Queries the attributes selected by `attr_mask`; a non-zero return is logged and maps to ncclSystemError. */
ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
  IBV_INT_CHECK_RET_ERRNO(ibv_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
}
124 |
/* Allocates a protection domain into *ret; a NULL result maps to ncclSystemError. */
ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
  IBV_PTR_CHECK(ibv_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
}
128 |
/*
 * ibv_dealloc_pd() returns 0 on success, or the value of errno on failure
 * (which indicates the failure reason); non-zero maps to ncclSystemError.
 */
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) {
  IBV_INT_CHECK_RET_ERRNO(ibv_dealloc_pd(pd), 0, "ibv_dealloc_pd");
}
132 |
/* Registers [addr, addr+length) with `pd` into *ret; a NULL result maps to ncclSystemError. */
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
  IBV_PTR_CHECK(ibv_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
}
136 |
/* Unchecked variant: returns the ibv_reg_mr() result as-is, with no logging or error mapping. */
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
  return ibv_reg_mr(pd, addr, length, access);
}
140 |
/*
 * Registers memory with an explicit IOVA via ibv_reg_mr_iova2().
 * Only compiled in when HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING is set;
 * otherwise the wrapper unconditionally returns ncclSystemError and leaves
 * *ret untouched.
 */
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
#if HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING
  IBV_PTR_CHECK(ibv_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
#else
  return ncclSystemError;
#endif
}
148 |
/*
 * DMA-BUF support.
 * Registers a dmabuf-backed region via ibv_reg_dmabuf_mr() into *ret; a NULL
 * result is logged with strerror(errno) and maps to ncclSystemError.
 * Without HAVE_DECL_IBV_REG_DMABUF_MR the wrapper always returns
 * ncclSystemError.
 */
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
#if HAVE_DECL_IBV_REG_DMABUF_MR
  IBV_PTR_CHECK_ERRNO(ibv_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
#else
  return ncclSystemError;
#endif
}
157 |
/*
 * Unchecked variant of the dmabuf registration: returns the
 * ibv_reg_dmabuf_mr() result as-is. When the symbol is not available at build
 * time it returns NULL with errno set to EOPNOTSUPP.
 */
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
#if HAVE_DECL_IBV_REG_DMABUF_MR
  return ibv_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
#else
  errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
  return NULL;
#endif
}
166 |
/*
 * ibv_dereg_mr() returns 0 on success, or the value of errno on failure
 * (which indicates the failure reason); non-zero maps to ncclSystemError.
 */
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) {
  IBV_INT_CHECK_RET_ERRNO(ibv_dereg_mr(mr), 0, "ibv_dereg_mr");
}
170 |
/* Creates a completion queue into *ret; a NULL result is logged with strerror(errno) and maps to ncclSystemError. */
ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
  IBV_PTR_CHECK_ERRNO(ibv_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
}
174 |
/* Destroys a completion queue; a non-zero return is treated as an errno code and maps to ncclSystemError. */
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
  IBV_INT_CHECK_RET_ERRNO(ibv_destroy_cq(cq), 0, "ibv_destroy_cq");
}
178 |
/* Destroys a queue pair; a non-zero return is treated as an errno code and maps to ncclSystemError. */
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
  IBV_INT_CHECK_RET_ERRNO(ibv_destroy_qp(qp), 0, "ibv_destroy_qp");
}
182 |
/* Creates a queue pair into *ret; a NULL result is logged with strerror(errno) and maps to ncclSystemError. */
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
  IBV_PTR_CHECK_ERRNO(ibv_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
}
186 |
187 | static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) {
188 | switch (state) {
189 | case (IBV_QPS_RESET): snprintf(msg, len, "RESET"); break;
190 | case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break;
191 | case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break;
192 | case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break;
193 | case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break;
194 | case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break;
195 | case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break;
196 | case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break;
197 | default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break;
198 | }
199 | }
200 |
/* Pick the user-supplied attribute struct when `mask` is set in `userFlag`,
 * otherwise fall back to the queried one (which may be NULL). */
#define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? (userAttr) : (attr))

/*
 * Build a human-readable description of a QP state transition into `msg`
 * (at most msgLen bytes): device name and port, current and requested state,
 * local GID index, local GID and remote GID. Fields that cannot be determined
 * are printed as -1 or "N/A".
 *
 * Port and address information come from `userAttr` when the matching bit is
 * set in `userFlag`, otherwise from a fresh ibv_query_qp() of `qp`.
 */
static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) {
  ncclResult_t res;
  int portNum = -1, gidIndex = -1;
  char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN];
  const char *localGidRes = NULL, *remoteGidRes = NULL;

  char nextState[32], currState[32];
  ibvQpStateName(qp->state, currState, sizeof(currState));
  ibvQpStateName(qpState, nextState, sizeof(nextState));
  char devName[IBV_SYSFS_NAME_MAX] = "";
  snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A");

  /* Query what the QP currently holds; on failure qpAttr stays NULL and only
   * user-provided attributes can be used below. */
  struct ibv_qp_attr attr;
  struct ibv_qp_init_attr init_attr;
  int attr_mask = IBV_QP_PORT | IBV_QP_AV;
  res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr);
  struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL;

  // port info, portAttr can be NULL if not given by the user and query_qp failed
  struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT);
  portNum = portAttr ? portAttr->port_num : -1;

  // address info, avAttr can be NULL if not given by the user and query_qp failed
  struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV);
  if (avAttr && avAttr->ah_attr.is_global) {
    union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid;
    remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName));
    // we need pd->context to retrieve local GID, skip if not there
    if (!qp->pd->context) goto print;
    gidIndex = avAttr->ah_attr.grh.sgid_index;
    union ibv_gid localGid;
    /* A failed GID query jumps to the print label, so the local GID is
     * reported as "N/A". */
    NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print);
    localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName));
  }
print:
  snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s",
           devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? remoteGidName : "N/A");
  return;
}
242 |
243 | ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) {
244 | char qpMsg[1024];
245 | int ret = 0, attempts = 0;
246 | int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1
247 | int timeOut = (int)ncclParamIbMQpRetryTimeout();
248 | do {
249 | if (attempts > 0) {
250 | unsigned int sleepTime = timeOut * attempts;
251 | ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
252 | INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret), qpMsg, attempts, maxCnt, sleepTime);
253 | // sleep before retrying
254 | struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)};
255 | nanosleep(&tv, NULL);
256 | }
257 | ret = ibv_modify_qp(qp, attr, attr_mask);
258 | attempts++;
259 | } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt);
260 | if (ret != 0) {
261 | ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
262 | WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret), qpMsg);
263 | return ncclSystemError;
264 | }
265 | return ncclSuccess;
266 | }
267 |
/*
 * Posts send work requests by calling the post_send entry of the QP's
 * context ops table directly. A non-zero return is treated as an errno code
 * and maps to ncclSystemError.
 */
ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
  IBV_INT_CHECK_RET_ERRNO(qp->context->ops.post_send(qp, wr, bad_wr), 0, "ibv_post_send");
}
271 |
272 | ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
273 | IBV_INT_CHECK_RET_ERRNO(qp->context->ops.post_recv(qp, wr, bad_wr), 0, "ibv_post_recv");
274 | return ncclSuccess;
275 | }
276 |
/*
 * Queries the ECE options of `qp`. ibv_query_ece() returns 0 on success, or
 * the value of errno on failure (which indicates the failure reason).
 *
 * *supported is set to 0 when the call reports ENOTSUP/EOPNOTSUPP or when the
 * symbol is missing at build time (both return ncclSuccess), and to 1
 * otherwise. Any other failure returns ncclSystemError.
 */
ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) {
#if HAVE_DECL_IBV_QUERY_ECE
  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_query_ece(qp, ece), 0, "ibv_query_ece", supported);
#else
  INFO(NCCL_NET, "Call to ibv_query_ece is skipped, doesn't exist");
  *supported = 0;
  return ncclSuccess;
#endif
}
286 |
/*
 * Applies ECE options to `qp`. ibv_set_ece() returns 0 on success, or the
 * value of errno on failure (which indicates the failure reason).
 *
 * *supported is set to 0 when the call reports ENOTSUP/EOPNOTSUPP or when the
 * symbol is missing at build time (both return ncclSuccess), and to 1
 * otherwise. Any other failure returns ncclSystemError.
 */
ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) {
#if HAVE_DECL_IBV_SET_ECE
  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_set_ece(qp, ece), 0, "ibv_set_ece", supported);
#else
  INFO(NCCL_NET, "Call to ibv_set_ece skipped, doesn't exist");
  *supported = 0;
  return ncclSuccess;
#endif
}
296 |
297 | ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
298 | *ret = (char *) ibv_event_type_str(event);
299 | return ncclSuccess;
300 | }
301 |
--------------------------------------------------------------------------------
/src/param.c:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #include "param.h"
10 | #include "debug.h"
11 |
12 | //#include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 |
22 | #define MIN(a, b) ((a)<(b)?(a):(b))
/*
 * Look up the home directory of the calling process' real user ID in the
 * password database.
 *
 * Returns the pw_dir field of the entry returned by getpwuid(), or NULL when
 * no entry is found. The string belongs to getpwuid()'s result and must not
 * be freed by the caller.
 */
const char* userHomeDir() {
  struct passwd *entry = getpwuid(getuid());

  if (entry == NULL) {
    return NULL;
  }
  return entry->pw_dir;
}
27 |
/*
 * Load NAME=VALUE pairs from a configuration file into the environment.
 *
 * - A missing or unreadable file is silently ignored.
 * - Lines whose first character is '#' are comments.
 * - Lines without '=' (or with an empty name) are skipped.
 * - The name is everything before the first '='; the value is the rest of
 *   the line, which may itself contain '='.
 * - Variables already present in the environment are NOT overridden
 *   (setenv() is called with overwrite = 0).
 *
 * The line is split in place, so neither names nor values are limited to a
 * fixed buffer size (previously both were silently truncated at 1023 bytes).
 */
void setEnvFile(const char* fileName) {
  FILE *file = fopen(fileName, "r");
  if (file == NULL) return;

  char *line = NULL;
  size_t capacity = 0;
  ssize_t length;
  while ((length = getline(&line, &capacity, file)) != -1) {
    if (line[0] == '#') continue;
    if (length > 0 && line[length-1] == '\n') line[length-1] = '\0';

    char *separator = strchr(line, '=');
    /* No '=' at all, or nothing before it: not a usable assignment. */
    if (separator == NULL || separator == line) continue;

    *separator = '\0';
    setenv(line, separator + 1, 0);
  }
  free(line);
  fclose(file);
}
54 |
/*
 * One-time environment bootstrap: load defaults from configuration files.
 *
 * Order of evaluation:
 *   1. the file named by NCCL_CONF_FILE if that variable is set and
 *      non-empty, otherwise "<home>/.nccl.conf" when a home directory can be
 *      determined;
 *   2. "/etc/nccl.conf".
 * Because setEnvFile() never overrides existing variables, earlier sources
 * take precedence over later ones and the process environment over both.
 *
 * Paths are handed to setEnvFile() at full length; previously they were
 * copied into a 1024-byte buffer and silently truncated, which could make
 * the loader open a different file than the one requested.
 */
static void initEnvFunc() {
  static const char homeConfSuffix[] = "/.nccl.conf";
  const char* userFile = getenv("NCCL_CONF_FILE");

  if (userFile && strlen(userFile) > 0) {
    setEnvFile(userFile);
  } else {
    const char* userDir = userHomeDir();
    if (userDir) {
      /* sizeof(homeConfSuffix) already accounts for the terminating NUL. */
      size_t pathLen = strlen(userDir) + sizeof(homeConfSuffix);
      char *confFilePath = malloc(pathLen);
      if (confFilePath != NULL) {
        snprintf(confFilePath, pathLen, "%s%s", userDir, homeConfSuffix);
        setEnvFile(confFilePath);
        free(confFilePath);
      }
    }
  }
  setEnvFile("/etc/nccl.conf");
}
71 |
/* Runs initEnvFunc() exactly once per process via pthread_once(); later calls are no-ops. */
void initEnv() {
  static pthread_once_t once = PTHREAD_ONCE_INIT;
  pthread_once(&once, initEnvFunc);
}
76 |
77 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
78 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
79 | pthread_mutex_lock(&mutex);
80 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
81 | const char* str = ncclGetEnv(env);
82 | int64_t value = deftVal;
83 | if (str && strlen(str) > 0) {
84 | errno = 0;
85 | value = strtoll(str, NULL, 0);
86 | if (errno) {
87 | value = deftVal;
88 | INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
89 | } else {
90 | INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value);
91 | }
92 | }
93 | __atomic_store_n(cache, value, __ATOMIC_RELAXED);
94 | }
95 | pthread_mutex_unlock(&mutex);
96 | }
97 |
/*
 * getenv() wrapper that first makes sure the configuration files have been
 * loaded into the environment (initEnv() runs its work only once).
 */
const char* ncclGetEnv(const char* name) {
  initEnv();
  return getenv(name);
}
102 |
--------------------------------------------------------------------------------
/src/ucx_uct_plugin.c:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3 | * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | * SPDX-License-Identifier: BSD-3-Clause
5 | *
6 | * See LICENSE.txt for license information
7 | ************************************************************************/
8 |
9 | #include "ucx_uct_lib.h"
10 |
/*
 * Negative markers stored in nccl_uct_req_t.size to tag requests that are not
 * sends (send requests carry a positive size instead).
 */
typedef enum {
  NCCL_UCT_REQ_IRECV = -1,
  NCCL_UCT_REQ_IFLUSH = -2
} nccl_uct_request_type_t;
15 |
16 | struct nccl_uct_rdesc;
17 |
/*
 * On-the-wire descriptor of a posted receive request entry.
 * Filled per receive buffer by nccl_uct_rdesc_set(): tag, size, buffer
 * address and the remote key taken from the buffer's memory handle.
 * `matched` is initialised to 0 there.
 */
typedef struct {
  int tag;
  int size;
  void *data;
  int matched;
  uct_rkey_t rkey;
} nccl_uct_chunk_t;
26 |
/*
 * On-the-wire descriptor of a receive request containing many chunks.
 * `size` is the total byte size of the header plus `count` chunks (see
 * nccl_uct_rdesc_size()); `chunk` is a flexible array member.
 */
typedef struct {
  uint64_t id;
  uint16_t count;
  uint32_t size;
  struct nccl_uct_rdesc *peer_rdesc; /* Acts as a cookie along with id */
  nccl_uct_chunk_t chunk[];
} nccl_uct_rdesc_hdr_t;
35 |
/*
 * On-the-wire descriptor for receive request completion.
 * `id` and `rdesc` echo the id / peer_rdesc cookie of the descriptor being
 * completed; only the first `count` entries of `sizes` are meaningful.
 */
typedef struct {
  uint64_t id;
  struct nccl_uct_rdesc *rdesc;
  int count; /* Number of sizes contained */
  int sizes[NCCL_UCX_UCT_MAX_RECVS];
} nccl_uct_atp_t;
43 |
/*
 * NCCL local request handler to progress:
 * - size -1 for multi receive (NCCL_UCT_REQ_IRECV)
 * - size -2 for flush (NCCL_UCT_REQ_IFLUSH)
 * - size > 0 for send
 *
 * `rdesc` points back to the descriptor whose reqs[] array holds this request.
 */
typedef struct {
  /* Pending GET (iflush) PUT (isend) or receiving one ATP (irecv) */
  uct_completion_t completion;
  int size;
  struct nccl_uct_rdesc *rdesc;
} nccl_uct_req_t;
56 |
/*
 * Pending receive descriptor either on the receive or sending side.
 *
 * The anonymous union is used as a list link while the descriptor sits on a
 * communicator's rdesc_list, and as a singly linked `next` pointer while it
 * is parked on the communicator's free list.
 *
 * NOTE(review): `storage` directly follows `desc`, whose last member is the
 * flexible array `chunk[]`; it appears to provide the backing memory for
 * desc.chunk[0..NCCL_UCX_UCT_MAX_RECVS-1], which is why it must not be
 * accessed directly.
 */
typedef struct nccl_uct_rdesc {
  int nccl_usage; /* NCCL requests not finished/started */
  int send_atp; /* >1 pending isend, ==1 pending atp send */

  union {
    ucs_list_link_t list; /* comm's linked list */
    struct nccl_uct_rdesc *next; /* inserted in free list */
  };

  struct nccl_uct_wr_comm *comm;
  nccl_uct_rdesc_hdr_t desc;
  nccl_uct_chunk_t storage[NCCL_UCX_UCT_MAX_RECVS]; /* Don't use directly */
  nccl_uct_req_t reqs[NCCL_UCX_UCT_MAX_RECVS]; /* NCCL requests */
  int sizes[NCCL_UCX_UCT_MAX_RECVS]; /* ATP received sizes */
} nccl_uct_rdesc_t;
73 |
/*
 * Communicator used by this plugin: extends the generic nccl_uct_comm_t
 * (`base`, recovered with nccl_uct_wr_comm_get()) with receive-descriptor
 * bookkeeping.
 */
typedef struct nccl_uct_wr_comm {
  nccl_uct_comm_t base;

  int rdesc_alloc; /* Track allocated rdescs */
  nccl_uct_rdesc_t *free_rdesc; /* Available rdesc for reuse */
  uint64_t rdesc_id; /* Next sequence number to use */

  /* Received RTRs: used by Sender communicator in ->isend() */
  ucs_list_link_t rdesc_list;

} nccl_uct_wr_comm_t;
85 |
/* Recover the enclosing nccl_uct_wr_comm_t from a pointer to its `base` member. */
static inline nccl_uct_wr_comm_t *
nccl_uct_wr_comm_get(nccl_uct_comm_t *base_comm) {
  return ucs_container_of(base_comm, nccl_uct_wr_comm_t, base);
}
90 |
91 | static nccl_uct_rdesc_t *nccl_uct_comm_rdesc_get(nccl_uct_wr_comm_t *comm) {
92 | nccl_uct_rdesc_t *rdesc = comm->free_rdesc;
93 |
94 | if (rdesc == NULL) {
95 | rdesc = calloc(1, sizeof(*rdesc));
96 | } else {
97 | comm->free_rdesc = rdesc->next;
98 | }
99 |
100 | rdesc->next = NULL;
101 | rdesc->comm = comm;
102 | comm->rdesc_alloc++;
103 | return rdesc;
104 | }
105 |
106 | static size_t nccl_uct_rdesc_size(int n) {
107 | return n * sizeof(nccl_uct_chunk_t) + sizeof(nccl_uct_rdesc_hdr_t);
108 | }
109 |
110 | /* Prepare a receive descriptor from irecv()/iflush() side */
111 | static void nccl_uct_rdesc_set(nccl_uct_rdesc_t *rdesc, uint64_t id, int n,
112 | void **data, size_t *sizes, int *tags,
113 | nccl_uct_memh_t **uct_memh) {
114 | nccl_uct_rdesc_hdr_t *desc = &rdesc->desc;
115 | int i;
116 |
117 | /* Populate header */
118 | desc->id = id;
119 | desc->count = n;
120 | desc->size = nccl_uct_rdesc_size(n);
121 | desc->peer_rdesc = rdesc; /* cookie, will be returned in ATP */
122 |
123 | /* Ref count that prevents NCCL from releasing memory */
124 | rdesc->nccl_usage = 1;
125 | rdesc->send_atp = 0;
126 |
127 | /* Zero (iflush) or one or many receive request are contained */
128 | for (i = 0; i < n; i++) {
129 | desc->chunk[i].tag = tags[i];
130 | desc->chunk[i].size = sizes[i];
131 | desc->chunk[i].data = data[i];
132 | desc->chunk[i].matched = 0;
133 | desc->chunk[i].rkey = uct_memh[i]->bundle.rkey;
134 | }
135 | }
136 |
137 | static nccl_uct_req_t *nccl_uct_rdesc_get_req(nccl_uct_rdesc_t *rdesc, int i,
138 | int size) {
139 | nccl_uct_req_t *req;
140 |
141 | assert(i < NCCL_UCX_UCT_MAX_RECVS);
142 |
143 | req = &rdesc->reqs[i];
144 | req->size = size;
145 | req->rdesc = rdesc;
146 |
147 | req->completion.func = nccl_uct_empty_callback;
148 | req->completion.count = 1;
149 | req->completion.status = UCS_OK;
150 |
151 | return &rdesc->reqs[i];
152 | }
153 |
154 | static void nccl_uct_comm_rdesc_put(nccl_uct_rdesc_t *rdesc) {
155 | nccl_uct_wr_comm_t *comm = rdesc->comm;
156 |
157 | assert(comm != NULL);
158 |
159 | rdesc->desc.id = -1;
160 | rdesc->comm = NULL;
161 | rdesc->next = comm->free_rdesc;
162 | comm->free_rdesc = rdesc;
163 | comm->rdesc_alloc--;
164 | }
165 |
/*
 * On receiver side, after ->irecv(), expect corresponding ATP.
 *
 * Active-message handler: `data` starts with an 8-byte communicator pointer
 * followed by an nccl_uct_atp_t. The ATP's rdesc cookie is dereferenced
 * directly; consistency with the local descriptor (communicator, id, count,
 * pending completion) is verified with assert() only.
 *
 * Completes the first request of the descriptor (completion.count goes from
 * 1 to 0) and copies the `count` received sizes into rdesc->sizes.
 * `arg` and `flags` are unused. Always returns UCS_OK.
 */
static ucs_status_t nccl_uct_atp_callback(void *arg, void *data, size_t length,
                                          unsigned flags) {
  nccl_uct_atp_t *atp = (nccl_uct_atp_t*)((uint8_t*)data + 8);

  assert(length == (sizeof(*atp) + 8));
  assert(*(nccl_uct_comm_t**)data == &atp->rdesc->comm->base);
  assert(atp->id == atp->rdesc->desc.id);
  assert(atp->count == atp->rdesc->desc.count);
  assert(atp->rdesc->reqs[0].completion.count == 1);

  atp->rdesc->reqs[0].completion.count--;
  memcpy(atp->rdesc->sizes, atp->sizes, atp->count * sizeof(*atp->sizes));
  return UCS_OK;
}
181 |
/*
 * On sender side, asynchronously receive rdesc/RTR, later used by ->isend().
 *
 * Active-message handler: `data` starts with an 8-byte communicator pointer
 * followed by an nccl_uct_rdesc_hdr_t and its chunks. A local descriptor is
 * taken from the communicator, appended to its rdesc_list and filled with a
 * copy of the received header and chunks (desc->size bytes).
 *
 * nccl_usage is set to the chunk count and send_atp to count + 1.
 * The message length and size are validated with assert() only.
 * `arg` and `flags` are unused.
 */
static ucs_status_t nccl_uct_rtr_callback(void *arg, void *data, size_t length,
                                          unsigned flags) {
  nccl_uct_comm_t *base_comm = *(nccl_uct_comm_t **)data;
  nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(base_comm);
  nccl_uct_rdesc_hdr_t *desc = (nccl_uct_rdesc_hdr_t*)((uint8_t*)data + 8);
  size_t size = desc->size;
  nccl_uct_rdesc_t *rdesc;

  rdesc = nccl_uct_comm_rdesc_get(comm);
  if (rdesc == NULL) {
    WARN("Failed to get an rdesc in RTR callback");
    return UCS_ERR_NO_MEMORY; /* Cannot happen */
  }

  ucs_list_add_tail(&comm->rdesc_list, &rdesc->list);

  assert((size + 8) == length);
  assert(size == nccl_uct_rdesc_size(desc->count));

  memcpy(&rdesc->desc, desc, size);
  rdesc->nccl_usage = desc->count;
  rdesc->send_atp = desc->count + 1;
  return UCS_OK;
}
207 |
/*
 * Install this plugin's active-message handlers on `uct_iface`:
 * NCCL_UCT_AM_RTR -> nccl_uct_rtr_callback, NCCL_UCT_AM_ATP ->
 * nccl_uct_atp_callback. The first failing registration is propagated by
 * NCCLCHECK.
 */
static ncclResult_t nccl_uct_wr_iface_set(nccl_uct_iface_t *uct_iface) {
  NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_RTR,
                                       nccl_uct_rtr_callback));
  NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_ATP,
                                       nccl_uct_atp_callback));
  return ncclSuccess;
}
215 |
216 | static ncclResult_t nccl_uct_wr_comm_alloc(nccl_uct_comm_t **comm_p) {
217 | nccl_uct_wr_comm_t *comm = calloc(1, sizeof(nccl_uct_wr_comm_t));
218 | if (comm != NULL) {
219 | *comm_p = &comm->base;
220 | return ncclSuccess;
221 | }
222 |
223 | return ncclSystemError;
224 | }
225 |
226 | static ncclResult_t nccl_uct_wr_comm_init(nccl_uct_comm_t *base_comm,
227 | nccl_uct_context_t *context,
228 | nccl_uct_worker_t *worker, int dev,
229 | const nccl_uct_comm_t *remote_comm) {
230 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(base_comm);
231 |
232 | ucs_list_head_init(&comm->rdesc_list);
233 | return nccl_uct_comm_init(&comm->base, context, worker, dev, remote_comm);
234 | }
235 |
236 | static ncclResult_t nccl_uct_wr_init(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
237 | context.ops.comm_alloc = nccl_uct_wr_comm_alloc;
238 | context.ops.comm_init = nccl_uct_wr_comm_init;
239 | context.ops.iface_set = nccl_uct_wr_iface_set;
240 | context.am_short_size = nccl_uct_rdesc_size(NCCL_UCX_UCT_MAX_RECVS);
241 | context.rkey_size = sizeof(((nccl_uct_chunk_t*)0)->rkey);
242 |
243 | return nccl_p2p_ib_init(&context.dev_count, &context.merge_dev_count, ncclIbDevs, context.if_name,
244 | &context.if_addr, NULL, logFunction);
245 | }
246 |
247 | /* Outcome is either send_atp equal to 1 or 0 */
248 | static void nccl_uct_send_atp(nccl_uct_wr_comm_t *comm,
249 | nccl_uct_rdesc_t *rdesc) {
250 | ucs_status_t status;
251 | nccl_uct_atp_t atp;
252 | int i;
253 |
254 | assert(rdesc->send_atp == 1);
255 |
256 | status = uct_ep_fence(comm->base.uct_ep->ep, 0);
257 | if (status != UCS_OK) {
258 | return;
259 | }
260 |
261 | atp.id = rdesc->desc.id;
262 | atp.rdesc = rdesc->desc.peer_rdesc;
263 | atp.count = rdesc->desc.count;
264 |
265 | /* Sizes from isend() are lower or equal to their irecv() side */
266 | for (i = 0; i < rdesc->desc.count; i++) {
267 | atp.sizes[i] = rdesc->reqs[i].size;
268 | }
269 |
270 | status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_ATP,
271 | (uint64_t)comm->base.remote.comm, &atp, sizeof(atp));
272 | if (status == UCS_OK) {
273 | rdesc->send_atp = 0;
274 | }
275 | }
276 |
277 | static ncclResult_t nccl_uct_send(nccl_uct_wr_comm_t *comm, void *data,
278 | int size, nccl_uct_memh_t *uct_memh,
279 | nccl_uct_rdesc_t *rdesc, int i,
280 | void **request) {
281 | ucs_status_t status;
282 | uct_iov_t iov;
283 | nccl_uct_req_t *req;
284 |
285 | *request = NULL;
286 |
287 | /* Details for local data */
288 | iov.buffer = data;
289 | iov.length = size;
290 | iov.memh = uct_memh->memh;
291 | iov.stride = iov.length;
292 | iov.count = 1;
293 |
294 | assert(size <= rdesc->desc.chunk[i].size);
295 |
296 | req = nccl_uct_rdesc_get_req(rdesc, i, size); /* NCCL request */
297 |
298 | status = uct_ep_put_zcopy(comm->base.uct_ep->ep, &iov, 1,
299 | (uint64_t)rdesc->desc.chunk[i].data,
300 | rdesc->desc.chunk[i].rkey, &req->completion);
301 |
302 | if (status == UCS_OK) {
303 | req->completion.count--;
304 | } else if (status != UCS_INPROGRESS) {
305 | return ncclSuccess;
306 | }
307 |
308 | rdesc->desc.chunk[i].matched = 1;
309 | --rdesc->send_atp;
310 |
311 | if (rdesc->send_atp == 1) {
312 | ucs_list_del(&rdesc->list); /* all ->isend() were now matched */
313 | nccl_uct_send_atp(comm, rdesc);
314 | }
315 |
316 | *request = req;
317 | return ncclSuccess;
318 | }
319 |
320 | static ncclResult_t nccl_uct_wr_isend(void *send_comm, void *data, size_t size,
321 | int tag, void *mhandle, void* phandle, void **request) {
322 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(send_comm);
323 | nccl_uct_rdesc_t *rdesc;
324 | int i;
325 |
326 | *request = NULL;
327 |
328 | ucs_list_for_each(rdesc, &comm->rdesc_list, list) {
329 | for (i = 0; i < rdesc->desc.count; i++) {
330 | if (rdesc->desc.chunk[i].matched || (rdesc->desc.chunk[i].tag != tag)) {
331 | continue;
332 | }
333 |
334 | return nccl_uct_send(comm, data, size, mhandle, rdesc, i, request);
335 | }
336 | }
337 |
338 | /* Progress here to make sure we receive non-solicited RTRs */
339 | uct_worker_progress(comm->base.uct_worker->worker);
340 | return ncclSuccess;
341 | }
342 |
343 | static ncclResult_t nccl_uct_wr_irecv(void *recv_comm, int n, void **data,
344 | size_t *sizes, int *tags, void **mhandles,
345 | void** phandles, void **request) {
346 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(recv_comm);
347 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandles;
348 | nccl_uct_rdesc_t *rdesc;
349 | ucs_status_t status;
350 |
351 | assert(n <= NCCL_UCX_UCT_MAX_RECVS);
352 |
353 | rdesc = nccl_uct_comm_rdesc_get(comm);
354 | if (rdesc == NULL) {
355 | return ncclInternalError;
356 | }
357 |
358 | nccl_uct_rdesc_set(rdesc, comm->rdesc_id++, n, data, sizes, tags, uct_memh);
359 |
360 | status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_RTR,
361 | (uint64_t)comm->base.remote.comm, &rdesc->desc,
362 | nccl_uct_rdesc_size(n));
363 | if (status != UCS_OK) {
364 | nccl_uct_comm_rdesc_put(rdesc);
365 | *request = NULL;
366 | } else {
367 | /* Wait for receiving ATP */
368 | *request = nccl_uct_rdesc_get_req(rdesc, 0, NCCL_UCT_REQ_IRECV);
369 | }
370 |
371 | return ncclSuccess;
372 | }
373 |
374 | static ncclResult_t nccl_uct_wr_iflush(void *recv_comm, int n, void **data,
375 | int *sizes, void **mhandle,
376 | void **request) {
377 | nccl_uct_comm_t *base_comm = recv_comm;
378 | int last = nccl_uct_flush_index(base_comm, sizes, n);
379 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandle;
380 | nccl_uct_rdesc_t *rdesc;
381 | nccl_uct_req_t *req;
382 | ncclResult_t result;
383 |
384 | if (last == -1) {
385 | return ncclSuccess;
386 | }
387 |
388 | rdesc = nccl_uct_comm_rdesc_get(nccl_uct_wr_comm_get(base_comm));
389 | if (rdesc == NULL) {
390 | return ncclInternalError;
391 | }
392 |
393 | nccl_uct_rdesc_set(rdesc, ~0, 0, NULL, NULL, NULL, NULL);
394 | /* Wait for local GET completion */
395 | req = nccl_uct_rdesc_get_req(rdesc, 0, NCCL_UCT_REQ_IFLUSH);
396 | *request = req;
397 |
398 | result = nccl_uct_flush(base_comm, data[last], sizes[last], uct_memh[last],
399 | &req->completion, request);
400 | if (*request == NULL) {
401 | nccl_uct_comm_rdesc_put(rdesc);
402 | }
403 |
404 | return result;
405 | }
406 |
407 | static ncclResult_t nccl_uct_wr_test(void *request, int *done, int *sizes) {
408 | nccl_uct_req_t *req = request;
409 | nccl_uct_rdesc_t *rdesc = req->rdesc;
410 | nccl_uct_wr_comm_t *comm = rdesc->comm;
411 |
412 | uct_worker_progress(comm->base.uct_worker->worker);
413 |
414 | *done = 0;
415 |
416 | if (rdesc->send_atp == 1) {
417 | /* Slowpath */
418 | nccl_uct_send_atp(comm, rdesc);
419 |
420 | if (rdesc->send_atp && rdesc->nccl_usage == 1) {
421 | /* Keep the last isend request until ATP is out */
422 | return ncclSuccess;
423 | }
424 | }
425 |
426 | if (req->completion.count > 0) {
427 | return ncclSuccess;
428 | }
429 |
430 | *done = 1;
431 |
432 | if (req->size == NCCL_UCT_REQ_IRECV) {
433 | assert(&rdesc->reqs[0] == req);
434 | if (sizes != NULL) {
435 | memcpy(sizes, rdesc->sizes, rdesc->desc.count * sizeof(*sizes));
436 | }
437 | } else if (req->size == NCCL_UCT_REQ_IFLUSH) {
438 | assert(&rdesc->reqs[0] == req);
439 | } else {
440 | /* ->isend() request */
441 | assert(req->size > -1);
442 | if (sizes != NULL) {
443 | sizes[0] = req->size;
444 | }
445 | }
446 |
447 | if (--rdesc->nccl_usage < 1) {
448 | assert(rdesc->send_atp == 0);
449 | assert(rdesc->nccl_usage == 0);
450 | nccl_uct_comm_rdesc_put(rdesc);
451 | }
452 |
453 | return ncclSuccess;
454 | }
455 |
456 | static ncclResult_t nccl_uct_wr_close(void *close_comm) {
457 | nccl_uct_wr_comm_t *comm = nccl_uct_wr_comm_get(close_comm);
458 | nccl_uct_rdesc_t *rdesc;
459 |
460 | nccl_uct_comm_deinit(close_comm);
461 |
462 | while ((rdesc = comm->free_rdesc) != NULL) {
463 | comm->free_rdesc = rdesc->next;
464 | free(rdesc);
465 | }
466 |
467 | assert(ucs_list_is_empty(&comm->rdesc_list));
468 | assert(comm->rdesc_alloc == 0);
469 | free(comm);
470 | return ncclSuccess;
471 | }
472 |
473 |
474 | static ncclResult_t nccl_uct_wr_init_v9(ncclDebugLogger_t logFunction) {
475 | return nccl_uct_wr_init(logFunction, NULL);
476 | }
477 |
478 | static ncclResult_t nccl_uct_wr_isend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
479 | return nccl_uct_wr_isend(sendComm, data, size, tag, mhandle, NULL, request);
480 | }
481 |
482 | static ncclResult_t nccl_uct_wr_irecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
483 | return nccl_uct_wr_irecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
484 | }
485 |
486 | static ncclResult_t nccl_uct_wr_isend_v8(void *send_comm, void *data, int size,
487 | int tag, void *mhandle, void **request) {
488 | return nccl_uct_wr_isend_v9(send_comm, data, (size_t)size, tag, mhandle, request);
489 | }
490 |
491 | static ncclResult_t nccl_uct_wr_irecv_v8(void *recv_comm, int n, void **data,
492 | int *sizes, int *tags, void **mhandles,
493 | void **request) {
494 | size_t sizes_sizet[NCCL_NET_IB_MAX_RECVS];
495 | for (int i=0; isend_rts == 0);
83 | assert(req->rts_count == req->count);
84 | assert(req->completion.count == 1);
85 |
86 | status = uct_ep_am_short(req->comm->base.uct_ep->ep, NCCL_UCT_AM_ATS,
87 | (uint64_t)req->comm->base.remote.comm,
88 | req->remote_req,
89 | sizeof(*req->remote_req) * req->rts_count);
90 | if (status == UCS_OK) {
91 | req->completion.count--;
92 | }
93 | }
94 |
95 | static void nccl_uct_rd_pending_add(nccl_uct_rd_comm_t *comm,
96 | nccl_uct_mem_t *src, nccl_uct_mem_t *dst) {
97 | nccl_uct_rd_req_t *req = dst->req;
98 | nccl_uct_get_param_t *param;
99 |
100 | assert(src->size <= dst->size);
101 | assert(req->rts_count < NCCL_UCX_UCT_MAX_RECVS);
102 |
103 | req->sizes[dst->index] = src->size;
104 | req->remote_req[req->rts_count++] = src->req; /* src->req is a cookie */
105 |
106 | if (src->size == 0) {
107 | req->completion.count--;
108 | return;
109 | }
110 |
111 | param = &comm->pending.param[comm->pending.last & NCCL_UCT_PENDING_MASK];
112 | comm->pending.last++;
113 |
114 | assert((comm->pending.first & NCCL_UCT_PENDING_MASK) !=
115 | (comm->pending.last & NCCL_UCT_PENDING_MASK));
116 |
117 | param->iov.buffer = dst->data;
118 | param->iov.length = src->size;
119 | param->iov.memh = dst->u.uct_memh->memh;
120 | param->iov.stride = 0;
121 | param->iov.count = 1;
122 | param->rva = (uint64_t)src->data;
123 | param->rkey = src->u.rkey;
124 | param->req = req;
125 | }
126 |
127 | static void nccl_uct_rd_pending_drain(nccl_uct_rd_comm_t *comm) {
128 | ucs_status_t status;
129 | nccl_uct_get_param_t *param;
130 |
131 | for (; comm->pending.first != comm->pending.last; comm->pending.first++) {
132 | param = &comm->pending.param[comm->pending.first & NCCL_UCT_PENDING_MASK];
133 |
134 | status = uct_ep_get_zcopy(comm->base.uct_ep->ep, ¶m->iov, 1, param->rva,
135 | param->rkey, ¶m->req->completion);
136 | if (status == UCS_OK) {
137 | param->req->completion.count--;
138 | } else if (status != UCS_INPROGRESS) {
139 | break;
140 | }
141 |
142 | if (param->req->completion.count == 1) {
143 | nccl_uct_rd_send_ats(param->req);
144 | }
145 | }
146 | }
147 |
148 | static ucs_status_t nccl_uct_rd_ats_callback(void *arg, void *data,
149 | size_t length, unsigned flags) {
150 | nccl_uct_rd_req_t **req = (nccl_uct_rd_req_t **)((uint8_t *)data + 8);
151 | nccl_uct_rd_req_t **end = (nccl_uct_rd_req_t **)((uint8_t *)data + length);
152 |
153 | for (; req + 1 <= end; req++) {
154 | assert((*req)->completion.count == 1);
155 | assert((*req)->comm == nccl_uct_rd_comm_get(*(nccl_uct_comm_t**)data));
156 |
157 | (*req)->completion.count = 0;
158 | }
159 |
160 | assert(req == end);
161 | return UCS_OK;
162 | }
163 |
164 | static ucs_status_t nccl_uct_rd_rts_callback(void *arg, void *data,
165 | size_t length, unsigned flags) {
166 |
167 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(*(nccl_uct_comm_t**)data);
168 | nccl_uct_mem_t *rts = (nccl_uct_mem_t *)((uint8_t *)data + 8);
169 | nccl_uct_ring_t *exp;
170 | nccl_uct_mem_t *dst;
171 | unsigned i;
172 |
173 | assert(length == (sizeof(*rts) + 8));
174 |
175 | /* Do we already expect it? */
176 | exp = &comm->exp;
177 | i = nccl_uct_ring_find(exp, rts->tag);
178 | if (i == exp->last) {
179 | nccl_uct_ring_append(&comm->unexp, rts->tag, rts, sizeof(*rts));
180 | } else {
181 | /* Receive request was already posted */
182 | dst = nccl_uct_ring_get_entry(exp, i);
183 | nccl_uct_rd_pending_add(comm, rts, dst);
184 | nccl_uct_ring_consume(exp, i);
185 | }
186 |
187 | return UCS_OK;
188 | }
189 |
190 | static ncclResult_t nccl_uct_rd_iface_set(nccl_uct_iface_t *uct_iface) {
191 | NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_RTS,
192 | nccl_uct_rd_rts_callback));
193 | NCCLCHECK(nccl_uct_iface_set_handler(uct_iface, NCCL_UCT_AM_ATS,
194 | nccl_uct_rd_ats_callback));
195 | return ncclSuccess;
196 | }
197 |
198 | static ncclResult_t nccl_uct_rd_comm_alloc(nccl_uct_comm_t **comm_p) {
199 | nccl_uct_rd_comm_t *comm = calloc(1, sizeof(*comm));
200 | if (comm != NULL) {
201 | *comm_p = &comm->base;
202 | return ncclSuccess;
203 | }
204 |
205 | return ncclSystemError;
206 | }
207 |
208 | static ncclResult_t nccl_uct_rd_comm_init(nccl_uct_comm_t *base_comm,
209 | nccl_uct_context_t *context,
210 | nccl_uct_worker_t *worker, int dev,
211 | const nccl_uct_comm_t *remote_comm) {
212 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(base_comm);
213 |
214 | comm->pending.first = 0;
215 | comm->pending.last = 0;
216 | comm->req_count = 0;
217 | comm->free_req = NULL;
218 |
219 | NCCLCHECK(nccl_uct_ring_init(&comm->exp, sizeof(nccl_uct_mem_t)));
220 | NCCLCHECK(nccl_uct_ring_init(&comm->unexp, sizeof(nccl_uct_mem_t)));
221 |
222 | return nccl_uct_comm_init(&comm->base, context, worker, dev, remote_comm);
223 | }
224 |
225 | static ncclResult_t nccl_uct_rd_init(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
226 | NCCL_STATIC_ASSERT(NCCL_UCT_RING_SIZE >= 2 * MAX_REQUESTS,
227 | "Cannot handle expected/unexpected requests");
228 | NCCL_STATIC_ASSERT(NCCL_UCT_PENDING_SIZE > MAX_REQUESTS,
229 | "Cannot handle enough pending requests");
230 |
231 | context.ops.comm_alloc = nccl_uct_rd_comm_alloc;
232 | context.ops.comm_init = nccl_uct_rd_comm_init;
233 | context.ops.iface_set = nccl_uct_rd_iface_set;
234 | context.rkey_size = sizeof(((nccl_uct_mem_t*)0)->u.rkey);
235 | context.am_short_size = sizeof(((nccl_uct_rd_req_t*)0)->remote_req);
236 | if (sizeof(nccl_uct_mem_t) > context.am_short_size) {
237 | context.am_short_size = sizeof(nccl_uct_mem_t);
238 | }
239 |
240 | return nccl_p2p_ib_init(&context.dev_count, &context.merge_dev_count, ncclIbDevs, context.if_name,
241 | &context.if_addr, NULL, logFunction);
242 | }
243 |
244 | static nccl_uct_rd_req_t *nccl_uct_rd_req_alloc(nccl_uct_rd_comm_t *comm,
245 | int count) {
246 | nccl_uct_rd_req_t *req = comm->free_req;
247 |
248 | if (req == NULL) {
249 | req = malloc(sizeof(*req));
250 | if (req == NULL) {
251 | return req;
252 | }
253 | } else {
254 | comm->free_req = req->next;
255 | }
256 |
257 | comm->req_count++;
258 | req->comm = comm;
259 | req->completion.func = nccl_uct_empty_callback;
260 | req->completion.count = count;
261 | req->completion.status = UCS_OK;
262 | return req;
263 | }
264 |
265 | static inline void nccl_uct_rd_req_free(nccl_uct_rd_req_t *req) {
266 | req->next = req->comm->free_req;
267 | req->comm->free_req = req;
268 | req->comm->req_count--;
269 | }
270 |
271 | static ncclResult_t nccl_uct_rd_isend(void *send_comm, void *data, size_t size,
272 | int tag, void *mhandle, void* phandle, void **request) {
273 |
274 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(send_comm);
275 | nccl_uct_memh_t *uct_memh = mhandle;
276 | nccl_uct_mem_t rts;
277 | nccl_uct_rd_req_t *req;
278 | ucs_status_t status;
279 |
280 | req = nccl_uct_rd_req_alloc(comm, 1);
281 | if (req == NULL) {
282 | *request = NULL;
283 | return ncclSuccess;
284 | }
285 |
286 | req->send_rts = 1;
287 | req->count = 1;
288 | req->sizes[0] = size;
289 | *request = req;
290 |
291 | rts.tag = tag;
292 | rts.size = size;
293 | rts.data = data;
294 | rts.u.rkey = uct_memh->bundle.rkey;
295 | rts.req = req;
296 |
297 | status = uct_ep_am_short(comm->base.uct_ep->ep, NCCL_UCT_AM_RTS,
298 | (uint64_t)comm->base.remote.comm, &rts, sizeof(rts));
299 | if (status != UCS_OK) {
300 | nccl_uct_rd_req_free(req);
301 | *request = NULL;
302 | }
303 |
304 | return ncclSuccess;
305 | }
306 |
307 | static ncclResult_t nccl_uct_rd_irecv(void *recv_comm, int n, void **data,
308 | size_t *sizes, int *tags, void **mhandles,
309 | void** phandles, void **request) {
310 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(recv_comm);
311 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandles;
312 | nccl_uct_ring_t *unexp;
313 | nccl_uct_rd_req_t *req;
314 | nccl_uct_mem_t *rts, recv;
315 | unsigned i, j;
316 |
317 | assert(n <= NCCL_UCX_UCT_MAX_RECVS);
318 |
319 | /* Create a request */
320 | req = nccl_uct_rd_req_alloc(comm, n + 1);
321 | *request = req;
322 | if (req == NULL) {
323 | return ncclSuccess;
324 | }
325 |
326 | req->send_rts = 0;
327 | req->count = n;
328 | req->rts_count = 0;
329 |
330 | /* Try to match or build expected list */
331 | for (i = 0; i < n; i++) {
332 | recv.tag = tags[i];
333 | recv.size = sizes[i];
334 | recv.data = data[i];
335 | recv.u.uct_memh = uct_memh[i];
336 | recv.req = req;
337 | recv.index = i;
338 |
339 | unexp = &comm->unexp;
340 | j = nccl_uct_ring_find(unexp, tags[i]);
341 | if (j == unexp->last) {
342 | nccl_uct_ring_append(&comm->exp, tags[i], &recv, sizeof(recv));
343 | } else {
344 | rts = nccl_uct_ring_get_entry(unexp, j);
345 | nccl_uct_rd_pending_add(comm, rts, &recv);
346 | nccl_uct_ring_consume(unexp, j);
347 | }
348 | }
349 |
350 | return ncclSuccess;
351 | }
352 |
353 | static ncclResult_t nccl_uct_rd_iflush(void *recv_comm, int n, void **data,
354 | int *sizes, void **mhandle,
355 | void **request) {
356 | ncclResult_t result = ncclSuccess;
357 | nccl_uct_comm_t *base_comm = recv_comm;
358 | nccl_uct_memh_t **uct_memh = (nccl_uct_memh_t**)mhandle;
359 | int last = nccl_uct_flush_index(base_comm, sizes, n);
360 | nccl_uct_rd_req_t *req;
361 |
362 | *request = NULL;
363 |
364 | if (last != -1) {
365 | req = nccl_uct_rd_req_alloc(nccl_uct_rd_comm_get(recv_comm), 1);
366 | if (req != NULL) {
367 | req->send_rts = -1;
368 | *request = req;
369 |
370 | result = nccl_uct_flush(base_comm, data[last], sizes[last],
371 | uct_memh[last], &req->completion, request);
372 | if (*request == NULL) {
373 | nccl_uct_rd_req_free(req);
374 | }
375 | }
376 | }
377 |
378 | return result;
379 | }
380 |
381 | static ncclResult_t nccl_uct_rd_test(void *request, int *done, int *sizes) {
382 | nccl_uct_rd_req_t *req = request;
383 |
384 | while (uct_worker_progress(req->comm->base.uct_worker->worker))
385 | ; /* empty */
386 |
387 | nccl_uct_rd_pending_drain(req->comm);
388 |
389 | if (req->completion.count > 0) {
390 | if ((req->send_rts == 0) && (req->completion.count == 1)) {
391 | nccl_uct_rd_send_ats(req);
392 | }
393 |
394 | if (req->completion.count > 0) {
395 | *done = 0;
396 | return ncclSuccess;
397 | }
398 | }
399 |
400 | if ((sizes != NULL) && (req->send_rts > -1)) {
401 | memcpy(sizes, req->sizes, req->count * sizeof(*req->sizes));
402 | }
403 |
404 | *done = 1;
405 | nccl_uct_rd_req_free(req);
406 | return ncclSuccess;
407 | }
408 |
409 | static ncclResult_t nccl_uct_rd_close(void *close_comm) {
410 | nccl_uct_rd_comm_t *comm = nccl_uct_rd_comm_get(close_comm);
411 | nccl_uct_rd_req_t *req;
412 |
413 | nccl_uct_comm_deinit(close_comm);
414 |
415 | while ((req = comm->free_req) != NULL) {
416 | comm->free_req = req->next;
417 | free(req);
418 | }
419 |
420 | assert(nccl_uct_ring_is_empty(&comm->exp));
421 | assert(nccl_uct_ring_is_empty(&comm->unexp));
422 | assert(comm->req_count == 0);
423 | assert(comm->pending.first == comm->pending.last);
424 |
425 | nccl_uct_ring_deinit(&comm->exp);
426 | nccl_uct_ring_deinit(&comm->unexp);
427 | free(comm);
428 | return ncclSuccess;
429 | }
430 |
431 | static ncclResult_t nccl_uct_rd_init_v9(ncclDebugLogger_t logFunction) {
432 | return nccl_uct_rd_init(logFunction, NULL);
433 | }
434 |
435 | static ncclResult_t nccl_uct_rd_isend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
436 | return nccl_uct_rd_isend(sendComm, data, size, tag, mhandle, NULL, request);
437 | }
438 |
439 | static ncclResult_t nccl_uct_rd_irecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
440 | return nccl_uct_rd_irecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
441 | }
442 |
443 | static ncclResult_t nccl_uct_rd_isend_v8(void *send_comm, void *data, int size,
444 | int tag, void *mhandle, void **request) {
445 | return nccl_uct_rd_isend_v9(send_comm, data, (size_t)size, tag, mhandle, request);
446 | }
447 |
448 | static ncclResult_t nccl_uct_rd_irecv_v8(void *recv_comm, int n, void **data,
449 | int *sizes, int *tags, void **mhandles,
450 | void **request) {
451 | size_t sizes_sizet[NCCL_NET_IB_MAX_RECVS];
452 | for (int i=0; i
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include "utils.h"
20 | #include "core.h"
21 | #include "param.h"
22 |
23 | // Allocate memory to be potentially ibv_reg_mr'd. This needs to be
24 | // allocated on separate pages as those pages will be marked DONTFORK
25 | // and if they are shared, that could cause a crash in a child process
26 | ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
27 | size_t page_size = sysconf(_SC_PAGESIZE);
28 | void* p;
29 | int size_aligned = ROUNDUP(size, page_size);
30 | int ret = posix_memalign(&p, page_size, size_aligned);
31 | if (ret != 0) return ncclSystemError;
32 | memset(p, 0, size);
33 | *ptr = p;
34 | return ncclSuccess;
35 | }
36 |
37 | ncclResult_t ncclRealloc(void **ptr, size_t oldNelem, size_t nelem) {
38 | if (nelem < oldNelem) return ncclInternalError;
39 | if (nelem == oldNelem) return ncclSuccess;
40 |
41 | void* oldp = *ptr;
42 | void* p = (void*)malloc(nelem);
43 | if (p == NULL) {
44 | WARN("Failed to malloc %ld bytes", nelem);
45 | return ncclSystemError;
46 | }
47 | memcpy(p, oldp, oldNelem);
48 | free(oldp);
49 | memset(p+oldNelem, 0, (nelem-oldNelem));
50 | *ptr = p;
51 | INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem, nelem, *ptr);
52 | return ncclSuccess;
53 | }
54 |
55 |
56 | int parseStringList(const char* string, struct netIf* ifList, int maxList) {
57 | if (!string) return 0;
58 |
59 | const char* ptr = string;
60 |
61 | int ifNum = 0;
62 | int ifC = 0;
63 | char c;
64 | do {
65 | c = *ptr;
66 | if (c == ':') {
67 | if (ifC > 0) {
68 | ifList[ifNum].prefix[ifC] = '\0';
69 | ifList[ifNum].port = atoi(ptr+1);
70 | ifNum++; ifC = 0;
71 | }
72 | while (c != ',' && c != '\0') c = *(++ptr);
73 | } else if (c == ',' || c == '\0') {
74 | if (ifC > 0) {
75 | ifList[ifNum].prefix[ifC] = '\0';
76 | ifList[ifNum].port = -1;
77 | ifNum++; ifC = 0;
78 | }
79 | } else {
80 | ifList[ifNum].prefix[ifC] = c;
81 | ifC++;
82 | }
83 | ptr++;
84 | } while (ifNum < maxList && c);
85 | return ifNum;
86 | }
87 |
// Compare an interface name against a reference.
// Exact mode compares the whole of `string` including its terminator;
// otherwise only the first strlen(ref) characters are compared, so `ref`
// acts as a prefix. Returns 1 on match, 0 otherwise.
static int matchIf(const char* string, const char* ref, int matchExact) {
  int cmpLen;

  if (matchExact) {
    // Make sure to include '\0' in the exact case
    cmpLen = strlen(string) + 1;
  } else {
    cmpLen = strlen(ref);
  }

  return strncmp(string, ref, cmpLen) == 0;
}
93 |
// Two ports match when they are equal or when either one is -1.
// Returns 1 on match, 0 otherwise.
static int matchPort(const int port1, const int port2) {
  const int either_unset = (port1 == -1) || (port2 == -1);

  return (either_unset || (port1 == port2)) ? 1 : 0;
}
100 |
101 |
102 | int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact) {
103 | // Make an exception for the case where no user list is defined
104 | if (listSize == 0) return 1;
105 |
106 | for (int i=0; i