├── .azuredevops
    ├── multinode-ci-nightly.yml
    ├── multinode-ci-pr.yml
    ├── rocm-ci.yml
    └── tests
    │   └── pytest
    │       └── HelloWorld.py
├── .github
    ├── CODEOWNERS
    ├── PULL_REQUEST_TEMPLATE.md
    └── dependabot.yml
├── .gitignore
├── .gitmodules
├── .jenkins
    ├── common.groovy
    ├── extended.groovy
    ├── precheckin.groovy
    ├── staticanalysis.groovy
    └── staticlibrary.groovy
├── .readthedocs.yaml
├── CHANGELOG.md
├── CMakeLists.txt
├── CppCheckSuppressions.txt
├── LICENSE.txt
├── Makefile
├── NOTICES.txt
├── README.md
├── cmake
    ├── CheckSymbolExistsNoWarn.cmake
    ├── Dependencies.cmake
    ├── DownloadProject.CMakeLists.cmake.in
    ├── DownloadProject.cmake
    ├── FindIBVerbs.cmake
    ├── Findmscclpp_nccl.cmake
    ├── MSCCLPP.cmake
    └── scripts
    │   ├── add_faults.sh
    │   ├── add_unroll.sh
    │   ├── extract_metadata.cmake
    │   └── git_version.cmake
├── docker
    ├── Dockerfile.ubuntu
    └── README.md
├── docs
    ├── .gitignore
    ├── api-reference
    │   ├── api-library.rst
    │   └── library-specification.rst
    ├── attributions.rst
    ├── conf.py
    ├── data
    │   └── how-to
    │   │   └── rccl-usage-tips
    │   │       ├── in-place_allreduce.png
    │   │       └── out-of-place_allreduce.png
    ├── doxygen
    │   ├── Doxyfile
    │   └── mainpage.txt
    ├── how-to
    │   ├── rccl-usage-tips.rst
    │   ├── troubleshooting-rccl.rst
    │   ├── using-nccl.rst
    │   └── using-rccl-tuner-plugin-api.rst
    ├── index.rst
    ├── install
    │   ├── building-installing.rst
    │   ├── docker-install.rst
    │   └── installation.rst
    ├── license.rst
    ├── sphinx
    │   ├── _toc.yml.in
    │   ├── requirements.in
    │   └── requirements.txt
    └── what-is-rccl.rst
├── ext-net
    ├── README.md
    ├── example
    │   ├── Makefile
    │   ├── nccl
    │   │   ├── common.h
    │   │   ├── err.h
    │   │   ├── net.h
    │   │   ├── net_device.h
    │   │   ├── net_v2.h
    │   │   ├── net_v3.h
    │   │   ├── net_v4.h
    │   │   ├── net_v5.h
    │   │   ├── net_v6.h
    │   │   ├── net_v7.h
    │   │   ├── net_v8.h
    │   │   ├── net_v9.h
    │   │   └── types.h
    │   └── plugin.c
    └── google-fastsocket
    │   └── Makefile
├── ext-profiler
    ├── README.md
    └── example
    │   ├── Makefile
    │   ├── README.md
    │   ├── event.c
    │   ├── event.h
    │   ├── nccl
    │       ├── common.h
    │       ├── err.h
    │       ├── profiler.h
    │       ├── profiler_v1.h
    │       ├── profiler_v2.h
    │       └── types.h
    │   ├── plugin.c
    │   ├── print_event.c
    │   └── print_event.h
├── ext-src
    ├── bf16-tuning.patch
    ├── check_ibv_access_relaxed_ordering.cc
    ├── cpx.patch
    ├── device-flag.patch
    ├── mem-reg.patch
    ├── mscclpp_ibv_access_relaxed_ordering.patch
    ├── no-cache.patch
    ├── non-multiple-128-fix.patch
    ├── read-allred.patch
    ├── reg-fix.patch
    └── remove-clip.patch
├── ext-tuner
    ├── README.md
    └── example
    │   ├── Makefile
    │   ├── nccl
    │       ├── common.h
    │       ├── err.h
    │       └── tuner.h
    │   └── plugin.c
├── install.sh
├── makefiles
    ├── common.mk
    ├── formatting.mk
    └── version.mk
├── pkg
    ├── Makefile
    ├── debian
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── changelog.in
    │   ├── compat
    │   ├── control.in
    │   ├── copyright
    │   ├── gbp.conf
    │   ├── libnccl-dev.install.in
    │   ├── libnccl2.install.in
    │   ├── rules
    │   └── source
    │   │   └── format
    ├── redhat
    │   ├── Makefile
    │   └── nccl.spec.in
    ├── srctxz
    │   ├── Makefile
    │   └── create_srctxz.sh.in
    └── txz
    │   ├── Makefile
    │   └── create_txz.sh.in
├── rtest.xml
├── src
    ├── Makefile
    ├── bootstrap.cc
    ├── channel.cc
    ├── collectives.cc
    ├── debug.cc
    ├── device
    │   ├── all_gather.h
    │   ├── all_reduce.h
    │   ├── alltoall_pivot.h
    │   ├── broadcast.h
    │   ├── common.cu
    │   ├── common.h
    │   ├── common_kernel.h
    │   ├── generate.py
    │   ├── msccl_kernel_impl.h
    │   ├── network
    │   │   └── unpack
    │   │   │   ├── unpack.h
    │   │   │   └── unpack_defs.h
    │   ├── onerank.cu
    │   ├── op128.h
    │   ├── primitives.h
    │   ├── prims_ll.h
    │   ├── prims_ll128.h
    │   ├── prims_simple.h
    │   ├── reduce.h
    │   ├── reduce_kernel.h
    │   ├── reduce_scatter.h
    │   └── sendrecv.h
    ├── enhcompat.cc
    ├── enqueue.cc
    ├── graph
    │   ├── connect.cc
    │   ├── paths.cc
    │   ├── rings.cc
    │   ├── rings.h
    │   ├── rome_models.cc
    │   ├── rome_models.h
    │   ├── search.cc
    │   ├── topo.cc
    │   ├── topo.h
    │   ├── trees.cc
    │   ├── tuning.cc
    │   ├── xml.cc
    │   └── xml.h
    ├── group.cc
    ├── include
    │   ├── BfdBacktrace.hpp
    │   ├── alloc.h
    │   ├── alt_rsmi.h
    │   ├── api_trace.h
    │   ├── archinfo.h
    │   ├── argcheck.h
    │   ├── bitops.h
    │   ├── bootstrap.h
    │   ├── channel.h
    │   ├── checks.h
    │   ├── coll_net.h
    │   ├── collectives.h
    │   ├── comm.h
    │   ├── core.h
    │   ├── cpuset.h
    │   ├── cudawrap.h
    │   ├── debug.h
    │   ├── device.h
    │   ├── enqueue.h
    │   ├── gdrwrap.h
    │   ├── git_version.h
    │   ├── graph.h
    │   ├── group.h
    │   ├── hip_rocm_version_info.h
    │   ├── ibvcore.h
    │   ├── ibvsymbols.h
    │   ├── ibvwrap.h
    │   ├── info.h
    │   ├── ipcsocket.h
    │   ├── mnnvl.h
    │   ├── msccl
    │   │   ├── msccl_kernel.h
    │   │   ├── msccl_lifecycle.h
    │   │   ├── msccl_parser.h
    │   │   ├── msccl_scheduler.h
    │   │   ├── msccl_setup.h
    │   │   ├── msccl_status.h
    │   │   └── msccl_struct.h
    │   ├── mscclpp
    │   │   └── mscclpp_nccl.h
    │   ├── nccl_common.h
    │   ├── nccl_net.h
    │   ├── nccl_profiler.h
    │   ├── nccl_tuner.h
    │   ├── net.h
    │   ├── net_device.h
    │   ├── npkit
    │   │   ├── npkit.h
    │   │   ├── npkit_event.h
    │   │   └── npkit_struct.h
    │   ├── nvmlwrap.h
    │   ├── nvtx.h
    │   ├── nvtx3
    │   │   ├── nvToolsExt.h
    │   │   ├── nvToolsExtCounters.h
    │   │   ├── nvToolsExtCuda.h
    │   │   ├── nvToolsExtCudaRt.h
    │   │   ├── nvToolsExtMem.h
    │   │   ├── nvToolsExtMemCudaRt.h
    │   │   ├── nvToolsExtOpenCL.h
    │   │   ├── nvToolsExtPayload.h
    │   │   ├── nvToolsExtPayloadHelper.h
    │   │   ├── nvToolsExtSemanticsCounters.h
    │   │   ├── nvToolsExtSemanticsScope.h
    │   │   ├── nvToolsExtSync.h
    │   │   ├── nvtx3.hpp
    │   │   └── nvtxDetail
    │   │   │   ├── nvtxExtHelperMacros.h
    │   │   │   ├── nvtxExtImpl.h
    │   │   │   ├── nvtxExtImplCounters_v1.h
    │   │   │   ├── nvtxExtImplMemCudaRt_v1.h
    │   │   │   ├── nvtxExtImplMem_v1.h
    │   │   │   ├── nvtxExtImplPayload_v1.h
    │   │   │   ├── nvtxExtInit.h
    │   │   │   ├── nvtxExtPayloadHelperInternal.h
    │   │   │   ├── nvtxExtPayloadTypeInfo.h
    │   │   │   ├── nvtxExtTypes.h
    │   │   │   ├── nvtxImpl.h
    │   │   │   ├── nvtxImplCore.h
    │   │   │   ├── nvtxImplCudaRt_v3.h
    │   │   │   ├── nvtxImplCuda_v3.h
    │   │   │   ├── nvtxImplOpenCL_v3.h
    │   │   │   ├── nvtxImplSync_v3.h
    │   │   │   ├── nvtxInit.h
    │   │   │   ├── nvtxInitDecls.h
    │   │   │   ├── nvtxInitDefs.h
    │   │   │   ├── nvtxLinkOnce.h
    │   │   │   └── nvtxTypes.h
    │   ├── nvtx_payload_schemas.h
    │   ├── nvtx_stub.h
    │   ├── p2p.h
    │   ├── param.h
    │   ├── profiler.h
    │   ├── proxy.h
    │   ├── ras.h
    │   ├── rccl_common.h
    │   ├── rccl_float8.h
    │   ├── rccl_vars.h
    │   ├── recorder.h
    │   ├── register.h
    │   ├── rocm_smi_wrap.h
    │   ├── rocmwrap.h
    │   ├── roctx.h
    │   ├── shm.h
    │   ├── shmutils.h
    │   ├── signals.h
    │   ├── socket.h
    │   ├── strongstream.h
    │   ├── timer.h
    │   ├── transport.h
    │   ├── trees.h
    │   ├── tuner.h
    │   └── utils.h
    ├── init.cc
    ├── init_nvtx.cc
    ├── misc
    │   ├── alt_rsmi.cc
    │   ├── api_trace.c
    │   ├── api_trace.cc
    │   ├── archinfo.cc
    │   ├── argcheck.cc
    │   ├── cudawrap.cc
    │   ├── gdrwrap.cc
    │   ├── ibvsymbols.cc
    │   ├── ibvwrap.cc
    │   ├── ipcsocket.cc
    │   ├── msccl
    │   │   ├── msccl_lifecycle.cc
    │   │   ├── msccl_parser.cc
    │   │   ├── msccl_setup.cc
    │   │   └── msccl_status.cc
    │   ├── mscclpp
    │   │   ├── mscclpp_nccl.cc
    │   │   └── mscclpp_nccl_syms.txt
    │   ├── npkit.cc
    │   ├── nvmlwrap.cc
    │   ├── nvmlwrap_stub.cc
    │   ├── param.cc
    │   ├── profiler.cc
    │   ├── recorder.cc
    │   ├── rocm_smi_wrap.cc
    │   ├── rocmwrap.cc
    │   ├── roctx.cc
    │   ├── shmutils.cc
    │   ├── signals.cc
    │   ├── socket.cc
    │   ├── strongstream.cc
    │   ├── tuner.cc
    │   └── utils.cc
    ├── mnnvl.cc
    ├── msccl.cc
    ├── nccl.h.in
    ├── nccl.pc.in
    ├── net.cc
    ├── proxy.cc
    ├── ras
    │   ├── client.cc
    │   ├── client_support.cc
    │   ├── collectives.cc
    │   ├── peers.cc
    │   ├── ras.cc
    │   ├── ras_internal.h
    │   └── rasnet.cc
    ├── rccl_wrap.cc
    ├── register
    │   ├── coll_reg.cc
    │   ├── register.cc
    │   └── sendrecv_reg.cc
    ├── transport.cc
    └── transport
    │   ├── coll_net.cc
    │   ├── generic.cc
    │   ├── net.cc
    │   ├── net_ib.cc
    │   ├── net_socket.cc
    │   ├── nvls.cc
    │   ├── p2p.cc
    │   └── shm.cc
├── test
    ├── AllGatherTests.cpp
    ├── AllReduceTests.cpp
    ├── AllToAllTests.cpp
    ├── AllToAllVTests.cpp
    ├── BroadcastTests.cpp
    ├── CMakeLists.txt
    ├── GatherTests.cpp
    ├── GroupCallTests.cpp
    ├── NonBlockingTests.cpp
    ├── ReduceScatterTests.cpp
    ├── ReduceTests.cpp
    ├── ScatterTests.cpp
    ├── SendRecvTests.cpp
    ├── StandaloneTests.cpp
    ├── _RecorderTests.cpp
    └── common
    │   ├── CallCollectiveForked.cpp
    │   ├── CallCollectiveForked.hpp
    │   ├── CollectiveArgs.cpp
    │   ├── CollectiveArgs.hpp
    │   ├── EnvVars.cpp
    │   ├── EnvVars.hpp
    │   ├── ErrCode.hpp
    │   ├── PrepDataFuncs.cpp
    │   ├── PrepDataFuncs.hpp
    │   ├── PtrUnion.cpp
    │   ├── PtrUnion.hpp
    │   ├── RcclMockFuncs.hpp
    │   ├── StandaloneUtils.cpp
    │   ├── StandaloneUtils.hpp
    │   ├── TestBed.cpp
    │   ├── TestBed.hpp
    │   ├── TestBedChild.cpp
    │   ├── TestBedChild.hpp
    │   └── main.cpp
├── toolchain-linux.cmake
└── tools
    ├── EmptyKernelTest
        ├── EmptyKernelTest.cpp
        └── Makefile
    ├── GraphBench
        ├── GraphBench.cpp
        └── Makefile
    ├── HelloRccl
        ├── HelloRccl.cpp
        ├── HelloRccl.hpp
        ├── Makefile
        └── runTest.sh
    ├── JitterBench
        ├── Common.hpp
        ├── Compatibility.hpp
        ├── GetClosestNumaNode.hpp
        ├── JitterBench.cpp
        ├── Makefile
        ├── Timeline.hpp
        └── runSweep.sh
    ├── RcclReplayer
        ├── Makefile
        ├── README.md
        ├── rcclReplayer.cpp
        └── rcclReplayer.hpp
    ├── TopoVisual
        ├── 4_nodes.log.png
        ├── README.md
        ├── extract_topo.awk
        └── topo_visual.sh
    ├── TransferBench
        └── README.md
    ├── ib-test
        ├── Makefile
        ├── ib_test.cpp
        ├── include
        │   └── nccl.h
        └── utils.cpp
    ├── msccl-algorithms
        ├── allgather_16n_direct_0_3m_ll128.xml
        ├── allgather_16n_direct_0_3m_ll128_op.xml
        ├── allgather_32n_direct_0_6m_ll128.xml
        ├── allgather_32n_direct_0_6m_ll128_op.xml
        ├── allreduce-allpairs-8n-ll-32tb-op.xml
        ├── allreduce-allpairs-8n-ll-32tb.xml
        ├── allreduce-allpairs-8n-ll-64tb-op.xml
        ├── allreduce-allpairs-8n-ll-64tb.xml
        ├── allreduce-allpairs-8n-simple-op.xml
        ├── allreduce-allpairs-8n-simple.xml
        ├── alltoall-8n-0-9kb.xml
        ├── alltoall-8n-190kb-512kb.xml
        ├── alltoall-8n-512kb-7mb.xml
        ├── alltoall-8n-7mb-43mb.xml
        └── alltoall-8n-9kb-190kb.xml
    ├── msccl-unit-test-algorithms
        ├── all-reduce-ring-ll.xml
        ├── all-reduce-ring-ll128.xml
        └── all-reduce-ring-simple.xml
    ├── p2p-latency-test
        ├── Makefile
        ├── build_and_run.sh
        ├── ll_latency_test.cpp
        ├── ll_latency_test.cu
        └── p2p_latency_test.cpp
    ├── rccl-prim-test
        ├── Makefile
        ├── copy_kernel.h
        └── rccl_prim_test.cpp
    ├── scripts
        ├── npkit_trace_analysis.py
        ├── npkit_trace_generator.py
        ├── pytorch-all-reduce
        │   ├── README.md
        │   ├── all_reduce.py
        │   └── trace_runs.sh
        ├── pytorch-log-parser.py
        ├── rcclDiagnostics.py
        ├── rccl_bw_test.py
        ├── rocprof-log-parser.py
        ├── topo_val.sh
        └── ucx_ompi_rccl_rccltests_TB_script.sh
    ├── time-trace
        ├── rccl-TimeTrace.sh
        └── time_trace_generator.py
    └── topo_expl
        ├── Makefile
        ├── include
            ├── device_table.h
            ├── model.h
            ├── nccl.h
            └── utils.h
        ├── model.cpp
        ├── models
            ├── topo_16p1h.xml
            ├── topo_16p1h_vm.xml
            ├── topo_16p_gio-1s-1rp-cascade.xml
            ├── topo_16p_gio-3s-1rp-split-flat.xml
            ├── topo_3p_pcie.xml
            ├── topo_3p_pcie_1.xml
            ├── topo_4p1h.xml
            ├── topo_4p1h_1.xml
            ├── topo_4p2h.xml
            ├── topo_4p2h_1.xml
            ├── topo_4p2h_2nic.xml
            ├── topo_4p3l.xml
            ├── topo_4p3l_2h.xml
            ├── topo_4p3l_ia.xml
            ├── topo_4p3l_n2.xml
            ├── topo_4p3l_n2_1.xml
            ├── topo_4p3l_n4.xml
            ├── topo_4p4h.xml
            ├── topo_4p_942.xml
            ├── topo_8p1h.xml
            ├── topo_8p1h_1.xml
            ├── topo_8p1h_2.xml
            ├── topo_8p1h_3.xml
            ├── topo_8p1h_4.xml
            ├── topo_8p1h_5.xml
            ├── topo_8p1h_n1.xml
            ├── topo_8p6l.xml
            ├── topo_8p6l_1nic.xml
            ├── topo_8p6l_2nic.xml
            ├── topo_8p6l_3nic.xml
            ├── topo_8p6l_4nic.xml
            ├── topo_8p6l_5nic.xml
            ├── topo_8p6l_6nic.xml
            ├── topo_8p_4nics.xml
            ├── topo_8p_90a.xml
            ├── topo_8p_90a_1.xml
            ├── topo_8p_942.xml
            ├── topo_8p_942vm.xml
            ├── topo_8p_pcie.xml
            ├── topo_8p_pcie_1.xml
            ├── topo_8p_pcie_2nic.xml
            ├── topo_8p_rome.xml
            ├── topo_8p_rome_4n_1.xml
            ├── topo_8p_rome_4n_2.xml
            ├── topo_8p_rome_4nics.xml
            ├── topo_8p_rome_n2.xml
            ├── topo_8p_rome_n2_1.xml
            ├── topo_8p_rome_n2_2.xml
            ├── topo_8p_rome_n4.xml
            ├── topo_8p_rome_n4_1.xml
            ├── topo_8p_rome_pcie.xml
            ├── topo_8p_rome_vm1.xml
            ├── topo_8p_ts1.xml
            ├── topo_8p_ts1_1.xml
            ├── topo_8p_ts1_n4.xml
            ├── topo_8p_ts1_n4_1.xml
            ├── topo_8p_ts1_n4_2.xml
            ├── topo_collnet_n1.xml
            └── topo_collnet_n4.xml
        ├── topo_expl.cpp
        └── utils.cpp


/.azuredevops/multinode-ci-nightly.yml:
--------------------------------------------------------------------------------
 1 | resources:
 2 |   repositories:
 3 |   - repository: pipelines_repo
 4 |     type: github
 5 |     endpoint: ROCm
 6 |     name: ROCm/ROCm
 7 | 
 8 | variables:
 9 | - group: common
10 | - template: /.azuredevops/variables-global.yml@pipelines_repo
11 | - name: pytestFolder
12 |   value: '.azuredevops/tests/pytest'
13 | 
14 | parameters:
15 | - name: pytestList
16 |   type: object
17 |   default:
18 |     - HelloWorld
19 | 
20 | trigger: none
21 | pr: none
22 | schedules:  # two entries approximate 11 PM US Central across standard and daylight time
23 |   - cron: "0 5 * 11,12,1-3 *"  # 11 PM CST (November - March); months are listed explicitly because a wrap-around range (11-3) is not portable cron syntax
24 |     displayName: "Nightly Build (CST)"
25 |     branches:
26 |       include:
27 |         - develop
28 |     always: false
29 | 
30 |   - cron: "0 4 * 4-10 *"  # 11 PM CDT (April - October)
31 |     displayName: "Nightly Build (CDT)"
32 |     branches:
33 |       include:
34 |         - develop
35 |     always: false
36 | 
37 | jobs:
38 | - job: rccl
39 |   timeoutInMinutes: 180
40 |   pool: rocm-ci_rccl_pool
41 |   workspace:
42 |     clean: all
43 |   steps:
44 |   - task: DeleteFiles@1
45 |     inputs:
46 |       Contents: '**/*'
47 |   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
48 |     parameters:
49 |       submoduleBehaviour: recursive
50 |   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
51 |     parameters:
52 |       installEnabled: false
53 |       printDiskSpace: false
54 |       extraBuildFlags: >-
55 |         -DCMAKE_BUILD_TYPE=Release
56 |         -DBUILD_TESTS=ON
57 |         -GNinja
58 |   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
59 |     parameters:
60 |       componentName: rccl
61 |       testDir: $(Build.SourcesDirectory)/build/test
62 |       testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
63 |       testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
64 |   - ${{ each pytestScript in parameters.pytestList }}:
65 |     - task: Bash@3
66 |       displayName: Test ${{ pytestScript }}
67 |       continueOnError: true
68 |       inputs:
69 |         targetType: inline
70 |         workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
71 |         script: pytest ${{ pytestScript }}.py
72 | 


--------------------------------------------------------------------------------
/.azuredevops/multinode-ci-pr.yml:
--------------------------------------------------------------------------------
 1 | resources:
 2 |   repositories:
 3 |   - repository: pipelines_repo
 4 |     type: github
 5 |     endpoint: ROCm
 6 |     name: ROCm/ROCm
 7 | 
 8 | variables:
 9 | - group: common
10 | - template: /.azuredevops/variables-global.yml@pipelines_repo
11 | - name: pytestFolder
12 |   value: '.azuredevops/tests/pytest'
13 | 
14 | parameters:
15 | - name: pytestList
16 |   type: object
17 |   default:
18 |     - HelloWorld
19 | 
20 | trigger: none
21 | pr:
22 |   autoCancel: true
23 |   branches:
24 |     include:
25 |     - develop
26 |   paths:
27 |     exclude:
28 |     - .github
29 |     - .jenkins
30 |     - docs
31 |     - '*.md'
32 |     - LICENSE.txt
33 |     - NOTICES.txt
34 |   drafts: false
35 | 
36 | stages:
37 | - stage: rcclStage
38 |   displayName: 'RCCL develop PR'
39 |   jobs:
40 |   - deployment: rccl_pr_approval
41 |     displayName: "CI Run Requires Approval"
42 |     environment: rccl
43 |   - job: rccl
44 |     timeoutInMinutes: 180
45 |     pool: rocm-ci_rccl_pool
46 |     workspace:
47 |       clean: all
48 |     steps:
49 |     - task: DeleteFiles@1
50 |       inputs:
51 |         Contents: '**/*'
52 |     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
53 |       parameters:
54 |         submoduleBehaviour: recursive
55 |     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
56 |       parameters:
57 |         installEnabled: false
58 |         printDiskSpace: false
59 |         extraBuildFlags: >-
60 |           -DCMAKE_BUILD_TYPE=Release
61 |           -DBUILD_TESTS=ON
62 |           -DGPU_TARGETS=gfx942
63 |           -GNinja
64 |     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
65 |       parameters:
66 |         componentName: rccl
67 |         testDir: $(Build.SourcesDirectory)/build/test
68 |         testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
69 |         testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
70 |     - ${{ each pytestScript in parameters.pytestList }}:
71 |       - task: Bash@3
72 |         displayName: Test ${{ pytestScript }}
73 |         continueOnError: true
74 |         inputs:
75 |           targetType: inline
76 |           workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
77 |           script: pytest ${{ pytestScript }}.py
78 | 


--------------------------------------------------------------------------------
/.azuredevops/rocm-ci.yml:
--------------------------------------------------------------------------------
 1 | resources:
 2 |   repositories:
 3 |   - repository: pipelines_repo
 4 |     type: github
 5 |     endpoint: ROCm
 6 |     name: ROCm/ROCm
 7 | 
 8 | variables:
 9 | - group: common
10 | - template: /.azuredevops/variables-global.yml@pipelines_repo
11 | 
12 | trigger:
13 |   batch: true
14 |   branches:
15 |     include:
16 |     - develop
17 |     - mainline
18 |   paths:
19 |     exclude:
20 |     - .github
21 |     - .jenkins
22 |     - docs
23 |     - '.*.y*ml'
24 |     - '*.md'
25 |     - LICENSE.txt
26 |     - NOTICES.txt
27 | 
28 | pr:
29 |   autoCancel: true
30 |   branches:
31 |     include:
32 |     - develop
33 |     - mainline
34 |   paths:
35 |     exclude:
36 |     - .github
37 |     - .jenkins
38 |     - docs
39 |     - '.*.y*ml'
40 |     - '*.md'
41 |     - LICENSE.txt
42 |     - NOTICES.txt
43 |   drafts: false
44 | 
45 | jobs:
46 |   - template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo
47 | 


--------------------------------------------------------------------------------
/.azuredevops/tests/pytest/HelloWorld.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | def test_HelloWorld():  # Trivial always-passing test; it exercises no project code.
4 |     greeting = "Hello, World!"
5 |     assert greeting == "Hello, World!"  # compares the literal assigned above with an identical literal
6 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @wenkaidu @gilbertlee-amd @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @BertanDogancay @rahulvaidya20 @isaki001 @PJAvinash @AbandiGa @Nikhil-Nunna @haripriya-amd
2 | # Documentation files
3 | docs/ @ROCm/rocm-documentation
4 | *.md @ROCm/rocm-documentation
5 | *.rst @ROCm/rocm-documentation
6 | .readthedocs.yaml @ROCm/rocm-documentation
7 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Details
 2 | ___Do not mention proprietary info or link to internal work items in this PR.___
 3 | 
 4 | **Work item:** _"Internal", or link to GitHub issue (if applicable)._
 5 | 
 6 | **What were the changes?**  
 7 | _One sentence describing the work done._
 8 | 
 9 | **Why were the changes made?**  
10 | _Explain the motivation behind the work. Provide any publicly-available historical context._
11 | 
12 | **How was the outcome achieved?**  
13 | _Technical details behind the work. Explain any publicly-available hardware peculiarities._
14 | 
15 | **Additional Documentation:**  
16 | _What else should the reviewer know?_
17 | 
18 | ## Approval Checklist
19 | ___Do not approve until these items are satisfied.___
20 | - [ ] Verify the CHANGELOG has been updated, if
21 |   - there are any NCCL API version changes,
22 |   - any changes impact library users, and/or
23 |   - any changes impact any other ROCm library.
24 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "pip" # See documentation for possible values
 9 |     directory: "/docs/sphinx" # Location of package manifests
10 |     open-pull-requests-limit: 10
11 |     schedule:
12 |       interval: "daily"
13 |     labels:
14 |       - "dependencies"
15 |       - "ci:docs-only"
16 |     reviewers:
17 |       - "samjwu"
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
2 | *.gcov
3 | /coverage/
4 | build/
5 | ext/
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "ext-src/mscclpp"]
 2 | 	path = ext-src/mscclpp
 3 | 	url = https://github.com/microsoft/mscclpp.git
 4 | 	ignore = dirty
 5 | 	shallow = true
 6 | [submodule "ext-src/json"]
 7 | 	path = ext-src/json
 8 | 	url = https://github.com/nlohmann/json.git
 9 | 	ignore = dirty
10 | 	shallow = true
11 | 


--------------------------------------------------------------------------------
/.jenkins/common.groovy:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
 2 | // This file is for internal AMD use.
 3 | // If you are interested in running your own Jenkins, please raise a github issue for assistance.
 4 | 
 5 | def runCompileCommand(platform, project, jobName)  // Runs project.paths.build_command from the project build prefix directory; jobName is not used in this body.
 6 | {
 7 |     project.paths.construct_build_prefix()  // presumably populates project_build_prefix used below — confirm in the shared library
 8 | 
 9 |     def command = """#!/usr/bin/env bash
10 |                 set -x
11 |                 cd ${project.paths.project_build_prefix}
12 |                 ${project.paths.build_command}
13 |             """
14 | 
15 |     platform.runCommand(this,command)  // hand the generated bash script to the platform object for execution
16 | }
17 | 
18 | def runTestCommand (platform, project, gfilter, envars)  // Runs ./rccl-UnitTests from build/release/test with --gtest_filter=gfilter; envars is placed in front of the test command line.
19 | {
20 |     String sudo = auxiliary.sudo(platform.jenkinsLabel)  // prefix interpolated before the ulimit and test commands below. NOTE(review): ulimit is a shell builtin, so running it behind a sudo prefix may not raise the limit for this shell — confirm.
21 | 
22 |     def command = """#!/usr/bin/env bash
23 |                 set -x
24 |                 export RUN_TEST_ROOT=\$(pwd)
25 |                 cd ${project.paths.project_build_prefix}/build/release/test
26 |                 ${sudo} ulimit -l unlimited
27 |                 ulimit -a
28 |                 ${sudo} ${envars} LD_LIBRARY_PATH=\${RUN_TEST_ROOT}/${project.paths.project_build_prefix}/build/release:\${LD_LIBRARY_PATH} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 UT_MULTITHREAD=1 UT_PROCESS_MASK=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
29 |             """
30 | 
31 |    platform.runCommand(this, command)  // execute the generated bash script through the platform object
32 | }
33 | 
34 | def runPackageCommand(platform, project, jobName)  // Packages the build/release directory and archives the result; jobName is not used in this body.
35 | {
36 |     def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
37 | 
38 |     platform.runCommand(this, packageHelper[0])  // element 0 is run as a command (presumably the packaging script — confirm)
39 |     platform.archiveArtifacts(this, packageHelper[1])  // element 1 is passed to archiveArtifacts (presumably the artifact path or pattern — confirm)
40 | }
41 | 
42 | return this
43 | 


--------------------------------------------------------------------------------
/.jenkins/staticanalysis.groovy:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env groovy
 2 | // Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
 3 | // This shared library is available at https://github.com/ROCm/rocJENKINS/
 4 | @Library('rocJenkins@pong') _
 5 | 
 6 | // This file is for internal AMD use.
 7 | // If you are interested in running your own Jenkins, please raise a github issue for assistance.
 8 | 
 9 | import com.amd.project.*
10 | import com.amd.docker.*
11 | import java.nio.file.Path
12 | 
13 | def runCompileCommand(platform, project, jobName, boolean debug=false)  // Only constructs the build prefix; no build command is run. platform, jobName and debug are not used in this body.
14 | {
15 |     project.paths.construct_build_prefix()
16 | }
17 | 
18 | def runCI =  // Closure taking (nodeDetails, jobName); hands the 'rccl' StaticAnalysis project to buildProject.
19 | {
20 |     nodeDetails, jobName->
21 | 
22 |     def prj  = new rocProject('rccl', 'StaticAnalysis')
23 | 
24 |     // Define test architectures, optional rocm version argument is available
25 |     def nodes = new dockerNodes(nodeDetails, jobName, prj)
26 | 
27 |     boolean formatCheck = false
28 |     boolean staticAnalysis = true
29 | 
30 |     def compileCommand =  // adapter closure: forwards (platform, project) to runCompileCommand with this job's name and debug=false
31 |     {
32 |         platform, project->
33 | 
34 |         runCompileCommand(platform, project, jobName, false)
35 |     }
36 | 
37 |     buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)  // the two null arguments are presumably the test and package commands — confirm against the shared library
38 | }
39 | 
40 | ci: {  // Entry block: compares the job name derived from env.BUILD_URL against the two maps below.
41 |     String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
42 | 
43 |     def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],  // cron fields: minute 0, hour 1, day-of-week 0
44 |                         "rocm-docker":[]]
45 |     propertyList = auxiliary.appendPropertyList(propertyList)
46 | 
47 |     def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]]
48 |     jobNameList = auxiliary.appendJobNameList(jobNameList)
49 | 
50 |     propertyList.each  // apply job properties only for the entry whose key equals urlJobName
51 |     {
52 |         jobName, property->
53 |         if (urlJobName == jobName)
54 |             properties(auxiliary.addCommonProperties(property))
55 |     }
56 | 
57 |     jobNameList.each  // run runCI inside a stage named after the matching job
58 |     {
59 |         jobName, nodeDetails->
60 |         if (urlJobName == jobName)
61 |             stage(jobName) {
62 |                 runCI(nodeDetails, jobName)
63 |             }
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | version: 2
 5 | 
 6 | build:
 7 |    os: ubuntu-22.04
 8 |    tools:
 9 |       python: "3.10"
10 | 
11 | sphinx:
12 |    configuration: docs/conf.py
13 | 
14 | formats: [htmlzip, pdf, epub]
15 | 
16 | python:
17 |    install:
18 |    - requirements: docs/sphinx/requirements.txt
19 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | Attributions
 3 | 
 4 | Contains contributions from NVIDIA.
 5 | 
 6 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 7 | Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 8 | Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 9 | 
10 | Redistribution and use in source and binary forms, with or without
11 | modification, are permitted provided that the following conditions
12 | are met:
13 | 
14 | *  Redistributions of source code must retain the above copyright
15 |    notice, this list of conditions and the following disclaimer.
16 | *  Redistributions in binary form must reproduce the above copyright
17 |    notice, this list of conditions and the following disclaimer in the
18 |    documentation and/or other materials provided with the distribution.
19 | *  Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
20 |    Laboratory, the U.S. Department of Energy, nor the names of their
21 |    contributors may be used to endorse or promote products derived
22 |    from this software without specific prior written permission.
23 | 
24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
25 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27 | PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | 
36 | The U.S. Department of Energy funded the development of this software
37 | under subcontract 7078610 with Lawrence Berkeley National Laboratory.
38 | 
39 | 
40 | This code also includes files from the NVIDIA Tools Extension SDK project.
41 | 
42 | See:
43 | 
44 | https://github.com/NVIDIA/NVTX
45 | 
46 | for more information and license details.
47 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | .PHONY : all clean
 7 | # "default" and "install" forward to the matching goal in src/ through the src.% rule below.
 8 | default : src.build
 9 | install : src.install
10 | BUILDDIR ?= $(abspath ./build)
11 | ABSBUILDDIR := $(abspath $(BUILDDIR))
12 | TARGETS := src pkg
13 | clean: ${TARGETS:%=%.clean}
14 | test.build: src.build
15 | LICENSE_FILES := LICENSE.txt
16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
17 | lic: $(LICENSE_TARGETS)
18 | # Copy a top-level .txt file (the license files listed above) into BUILDDIR.
19 | ${BUILDDIR}/%.txt: %.txt
20 | 	@printf "Copying    %-35s > %s\n" $< $@
21 | 	mkdir -p ${BUILDDIR}
22 | 	cp $< $@
23 | # src.<goal> runs <goal> in the src/ sub-make, passing the absolute build directory.
24 | src.%:
25 | 	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
26 | # pkg.<goal> does the same for the pkg/ sub-make.
27 | pkg.%:
28 | 	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
29 | # The Debian and txz packaging prep goals need the license files copied first ("lic").
30 | pkg.debian.prep: lic
31 | pkg.txz.prep: lic
32 | 


--------------------------------------------------------------------------------
/cmake/CheckSymbolExistsNoWarn.cmake:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | # These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile,
24 | # while ROCMChecks gives a warning if this variable is modified manually without a target.
25 | 
26 | # We now choose to disable ROCMChecks for this one case.
27 | # Flag read by the rocm_check_toolchain_var override below; while ON the check is skipped.
28 | set(DISABLE_ROCM_CHECK OFF)
29 | # Override that forwards its four arguments to _rocm_check_toolchain_var unless DISABLE_ROCM_CHECK is set.
30 | function(rocm_check_toolchain_var var access value list_file)
31 |   if(NOT DISABLE_ROCM_CHECK)
32 |     _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
33 |   endif()
34 | endfunction()
35 | # Override that calls _check_symbol_exists with DISABLE_ROCM_CHECK turned ON for the duration of the call.
36 | macro(CHECK_SYMBOL_EXISTS)
37 |   set(DISABLE_ROCM_CHECK ON)
38 |   _check_symbol_exists(${ARGN})
39 |   set(DISABLE_ROCM_CHECK OFF)
40 | endmacro()
41 | 


--------------------------------------------------------------------------------
/cmake/DownloadProject.CMakeLists.cmake.in:
--------------------------------------------------------------------------------
 1 | # Distributed under the OSI-approved MIT License.  See accompanying
 2 | # file LICENSE or https://github.com/Crascit/DownloadProject for details.
 3 | 
 4 | cmake_minimum_required(VERSION 2.8.2)
 5 | # Template file: the ${DL_ARGS_*} references are presumably filled in by DownloadProject.cmake — TODO confirm.
 6 | project(${DL_ARGS_PROJ}-download NONE)
 7 | # One external project placed in DL_ARGS_SOURCE_DIR, built in-source, with an empty test command.
 8 | include(ExternalProject)
 9 | ExternalProject_Add(${DL_ARGS_PROJ}-download
10 |                     ${DL_ARGS_UNPARSED_ARGUMENTS}
11 |                     SOURCE_DIR          "${DL_ARGS_SOURCE_DIR}"
12 |                     BUILD_IN_SOURCE     TRUE
13 |                     TEST_COMMAND        ""
14 | )


--------------------------------------------------------------------------------
/cmake/FindIBVerbs.cmake:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | # Locate the InfiniBand Verbs header (infiniband/verbs.h) and library (ibverbs).
24 | #
25 | # Optional hint variables: IBVERBS_INCLUDE_DIR, IBVERBS_LIB_DIR, IBVERBS_ROOT_DIR.
26 | # Result variables: IBVERBS_INCLUDE_DIRS, IBVERBS_LIBRARIES, plus the found flag
27 | # set by find_package_handle_standard_args(IBVerbs ...).
28 | 
29 | find_path(IBVERBS_INCLUDE_DIRS
30 |   NAMES infiniband/verbs.h
31 |   HINTS
32 |   ${IBVERBS_INCLUDE_DIR}
33 |   ${IBVERBS_ROOT_DIR}
34 |   ${IBVERBS_ROOT_DIR}/include)
35 | 
36 | find_library(IBVERBS_LIBRARIES
37 |   NAMES ibverbs
38 |   HINTS
39 |   ${IBVERBS_LIB_DIR}
40 |   ${IBVERBS_ROOT_DIR}
41 |   ${IBVERBS_ROOT_DIR}/lib)
42 | 
43 | include(FindPackageHandleStandardArgs)
44 | find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
45 | # Mark the cache entries created by find_path/find_library above as advanced.
46 | # The include entry is IBVERBS_INCLUDE_DIRS (plural); IBVERBS_INCLUDE_DIR is only a hint.
47 | mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
48 | 


--------------------------------------------------------------------------------
/cmake/Findmscclpp_nccl.cmake:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | find_path(MSCCLPP_INCLUDE_DIRS
24 |     NAMES mscclpp/gpu.hpp
25 |     HINTS
26 |     ${MSCCLPP_ROOT}/include)
27 | # Library lookup: mscclpp_nccl, with ${MSCCLPP_ROOT}/lib as the search hint.
28 | find_library(MSCCLPP_LIBRARIES
29 |     NAMES mscclpp_nccl
30 |     HINTS
31 |     ${MSCCLPP_ROOT}/lib)
32 | # Report the result for package mscclpp_nccl and mark both cache entries as advanced.
33 | include (FindPackageHandleStandardArgs)
34 | find_package_handle_standard_args(mscclpp_nccl DEFAULT_MSG MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
35 | mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
36 |     


--------------------------------------------------------------------------------
/cmake/scripts/add_faults.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in all
11 | # copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | # SOFTWARE.
20 | 
21 | HIP_FILE=$1
22 | # For paths matching .*/src/device/.*\.h, edit the file in place so the first __syncthreads() on each line is followed by insert_random_delay_per_warp().
23 | if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
24 |   sed -i "s/__syncthreads()/__syncthreads(); insert_random_delay_per_warp()/" "$HIP_FILE"
25 | 
26 |   echo "Added fault injection to $HIP_FILE"
27 | fi


--------------------------------------------------------------------------------
/cmake/scripts/add_unroll.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in all
11 | # copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | # SOFTWARE.
20 | 
21 | HIP_FILE=$1
22 | # For paths matching .*/src/device/.*\.h, edit the file in place so the template headers and calls matched below gain a COLL_UNROLL argument.
23 | if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
24 |   perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?)(, bool isNetOffload.*?)?>/\1, int COLL_UNROLL\2>/g' "$HIP_FILE"
25 |   perl -pi -e 's/(ProtoSimple<[^,]*?,[^,]+?)>/\1, COLL_UNROLL>/g' "$HIP_FILE"
26 |   perl -pi -e 's/(runRing<T.*?)((, (true|false))?>\()/\1, COLL_UNROLL\2/g' "$HIP_FILE"
27 |   perl -pi -e 's/(runTreeUpDown<T.*?)>\(/\1, COLL_UNROLL>(/' "$HIP_FILE"
28 |   perl -pi -e 's/(runTreeSplit<T.*?)>\(/\1, COLL_UNROLL>(/' "$HIP_FILE"
29 |   sed -i "s/\\(struct RunWorkColl<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
30 |   sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
31 | 
32 |   echo "Added COLL_UNROLL template argument to $HIP_FILE"
33 | fi


--------------------------------------------------------------------------------
/cmake/scripts/extract_metadata.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in all
11 | # copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | # SOFTWARE.
20 | 
21 | ## Script: list the objects in librccl.so with roc-obj-ls, then run
22 | ## roc-obj-extract on the entries that mention one of the selected gfx
23 | ## architectures. Failures are reported as warnings only.
24 | 
25 | ## List the objects for each gfx architecture
26 | execute_process( COMMAND roc-obj-ls librccl.so
27 |     RESULT_VARIABLE list_result
28 |     OUTPUT_VARIABLE cmd_output
29 | )
30 | 
31 | if(list_result EQUAL 0)
32 |     ## Convert cmd output to list of lines
33 |     string(REGEX REPLACE "\n$" "" cmd_output "${cmd_output}")
34 |     string(REPLACE "\n" ";" cmd_output "${cmd_output}")
35 | 
36 |     ## Extract file paths for the selected gfx archs
37 |     foreach(line ${cmd_output})
38 |         if(line MATCHES "(gfx90a|gfx942|gfx950)")
39 |             string(REGEX MATCH "\\file://(.*)" file_match ${line})
40 |             if(file_match)
41 |                 list(APPEND file_paths ${file_match})
42 |             endif()
43 |         endif()
44 |     endforeach()
45 | 
46 |     ## Extract objects from files
47 |     foreach(file ${file_paths})
48 |         execute_process(
49 |           COMMAND roc-obj-extract ${file}
50 |           RESULT_VARIABLE extraction_result
51 |         )
52 |         if(NOT extraction_result EQUAL 0)
53 |           message(WARNING "Could not extract objects from ${file}")
54 |         endif()
55 |     endforeach()
56 | else()
57 |     ## We don't want to stop building unit-tests if this command fails.
58 |     ## Print the result captured from roc-obj-ls above (list_result).
59 |     message(WARNING "Command failed with error code ${list_result}")
60 | endif()


--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
 1 | # Using RCCL/RCCL-Tests in a docker environment
 2 | 
 3 | ## Docker build
 4 | 
 5 | Assuming you have docker installed on your system:
 6 | 
 7 | ### To build the docker image :
 8 | 
 9 | By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch), targeting `gfx942` GPUs.
10 | ```shell
11 | $ docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
12 | ```
13 | 
14 | The base docker image, rccl repo, rccl-tests repo, and GPU targets can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image for the MI250 GPU:
15 | ```shell
16 | $ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull .
17 | ```
18 | 
19 | ### To start an interactive docker container on a system with AMD GPUs :
20 | 
21 | ```shell
22 | $ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash
23 | ```
24 | 
25 | ### To run rccl-tests (all\_reduce\_perf) on 8 AMD GPUs (inside the docker container) :
26 | 
27 | If using ROCm 6.3.x or earlier
28 | ```shell
29 | $ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
30 | ```
31 | 
32 | If using ROCm 6.4.0 or later
33 | ```shell
34 | $ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
35 | ```
36 | 
37 | For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests.
38 | 
39 | 
40 | ## Copyright
41 | 
42 | All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
43 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _build/
2 | _doxygen/
3 | doxygen/html
4 | doxygen/xml
5 | sphinx/_toc.yml
6 | 


--------------------------------------------------------------------------------
/docs/api-reference/api-library.rst:
--------------------------------------------------------------------------------
 1 | .. meta::
 2 |    :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
 3 |    :keywords: RCCL, ROCm, library, API
 4 | 
 5 | .. _api-library:
 6 | 
 7 | =============
 8 | API library
 9 | =============
10 | 
11 | .. doxygenindex::
12 | 


--------------------------------------------------------------------------------
/docs/attributions.rst:
--------------------------------------------------------------------------------
 1 | .. meta::
 2 |    :description: RCCL attributions information
 3 |    :keywords: RCCL, ROCm, library, API, attributions
 4 | 
 5 | .. toctree::
 6 |    :maxdepth: 4
 7 |    :caption: Attributions
 8 | 
 9 | Attributions
10 | ============
11 | 
12 | Contains contributions from NVIDIA.
13 | 
14 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
15 | 
16 | Redistribution and use in source and binary forms, with or without
17 | modification, are permitted provided that the following conditions
18 | are met:
19 | 
20 | -  Redistributions of source code must retain the above copyright
21 |    notice, this list of conditions and the following disclaimer.
22 | -  Redistributions in binary form must reproduce the above copyright
23 |    notice, this list of conditions and the following disclaimer in the
24 |    documentation and/or other materials provided with the distribution.
25 | -  Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
26 |    Laboratory, the U.S. Department of Energy, nor the names of their
27 |    contributors may be used to endorse or promote products derived
28 |    from this software without specific prior written permission.
29 | 
30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
31 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
33 | PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
35 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
36 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
37 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
38 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 | 
42 | The U.S. Department of Energy funded the development of this software
43 | under subcontract 7078610 with Lawrence Berkeley National Laboratory.
44 | 
45 | This code also includes files from the NVIDIA Tools Extension SDK project.
46 | 
47 | For more information and license details, see `https://github.com/NVIDIA/NVTX <https://github.com/NVIDIA/NVTX>`_
48 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | import subprocess
 8 | 
 9 | from rocm_docs import ROCmDocs
10 | # Version parts are extracted with sed from ../makefiles/version.mk. NOTE(review): the greedy `.*` before each capture group keeps only the trailing digit(s) — confirm this is intended.
11 | name = "RCCL"
12 | get_major = r'sed -n -e "s/^NCCL_MAJOR.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'
13 | get_minor = r'sed -n -e "s/^NCCL_MINOR.*\([0-9]\{2,\}\).*/\1/p" ../makefiles/version.mk'
14 | get_patch = r'sed -n -e "s/^NCCL_PATCH.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'
15 | major = subprocess.getoutput(get_major)
16 | minor = subprocess.getoutput(get_minor)
17 | patch = subprocess.getoutput(get_patch)
18 | version_number = f"{major}.{minor}.{patch}"
19 | 
20 | # for PDF output on Read the Docs
21 | project = f"{name} Documentation"
22 | author = "Advanced Micro Devices, Inc."
23 | copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
24 | version = version_number
25 | release = version_number
26 | 
27 | external_toc_path = "./sphinx/_toc.yml"
28 | # Build the ROCmDocs helper, run Doxygen rooted at ./doxygen, then call setup() before its settings are read back below.
29 | docs_core = ROCmDocs(f"{name} {version_number} Documentation")
30 | docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
31 | docs_core.setup()
32 | 
33 | external_projects_current_project = "rccl"
34 | # Copy every attribute named in ROCmDocs.SPHINX_VARS from docs_core into this module's globals.
35 | for sphinx_var in ROCmDocs.SPHINX_VARS:
36 |     globals()[sphinx_var] = getattr(docs_core, sphinx_var)
37 | 


--------------------------------------------------------------------------------
/docs/data/how-to/rccl-usage-tips/in-place_allreduce.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/rccl/e94b36024615b65ddcd372ac39d9e9f5f43f685f/docs/data/how-to/rccl-usage-tips/in-place_allreduce.png


--------------------------------------------------------------------------------
/docs/data/how-to/rccl-usage-tips/out-of-place_allreduce.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/rccl/e94b36024615b65ddcd372ac39d9e9f5f43f685f/docs/data/how-to/rccl-usage-tips/out-of-place_allreduce.png


--------------------------------------------------------------------------------
/docs/doxygen/mainpage.txt:
--------------------------------------------------------------------------------
 1 | /*! \mainpage RCCL Documentation
 2 | 
 3 | \tableofcontents
 4 | 
 5 | \section intro_sec Introduction
 6 | 
 7 | RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
 8 | 
 9 | The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
10 | 
11 | \section API RCCL API Contents
12 | - @ref rccl_api_version
13 | - @ref rccl_result_code
14 | - @ref rccl_config_type
15 | - @ref rccl_api_communicator
16 | - @ref rccl_api_errcheck
17 | - @ref rccl_api_comminfo
18 | - @ref rccl_api_enumerations
19 | - @ref rccl_api_custom_redop
20 | - @ref rccl_collective_api
21 | - @ref rccl_group_api
22 | - @ref msccl_api
23 | 
24 | \section full_api_file Full RCCL API File
25 | - nccl.h.in
26 | 
27 | */
28 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. meta::
 2 |    :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
 3 |    :keywords: RCCL, ROCm, library, API
 4 | 
 5 | .. _index:
 6 | 
 7 | ******************
 8 | RCCL documentation
 9 | ******************
10 | 
11 | The ROCm Communication Collectives Library (RCCL) is a stand-alone library
12 | that provides multi-GPU and multi-node collective communication primitives
13 | optimized for AMD GPUs. It uses PCIe and xGMI high-speed interconnects.
14 | To learn more, see :doc:`what-is-rccl`.
15 | 
16 | The RCCL public repository is located at `<https://github.com/ROCm/rccl>`_.
17 | 
18 | .. grid:: 2
19 |   :gutter: 3
20 | 
21 |   .. grid-item-card:: Install
22 | 
23 |     * :doc:`Installing RCCL using the install script <./install/installation>`
24 |     * :doc:`Running RCCL using Docker <./install/docker-install>`
25 |     * :doc:`Building and installing RCCL from source code <./install/building-installing>`
26 | 
27 |   .. grid-item-card:: How to
28 | 
29 |     * :doc:`Using the RCCL Tuner plugin <./how-to/using-rccl-tuner-plugin-api>`
30 |     * :doc:`Using the NCCL Net plugin <./how-to/using-nccl>`
31 |     * :doc:`Troubleshoot RCCL <./how-to/troubleshooting-rccl>`
32 |     * :doc:`RCCL usage tips <./how-to/rccl-usage-tips>`
33 | 
34 | 
35 |   .. grid-item-card:: Examples
36 | 
37 |     * `RCCL Tuner plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-tuner/example>`_
38 |     * `NCCL Net plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-net/example>`_
39 |        
40 |   .. grid-item-card:: API reference
41 | 
42 |     * :ref:`Library specification<library-specification>`
43 |     * :ref:`api-library`
44 |        
45 | To contribute to the documentation, see
46 | `Contributing to ROCm  <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
47 | 
48 | You can find licensing information on the
49 | `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
50 | 


--------------------------------------------------------------------------------
/docs/install/docker-install.rst:
--------------------------------------------------------------------------------
 1 | .. meta::
 2 |    :description: Instruction on how to install the RCCL library for collective communication primitives using Docker
 3 |    :keywords: RCCL, ROCm, library, API, install, Docker
 4 | 
 5 | .. _install-docker:
 6 | 
 7 | *****************************************
 8 | Running RCCL using Docker
 9 | *****************************************
10 | 
11 | To use Docker to run RCCL, Docker must already be installed on the system.
12 | To build the Docker image and run the container, follow these steps.
13 | 
14 | #. Build the Docker image
15 | 
16 |    By default, the Dockerfile uses ``docker.io/rocm/dev-ubuntu-22.04:latest`` as the base Docker image.
17 |    It then installs RCCL and rccl-tests (in both cases, it uses the version from the ``develop`` branch).
18 | 
19 |    Use this command to build the Docker image:
20 | 
21 |    .. code-block:: shell
22 | 
23 |       docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
24 | 
25 |    The base Docker image, rccl repository, rccl-tests repository, and GPU targets can be modified
26 |    by using ``--build-arg`` in the ``docker build`` command above. For example, to use a different base Docker image for the MI250 GPU,
27 |    use this command:
28 | 
29 |    .. code-block:: shell
30 | 
31 |       docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull .
32 | 
33 | #. Launch an interactive Docker container on a system with AMD GPUs:
34 | 
35 |    .. code-block:: shell
36 | 
37 |       docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash
38 | 
39 | To run, for example, the ``all_reduce_perf`` test from rccl-tests on 8 AMD GPUs from inside the Docker container, use this command:
40 | 
41 | If using ROCm 6.3.x or earlier:
42 | 
43 | .. code-block:: shell
44 | 
45 |    mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
46 | 
47 | If using ROCm 6.4.0 or later:
48 | 
49 | .. code-block:: shell
50 | 
51 |    mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
52 | 
53 | For more information on the rccl-tests options, see the `Usage guidelines <https://github.com/ROCm/rccl-tests#usage>`_ in the GitHub repository.
54 | 


--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 |    :description: RCCL licensing information
3 |    :keywords: RCCL, ROCm, library, API, license
4 | 
5 | License
6 | =======
7 | 
8 | .. include:: ../LICENSE.txt
9 | 


--------------------------------------------------------------------------------
/docs/sphinx/_toc.yml.in:
--------------------------------------------------------------------------------
 1 | root: index
 2 | subtrees:
 3 | 
 4 | - entries:
 5 |   - file: what-is-rccl.rst
 6 |     title: What is RCCL?
 7 | 
 8 | - caption: Install 
 9 |   entries:
10 |   - file: install/installation
11 |     title: Installation guide
12 |   - file: install/docker-install
13 |     title: Running RCCL using Docker
14 |   - file: install/building-installing
15 |     title: Building and installing from source
16 | 
17 | - caption: How to 
18 |   entries:
19 |   - file: how-to/using-rccl-tuner-plugin-api
20 |     title: Using the RCCL Tuner plugin
21 |   - file: how-to/using-nccl
22 |     title: Using the NCCL Net plugin
23 |   - file: how-to/troubleshooting-rccl
24 |     title: Troubleshoot RCCL
25 |   - file: how-to/rccl-usage-tips
26 | 
27 | - caption: Examples
28 |   entries:
29 |   - url: https://github.com/ROCm/rccl/tree/develop/ext-tuner/example
30 |     title: RCCL Tuner plugin examples
31 |   - url: https://github.com/ROCm/rccl/tree/develop/ext-net/example
32 |     title: NCCL Net plugin examples
33 | 
34 | - caption: API reference 
35 |   entries:
36 |   - file: api-reference/library-specification
37 |     title: Library specification
38 |   - file: api-reference/api-library
39 |   
40 | - caption: About
41 |   entries:
42 |   - file: license
43 |   - file: attributions
44 | 


--------------------------------------------------------------------------------
/docs/sphinx/requirements.in:
--------------------------------------------------------------------------------
1 | rocm-docs-core==1.18.2
2 | 


--------------------------------------------------------------------------------
/docs/what-is-rccl.rst:
--------------------------------------------------------------------------------
 1 | .. meta::
 2 |    :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
 3 |    :keywords: RCCL, ROCm, library, API
 4 | 
 5 | .. _what-is:
 6 | 
 7 | ******************
 8 | What is RCCL?
 9 | ******************
10 | 
11 | The ROCm Communication Collectives Library (RCCL) includes multi-GPU and
12 | multi-node collective communication primitives optimized for AMD GPUs.
13 | It implements routines such as ``all-reduce``, ``all-gather``, ``reduce``,
14 | ``broadcast``, ``reduce-scatter``, ``gather``, ``scatter``, ``all-to-allv``,
15 | and ``all-to-all``, as well as direct point-to-point (GPU-to-GPU) send
16 | and receive operations. It is optimized to achieve high bandwidth
17 | on platforms using PCIe and xGMI, and over networks using InfiniBand Verbs or TCP/IP
18 | sockets. RCCL supports an arbitrary number of GPUs installed in a single node
19 | or multiple nodes and can be used in either
20 | single- or multi-process (for example, MPI) applications.
21 | 
22 | The collective operations are implemented using ring and tree algorithms and have been optimized
23 | for throughput and latency by leveraging topology awareness, high-speed interconnects,
24 | and RDMA-based collectives. For best performance, small operations can be either
25 | batched into larger operations or aggregated through the API.
26 | 
27 | RCCL uses PCIe and xGMI high-speed interconnects for intra-node communication
28 | as well as InfiniBand, RoCE, and TCP/IP for inter-node communication.
29 | It supports an arbitrary number of GPUs installed in a single-node or
30 | multi-node platform and can easily integrate into
31 | single- or multi-process (for example, MPI) applications.


--------------------------------------------------------------------------------
/ext-net/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME:=../../build/
 7 | CUDA_HOME:=/usr/local/cuda
 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 9 | PLUGIN_SO:=libnccl-net.so
10 | 
11 | default: $(PLUGIN_SO)
12 | 
13 | $(PLUGIN_SO): plugin.c
14 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
15 | 
16 | clean:
17 | 	rm -f $(PLUGIN_SO)
18 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12 | 
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_ERR_H_
 6 | #define NCCL_ERR_H_
 7 | 
 8 | /* Error type for plugins */
 9 | typedef enum { ncclSuccess                 =  0,
10 |                ncclUnhandledCudaError      =  1,
11 |                ncclSystemError             =  2,
12 |                ncclInternalError           =  3,
13 |                ncclInvalidArgument         =  4,
14 |                ncclInvalidUsage            =  5,
15 |                ncclRemoteError             =  6 } ncclResult_t;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_H_
 6 | #define NCCL_NET_H_
 7 | 
 8 | #include <stdint.h>
 9 | #include <stdlib.h>
10 | 
11 | #include "common.h"
12 | #include "err.h"
13 | 
14 | #define NCCL_NET_HANDLE_MAXSIZE 128
15 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
16 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
17 | 
18 | #define NCCL_PTR_HOST 0x1
19 | #define NCCL_PTR_CUDA 0x2
20 | #define NCCL_PTR_DMABUF 0x4
21 | 
22 | // Maximum number of requests per comm object
23 | #define NCCL_NET_MAX_REQUESTS 32
24 | 
25 | #include "net_v9.h"
26 | #include "net_v8.h"
27 | #include "net_v7.h"
28 | #include "net_v6.h"
29 | #include "net_v5.h"
30 | #include "net_v4.h"
31 | #include "net_v3.h"
32 | #include "net_v2.h"
33 | 
34 | #endif // end include guard
35 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NET_DEVICE_H_
 8 | #define NET_DEVICE_H_
 9 | 
10 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
11 | #define NCCL_NET_MTU_SIZE                    4096
12 | 
13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
16 | 
17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
18 | 
19 | typedef struct {
20 |   ncclNetDeviceType netDeviceType; // Network offload type
21 |   int netDeviceVersion;            // Version number for network offload
22 |   void* handle;
23 |   size_t size;
24 |   int needsProxyProgress;
25 | } ncclNetDeviceHandle_v7_t;
26 | 
27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/net_v3.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_NET_V3_H_
 6 | #define NCCL_NET_V3_H_
 7 | 
 8 | #define NCCL_NET_MAX_REQUESTS_V3 16
 9 | 
10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
11 | typedef struct {  // Table of function pointers that make up the v3 net interface.
12 |   // Name of the network (mainly for logs)
13 |   const char* name;
14 |   // Initialize the network.
15 |   ncclResult_t (*init)(ncclDebugLogger_t logFunction);
16 |   // Return the number of adapters.
17 |   ncclResult_t (*devices)(int* ndev);
18 |   // Get various device properties.
19 |   ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
20 |   // Create a receiving object and provide a handle to connect to it. The
21 |   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
22 |   // between ranks to create a connection.
23 |   ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
24 |   // Connect to a handle and return a sending comm object for that peer.
25 |   ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
26 |   // Finalize connection establishment after the remote peer has called connect.
27 |   ncclResult_t (*accept)(void* listenComm, void** recvComm);
28 |   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
29 |   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
30 |   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
31 |   ncclResult_t (*deregMr)(void* comm, void* mhandle);
32 |   // Asynchronous send to a peer.
33 |   // May return request == NULL if the call cannot be performed (or would block)
34 |   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
35 |   // Asynchronous recv from a peer.
36 |   // May return request == NULL if the call cannot be performed (or would block)
37 |   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
38 |   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
39 |   // visible to the GPU
40 |   ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
41 |   // Test whether a request is complete. If size is not NULL, it returns the
42 |   // number of bytes sent/received.
43 |   ncclResult_t (*test)(void* request, int* done, int* size);
44 |   // Close and free send/recv/listen comm objects
45 |   ncclResult_t (*closeSend)(void* sendComm);
46 |   ncclResult_t (*closeRecv)(void* recvComm);
47 |   ncclResult_t (*closeListen)(void* listenComm);
48 | } ncclNet_v3_t;
49 | 
50 | #endif // end include guard
51 | 


--------------------------------------------------------------------------------
/ext-net/example/nccl/types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_TYPES_H_
 6 | #define NCCL_TYPES_H_
 7 | 
 8 | /* Data types */
 9 | typedef enum { ncclInt8       = 0, ncclChar       = 0,
10 |                ncclUint8      = 1,
11 |                ncclInt32      = 2, ncclInt        = 2,
12 |                ncclUint32     = 3,
13 |                ncclInt64      = 4,
14 |                ncclUint64     = 5,
15 |                ncclFloat16    = 6, ncclHalf       = 6,
16 |                ncclFloat32    = 7, ncclFloat      = 7,
17 |                ncclFloat64    = 8, ncclDouble     = 8,
18 |                ncclBfloat16   = 9,
19 |                ncclFloat8e4m3 = 10,
20 |                ncclFloat8e5m2 = 11,
21 | } ncclDataType_t;
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/ext-net/google-fastsocket/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME?=/usr/local/cuda
 2 | INC:=-I$(CUDA_HOME)/include
 3 | PLUGIN_SO:=libnccl-net.so
 4 | 
 5 | default: $(PLUGIN_SO)
 6 | 
 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc
 8 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
 9 | 
10 | nccl-fastsocket/*.cc:
11 | 	git clone https://github.com/google/nccl-fastsocket.git
12 | 
13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO)
14 | 
15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
16 | 	@printf "Grabbing %-35s > %s\n" $< $@
17 | 	mkdir -p $(BUILDDIR)/lib
18 | 	install -m 644 $< $@
19 | 
20 | clean:
21 | 	rm -f $(PLUGIN_SO)
22 | 	rm -Rf nccl-fastsocket
23 | 


--------------------------------------------------------------------------------
/ext-profiler/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | NCCL_HOME := ../../build
 7 | INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 8 | PLUGIN_SO := libnccl-profiler.so
 9 | 
10 | default: $(PLUGIN_SO)
11 | 
12 | $(PLUGIN_SO): plugin.c event.c print_event.c
13 | 	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
14 | 
15 | clean:
16 | 	rm -f $(PLUGIN_SO)
17 | 


--------------------------------------------------------------------------------
/ext-profiler/example/event.c:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <stdio.h>
 8 | #include "event.h"
 9 | 
10 | int taskEventQueueEmpty(struct group* g) {
11 |   return !g->eventHead;  // 1 when no event is queued (head is NULL), 0 otherwise
12 | }
13 | 
14 | void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
15 |   event->next = NULL;  // the appended event always terminates the list
16 |   if (g->eventHead == NULL) g->eventHead = event;  // empty queue: event becomes the head
17 |   else g->eventTail->next = event;  // otherwise link it behind the current tail
18 |   g->eventTail = event;  // in both cases the new event is the tail
19 | }
20 | 
21 | struct taskEventBase* taskEventQueueHead(struct group* g) {
22 |   return g->eventHead;  // peek only: returns the head pointer without modifying the queue
23 | }
24 | 
25 | struct taskEventBase* taskEventQueueDequeue(struct group* g) {  // pop and return the head event, or NULL if none
26 |   struct taskEventBase* tmp = g->eventHead;
27 |   if (tmp == NULL) return NULL;  // empty queue: return early instead of dereferencing a NULL head
28 |   if ((g->eventHead = tmp->next) == NULL) g->eventTail = NULL;  // advance head; clear the tail if that was the last event
29 |   return tmp;  // tmp->next is left untouched, as before
30 | }
31 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12 | 
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ERR_H_
 8 | #define NCCL_ERR_H_
 9 | 
10 | /* Error type for plugins */
11 | typedef enum { ncclSuccess                 =  0,
12 |                ncclUnhandledCudaError      =  1,
13 |                ncclSystemError             =  2,
14 |                ncclInternalError           =  3,
15 |                ncclInvalidArgument         =  4,
16 |                ncclInvalidUsage            =  5,
17 |                ncclRemoteError             =  6 } ncclResult_t;
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_PROFILER_H_
 8 | #define NCCL_PROFILER_H_
 9 | 
10 | #include <stdint.h>
11 | #include <stdlib.h>
12 | 
13 | #include "common.h"
14 | #include "err.h"
15 | 
16 | #include "profiler_v2.h"
17 | #include "profiler_v1.h"
18 | 
19 | #endif // end include guard
20 | 


--------------------------------------------------------------------------------
/ext-profiler/example/nccl/types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_TYPES_H_
 6 | #define NCCL_TYPES_H_
 7 | 
 8 | /* Data types */
 9 | typedef enum { ncclInt8       = 0, ncclChar       = 0,
10 |                ncclUint8      = 1,
11 |                ncclInt32      = 2, ncclInt        = 2,
12 |                ncclUint32     = 3,
13 |                ncclInt64      = 4,
14 |                ncclUint64     = 5,
15 |                ncclFloat16    = 6, ncclHalf       = 6,
16 |                ncclFloat32    = 7, ncclFloat      = 7,
17 |                ncclFloat64    = 8, ncclDouble     = 8,
18 |                ncclBfloat16   = 9,
19 | } ncclDataType_t;
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/ext-profiler/example/print_event.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PRINT_EVENT_H_
 8 | #define PRINT_EVENT_H_
 9 | 
10 | void debugEvent(void* eHandle, const char* tag);
11 | void printEvent(FILE* fh, void* handle);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/ext-src/bf16-tuning.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
 2 | index 7a2cd4a..a14dfbc 100644
 3 | --- a/apps/nccl/src/allreduce.hpp
 4 | +++ b/apps/nccl/src/allreduce.hpp
 5 | @@ -850,7 +850,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
 6 |                                                           flag++);
 7 |  #endif
 8 |    } else {
 9 | -    int nBlocks = 5*(nRanksPerNode - 1);
10 | +    int nBlocks = 8 * (nRanksPerNode - 1);
11 |      int nThreadsPerBlock = 512;
12 |      if (hieAllred && worldSize >= 8) {
13 |  	nBlocks = 20;
14 | diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
15 | index ca2c272..a6056ea 100644
16 | --- a/apps/nccl/src/common.hpp
17 | +++ b/apps/nccl/src/common.hpp
18 | @@ -17,7 +17,7 @@ constexpr int NRANKS1_PER_NODE = 4;
19 |  constexpr int NRANKS_PER_NODE = 8;
20 |  constexpr int NPEERS = 7;
21 |  
22 | -constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
23 | +constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 112;  // double buffer * 56 thread-blocks * 8 ranks * 256KB = 112MB
24 |  
25 |  __device__ mscclpp::DeviceSyncer deviceSyncer;
26 |  
27 | 


--------------------------------------------------------------------------------
/ext-src/check_ibv_access_relaxed_ordering.cc:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <infiniband/verbs.h>
3 | 
4 | int main(void) {
5 |   const enum ibv_access_flags relaxed_ordering_flag = IBV_ACCESS_RELAXED_ORDERING;  // needs verbs.h to declare the flag
6 |   printf("IBV_ACCESS_RELAXED_ORDERING: %d\n", relaxed_ordering_flag);
7 |   return 0;
8 | }
9 | 


--------------------------------------------------------------------------------
/ext-src/cpx.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/src/numa.cc b/src/numa.cc
 2 | index d72c99e..16c903d 100644
 3 | --- a/src/numa.cc
 4 | +++ b/src/numa.cc
 5 | @@ -26,6 +26,7 @@ namespace mscclpp {
 6 |  
 7 |  MSCCLPP_API_CPP int getDeviceNumaNode(int cudaDev) {
 8 |    std::string busId = getBusId(cudaDev);
 9 | +  busId[busId.length() - 1] = '0';
10 |    std::string file_str = "/sys/bus/pci/devices/" + busId + "/numa_node";
11 |    std::ifstream file(file_str);
12 |    int numaNode;
13 | 


--------------------------------------------------------------------------------
/ext-src/mscclpp_ibv_access_relaxed_ordering.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt
 2 | index a95a8e5..62b4f22 100644
 3 | --- a/CMakeLists.txt
 4 | +++ b/CMakeLists.txt
 5 | @@ -96,6 +96,24 @@ include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake)
 6 |  
 7 |  # Find ibverbs and libnuma
 8 |  find_package(IBVerbs)
 9 | +
10 | +# Check if IBV_ACCESS_RELAXED_ORDERING exists in infiniband/verbs.h
11 | +# Disable use of this symbol in mscclpp/src/ib.cc if it does not exist
12 | +if(IBVERBS_FOUND)
13 | +  try_compile(HAS_IBV_ACCESS_RELAXED_ORDERING
14 | +    ${CMAKE_BINARY_DIR}
15 | +    "${CMAKE_CURRENT_SOURCE_DIR}/../check_ibv_access_relaxed_ordering.cc"
16 | +    CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${IBVERBS_INCLUDE_DIRS}"
17 | +    OUTPUT_VARIABLE try_compile_output
18 | +  )
19 | +  message(STATUS "try_compile_output: ${try_compile_output}")
20 | +  if(NOT HAS_IBV_ACCESS_RELAXED_ORDERING)
21 | +    message(WARNING "IBV_ACCESS_RELAXED_ORDERING does not exist in ${IBVERBS_INCLUDE_DIRS}/infiniband/verbs.h. Disabling this symbol in mscclpp/src/ib.cc.")
22 | +  else()
23 | +    message(STATUS "IBV_ACCESS_RELAXED_ORDERING exists in ${IBVERBS_INCLUDE_DIRS}/infiniband/verbs.h.")
24 | +  endif()
25 | +endif()
26 | +
27 |  find_package(NUMA REQUIRED)
28 |  find_package(Threads REQUIRED)
29 |  
30 | diff --git a/src/ib.cc b/src/ib.cc
31 | index d9d72d1..bddd4a8 100644
32 | --- a/src/ib.cc
33 | +++ b/src/ib.cc
34 | @@ -48,9 +48,17 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : buff(buff) {
35 |    }
36 |    uintptr_t addr = reinterpret_cast<uintptr_t>(buff) & -pageSize;
37 |    std::size_t pages = (size + (reinterpret_cast<uintptr_t>(buff) - addr) + pageSize - 1) / pageSize;
38 | +
39 | +#if defined(HAS_IBV_ACCESS_RELAXED_ORDERING)
40 |    this->mr = IBVerbs::ibv_reg_mr2(pd, reinterpret_cast<void*>(addr), pages * pageSize,
41 |                                    IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
42 |                                        IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
43 | +#else
44 | +  this->mr = IBVerbs::ibv_reg_mr2(pd, reinterpret_cast<void*>(addr), pages * pageSize,
45 | +                                  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
46 | +                                      IBV_ACCESS_REMOTE_ATOMIC);
47 | +#endif
48 | +
49 |    if (this->mr == nullptr) {
50 |      std::stringstream err;
51 |      err << "ibv_reg_mr failed (errno " << errno << ")";
52 | 


--------------------------------------------------------------------------------
/ext-src/non-multiple-128-fix.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
 2 | index 76674ba..7a2cd4a 100644
 3 | --- a/apps/nccl/src/allreduce.hpp
 4 | +++ b/apps/nccl/src/allreduce.hpp
 5 | @@ -368,7 +368,10 @@ __global__ void __launch_bounds__(512, 1)
 6 |    const size_t chanOffset = nPeer * blockIdx.x;
 7 |    // assume (nelems * sizeof(T)) is divisible by (16 * worldSize)
 8 |    const size_t nInt4 = nelems * sizeof(T) / sizeof(int4);
 9 | -  const size_t nInt4PerRank = nInt4 / worldSize;
10 | +  size_t nInt4PerRank = nInt4 / worldSize;
11 | +  if (nInt4 % worldSize)
12 | +        nInt4PerRank = nInt4PerRank + 1;
13 | +
14 |    auto smChans = smChannels + chanOffset;
15 |    auto smOutChans = smOutChannels + chanOffset;
16 |  
17 | 


--------------------------------------------------------------------------------
/ext-src/reg-fix.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
 2 | index 5c19dc6..5fb99ef 100644
 3 | --- a/apps/nccl/src/nccl.cu
 4 | +++ b/apps/nccl/src/nccl.cu
 5 | @@ -85,6 +85,7 @@ struct ncclComm {
 6 |    std::unordered_map<channelKey, ChannelInfo> channelInInfos;
 7 |    std::unordered_map<channelKey, ChannelInfo> channelOutInfos;
 8 |    std::unordered_map<channelKey, ChannelInfo> channelScratchInfos;
 9 | +  std::unordered_map<channelKey, cudaIpcMemHandle_t> regHandles;
10 |    std::unordered_map<void*, channelKey> handleKeys;
11 |    std::shared_ptr<char> scratchBuff;
12 |    std::vector<mscclpp::RegisteredMemory> remoteScratchRegMemories;
13 | @@ -616,6 +617,11 @@ NCCL_API ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size,
14 |    p->ipcHandle = ipcHandle;
15 |    *handle = p;
16 |  
17 | +  auto regIt = comm->regHandles.find(buffKey);
18 | +  if (regIt == comm->regHandles.end()) {
19 | +        comm->regHandles[buffKey] = ipcHandle;
20 | +  }
21 | +
22 |    auto it = comm->handleKeys.find(*handle);
23 |    if (it == comm->handleKeys.end()) {
24 |       comm->handleKeys[*handle] = buffKey;
25 | @@ -642,6 +648,7 @@ NCCL_API ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle) {
26 |       if (outIt != comm->channelOutInfos.end()) {
27 |          comm->channelOutInfos.erase(outIt);
28 |       }
29 | +     comm->regHandles.erase(buffKey);
30 |       comm->handleKeys.erase(handle);
31 |       free(handle);
32 |    }
33 | @@ -655,8 +662,8 @@ bool mscclpp_BuffIsRegistered(ncclComm_t comm, const void* buff){
34 |    CUdeviceptr buffBasePtr;
35 |    MSCCLPP_CUTHROW(cuMemGetAddressRange(&buffBasePtr, &buffBytes, (CUdeviceptr)buff));
36 |    channelKey buffKey{(void*)buffBasePtr, buffBytes};
37 | -  auto buffIt = comm->channelScratchInfos.find(buffKey);
38 | -  bool registered =  buffIt != comm->channelScratchInfos.end();
39 | +  auto buffIt = comm->regHandles.find(buffKey);
40 | +  bool registered =  buffIt != comm->regHandles.end();
41 |    return registered;
42 |  }
43 |  size_t
44 | 


--------------------------------------------------------------------------------
/ext-src/remove-clip.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
 2 | index fac105a..9ef93ce 100644
 3 | --- a/apps/nccl/src/allreduce.hpp
 4 | +++ b/apps/nccl/src/allreduce.hpp
 5 | @@ -71,17 +71,29 @@ __forceinline__ __device__ __bfloat162 clip(__bfloat162 val) {
 6 |  
 7 |  template <typename T>
 8 |  __forceinline__ __device__ T add_elements(T a, T b) {
 9 | -  return clip(a + b);
10 | +  #ifdef MSCCLPP_CLIP_ENABLED
11 | +    return clip(a + b);
12 | +  #else
13 | +    return a + b;
14 | +  #endif
15 |  }
16 |  
17 |  template <>
18 |  __forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) {
19 | -  return clip(__hadd2(a, b));
20 | +  #ifdef MSCCLPP_CLIP_ENABLED
21 | +    return clip(__hadd2(a, b));
22 | +  #else
23 | +    return __hadd2(a, b);
24 | +  #endif
25 |  }
26 |  
27 |  template <>
28 |  __forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) {
29 | -  return clip(__hadd2(a, b));
30 | +  #ifdef MSCCLPP_CLIP_ENABLED
31 | +    return clip(__hadd2(a, b));
32 | +  #else
33 | +    return __hadd2(a, b);
34 | +  #endif
35 |  }
36 |  
37 |  template <typename T>
38 | @@ -558,7 +570,7 @@ __global__ void __launch_bounds__(512, 1)
39 |  
40 |  
41 |  template <typename T>
42 | -__global__ void __launch_bounds__(512, 1)
43 | +__global__ void __launch_bounds__(1024, 1)
44 |      allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
45 |                 mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels, size_t channelOutDataOffset,
46 |                 size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
47 | @@ -1045,6 +1057,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
48 |               allreduce8Mod<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smScrChannels,
49 |                         channelScratchOffset, rank, nRanksPerNode, worldSize, nelems);
50 |  	 } else {
51 | +             nThreadsPerBlock = std::is_same<T,__bfloat16>::value ? 1024 : nThreadsPerBlock;
52 |               allreduce8<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smScrChannels,
53 |  		smOutChannels, channelOutOffset, channelScratchOffset, rank, nRanksPerNode,
54 |                     worldSize, nelems);
55 | 


--------------------------------------------------------------------------------
/ext-tuner/example/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | RCCL_HOME:=../../build/release
 7 | HIP_HOME:=/opt/rocm
 8 | INC:= -I$(RCCL_HOME)/include/  -I$(HIP_HOME)/include/ -D__HIP_PLATFORM_AMD__ -Inccl
 9 | PLUGIN_SO:=libnccl-tuner.so
10 | 
11 | default: $(PLUGIN_SO)
12 | 
13 | $(PLUGIN_SO): plugin.c
14 | 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
15 | 
16 | clean:
17 | 	rm -f $(PLUGIN_SO)
18 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12 | 
13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/ext-tuner/example/nccl/err.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  */
 4 | 
 5 | #ifndef NCCL_ERR_H_
 6 | #define NCCL_ERR_H_
 7 | 
 8 | /* Error type for plugins */
 9 | typedef enum { ncclSuccess                 =  0,
10 |                ncclUnhandledCudaError      =  1,
11 |                ncclSystemError             =  2,
12 |                ncclInternalError           =  3,
13 |                ncclInvalidArgument         =  4,
14 |                ncclInvalidUsage            =  5,
15 |                ncclRemoteError             =  6 } ncclResult_t;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/makefiles/formatting.mk:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
 8 | # As this file defines a new target (format), it should be included at least after the definition of the
 9 | # default target.
10 | 
11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
12 | ASTYLEDIR := $(BUILDDIR)/contrib
13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
16 | ASTYLEVER := 3.1
17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
18 | 
19 | $(ASTYLEDIR) :
20 | 	@mkdir -p $(ASTYLEDIR)
21 | 
22 | $(ASTYLETAR) : | $(ASTYLEDIR)  # order-only: the directory mtime changes whenever a file is added to it
23 | 	@wget -q -O $(ASTYLETAR) $(ASTYLEURL) || { rm -f $(ASTYLETAR); exit 1; }
24 | 
25 | $(ASTYLEBLD) : $(ASTYLETAR)
26 | 	@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
27 | 
28 | $(ASTYLEBIN) : $(ASTYLEBLD)
29 | 	${MAKE} -C $(ASTYLEBLD)
30 | 
31 | .PHONY : format
32 | format : $(ASTYLEBIN)
33 | 	@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
34 | 


--------------------------------------------------------------------------------
/makefiles/version.mk:
--------------------------------------------------------------------------------
1 | ##### version
2 | NCCL_MAJOR   := 2
3 | NCCL_MINOR   := 25
4 | NCCL_PATCH   := 1
5 | NCCL_SUFFIX  :=
6 | PKG_REVISION := 1
7 | 


--------------------------------------------------------------------------------
/pkg/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | .PHONY : default all prep build clean  # none of these goals create a file of that name
 7 | 
 8 | default : build
 9 | build : debian.build txz.build
10 | 
11 | BUILDDIR ?= $(abspath ../build)
12 | ABSBUILDDIR := $(abspath $(BUILDDIR))
13 | TARGETS := debian txz
14 | all:   ${TARGETS:%=%.build}
15 | prep:  ${TARGETS:%=%.prep}
16 | build: ${TARGETS:%=%.build}
17 | clean: ${TARGETS:%=%.clean}
18 | 
19 | %.prep:
20 | 	${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
21 | 
22 | %.build:
23 | 	${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
24 | 
25 | %.clean:
26 | 	${MAKE} -C $* clean
27 | 


--------------------------------------------------------------------------------
/pkg/debian/.gitignore:
--------------------------------------------------------------------------------
1 | /*.debhelper.log
2 | /*.debhelper
3 | /*.substvars
4 | /tmp/
5 | /files
6 | /libnccl1/
7 | /libnccl-dev/
8 | 


--------------------------------------------------------------------------------
/pkg/debian/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | DEBPREPDIR := $(BUILDDIR)/debian
11 | PKGDIR  := $(BUILDDIR)/pkg/deb/
12 | 
13 | DEBGEN_IN  := $(wildcard *.in)
14 | DEBGEN     := $(DEBGEN_IN:.in=)
15 | DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
17 | 
18 | PKG_TIMESTAMP  := $(shell date -R)
19 | PKG_ARCH       ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
20 | PKG_MULTIARCH  ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
21 | 
22 | prep : $(DEBTARGETS)
23 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
24 | 
25 | build : prep
26 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
27 | 	@printf "Building Debian package\n"
28 | 	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
29 | 	mkdir -p $(PKGDIR)
30 | 	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
31 | 
32 | clean:
33 | 	rm -Rf $(DEBPREPDIR) $(PKGDIR)
34 | 
35 | $(DEBPREPDIR)/% : %.in
36 | 	@printf "Generating %-35s > %s\n" $< $@
37 | 	mkdir -p $(DEBPREPDIR)
38 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
39 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
40 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
41 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
42 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
43 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
44 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
45 | 	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
46 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
47 | 	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
48 | 	    $< > $@
49 | 
50 | $(DEBPREPDIR)/% : %
51 | 	@printf "Grabbing   %-35s > %s\n" $< $@
52 | 	mkdir -p $(DEBPREPDIR)
53 | 	cp -f $< $@
54 | 


--------------------------------------------------------------------------------
/pkg/debian/changelog.in:
--------------------------------------------------------------------------------
1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
2 | 
3 |   * Automatic Debian package from build
4 | 
5 |  -- cudatools <cudatools@nvidia.com>  ${pkg:Timestamp}
6 | 


--------------------------------------------------------------------------------
/pkg/debian/compat:
--------------------------------------------------------------------------------
1 | 9
2 | 


--------------------------------------------------------------------------------
/pkg/debian/control.in:
--------------------------------------------------------------------------------
 1 | Source: nccl
 2 | Section: libs
 3 | Maintainer: cudatools <cudatools@nvidia.com>
 4 | Priority: optional
 5 | Build-depends: debhelper(>=9)
 6 | Standards-Version: 3.9.5
 7 | 
 8 | Package: libnccl${nccl:Major}
 9 | Section: libs
10 | Architecture: ${pkg:Arch}
11 | Depends: ${misc:Depends}, ${shlibs:Depends}
12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime
13 |  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
14 |  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
15 |  broadcast, and reduce-scatter.
16 |  It has been optimized to achieve high bandwidth on any platform using PCIe,
17 |  NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
18 |  sockets.
19 | 
20 | Package: libnccl-dev
21 | Section: libdevel
22 | Architecture: ${pkg:Arch}
23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files
25 |  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
26 |  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
27 |  broadcast, and reduce-scatter.
28 |  It has been optimized to achieve high bandwidth on any platform using PCIe,
29 |  NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
30 |  sockets.
31 | 


--------------------------------------------------------------------------------
/pkg/debian/copyright:
--------------------------------------------------------------------------------
1 | ../../LICENSE.txt


--------------------------------------------------------------------------------
/pkg/debian/gbp.conf:
--------------------------------------------------------------------------------
 1 | [DEFAULT]
 2 | debian-branch   = master
 3 | upstream-branch = master
 4 | 
 5 | ignore-new = True
 6 | 
 7 | [git-buildpackage]
 8 | 
 9 | no-purge = True
10 | 


--------------------------------------------------------------------------------
/pkg/debian/libnccl-dev.install.in:
--------------------------------------------------------------------------------
1 | bin/ncclras /usr/bin
2 | include/nccl.h /usr/include
3 | lib/libnccl.so /usr/lib/${pkg:MultiArch}
4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
5 | 


--------------------------------------------------------------------------------
/pkg/debian/libnccl2.install.in:
--------------------------------------------------------------------------------
1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
3 | 


--------------------------------------------------------------------------------
/pkg/debian/rules:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | 
 3 | %:
 4 | 	dh $@ --parallel
 5 | 
 6 | override_dh_auto_install:
 7 | 	PREFIX=debian/tmp dh_auto_install
 8 | 
 9 | override_dh_auto_test:
10 | 	# Do not make test
11 | 
12 | override_dh_auto_clean:
13 | 	# Do not make clean
14 | 
15 | override_dh_builddeb:
16 | 	dh_builddeb -- -Zxz
17 | 


--------------------------------------------------------------------------------
/pkg/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (native)
2 | 


--------------------------------------------------------------------------------
/pkg/redhat/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | RPMPREPDIR := $(BUILDDIR)/redhat
11 | PKGDIR  := $(BUILDDIR)/pkg/rpm/
12 | 
13 | RPMGEN_IN  := $(wildcard *.in)
14 | RPMGEN     := $(RPMGEN_IN:.in=)
15 | RPMFILES   := $(RPMGEN)
16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
17 | 
18 | PKG_TIMESTAMP  := $(shell date -R)
19 | ARCH           := $(shell uname -m)
20 | PKG_ARCH       ?= $(shell uname -m)
21 | PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
22 | ifeq ($(PKG_MULTIARCH),)
23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
24 | PKG_MULTIARCH  := $(ARCH)-linux-gnu
25 | endif
26 | 
27 | prep : $(RPMTARGETS)
28 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
29 | 
30 | build : prep
31 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
32 | 	$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
33 | 	@printf "Building Redhat package\n"
34 | 	mkdir -p $(PKGDIR)
35 | 	rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
36 |                  --define "_rpmdir $(PKGDIR)" \
37 |                  --define "_builddir $(PKGDIR)/build/" \
38 |                  --define "_buildrootdir $(PKGDIR)/buildroot/" \
39 |                  -bb $(BUILDDIR)/redhat/nccl.spec
40 | 
41 | clean:
42 | 	rm -Rf $(RPMPREPDIR) $(PKGDIR)
43 | 
44 | $(RPMPREPDIR)/% : %.in
45 | 	@printf "Generating %-35s > %s\n" $< $@
46 | 	mkdir -p $(RPMPREPDIR)
47 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
48 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
49 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
50 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
51 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
52 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
53 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
54 | 	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
55 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
56 | 	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
57 | 	    $< > $@
58 | 
59 | $(RPMPREPDIR)/% : %
60 | 	@printf "Grabbing   %-35s > %s\n" $< $@
61 | 	mkdir -p $(RPMPREPDIR)
62 | 	cp -f $< $@
63 | 


--------------------------------------------------------------------------------
/pkg/srctxz/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | TXZPREPDIR  := $(BUILDDIR)/srctxz
11 | PKGDIR  := $(BUILDDIR)/pkg/srctxz/
12 | 
13 | TXZGEN_IN  := $(wildcard *.in)
14 | TXZGEN     := $(TXZGEN_IN:.in=)
15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
16 | 
17 | PKG_REVISION   ?= 3
18 | PKG_ARCH       := $(shell uname -m)
19 | 
20 | prep: $(TXZTARGETS)
21 | 
22 | build: prep
23 | 	$(MAKE) -C ../../src clean
24 | 	@printf "Building source tar.xz package\n"
25 | 	(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
26 | 	mkdir -p $(PKGDIR)
27 | 	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
28 | 
29 | clean:
30 | 	rm -Rf $(TXZPREPDIR) $(PKGDIR)
31 | 
32 | $(TXZPREPDIR)/% : %.in
33 | 	@printf "Generating %-35s > %s\n" $< $@
34 | 	mkdir -p $(TXZPREPDIR)
35 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
36 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
37 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
38 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
39 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
40 | 	    $< > $@
41 | 


--------------------------------------------------------------------------------
/pkg/srctxz/create_srctxz.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # See LICENSE.txt for license information
 6 | #
 7 | 
 8 | # To run from $BUILDDIR/
 9 | 
10 | cd .. || exit 1  # never run the git commands below from the wrong directory
11 | NCCLDIR=$(basename "$PWD")
12 | 
13 | echo "Checking for unclean directory ..."
14 | git clean -x -i
15 | echo "Clean done."
16 | echo "Checking for uncommitted files ..."
17 | if [ -n "$(git status -s)" ]; then  # any output means the tree has pending changes
18 |   git status -s
19 |   echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
20 |   read
21 | fi
22 | 
23 | cd ..
24 | NCCL_MAJOR=${nccl:Major}
25 | NCCL_MINOR=${nccl:Minor}
26 | NCCL_PATCH=${nccl:Patch}
27 | NCCL_SUFFIX=${nccl:Suffix}
28 | NCCL_BUILD=${pkg:Revision}
29 | 
30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
31 | 
32 | tar --exclude build \
33 |     --exclude ".git*" \
34 |     --exclude pkg/srctxz \
35 |     --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
36 | 


--------------------------------------------------------------------------------
/pkg/txz/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include ../../makefiles/common.mk
 8 | include ../../makefiles/version.mk
 9 | BUILDDIR ?= $(abspath ../../build)
10 | TXZPREPDIR  := $(BUILDDIR)/txz
11 | PKGDIR  := $(BUILDDIR)/pkg/txz/
12 | 
13 | TXZGEN_IN  := $(wildcard *.in)
14 | TXZGEN     := $(TXZGEN_IN:.in=)
15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
16 | 
17 | PKG_ARCH   := $(shell uname -m)
18 | 
19 | prep: $(TXZTARGETS)
20 | 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
21 | 
22 | build: prep
23 | 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
24 | 	@printf "Building tar.xz package\n"
25 | 	(cd $(BUILDDIR); bash txz/create_txz.sh)
26 | 	mkdir -p $(PKGDIR)
27 | 	mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
28 | 
29 | clean:
30 | 	rm -Rf $(TXZPREPDIR) $(PKGDIR)
31 | 
32 | $(TXZPREPDIR)/% : %.in
33 | 	@printf "Generating %-35s > %s\n" $< $@
34 | 	mkdir -p $(TXZPREPDIR)
35 | 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
36 | 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
37 | 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
38 | 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
39 | 	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
40 | 	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
41 | 	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
42 | 	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
43 | 	    $< > $@
44 | 


--------------------------------------------------------------------------------
/pkg/txz/create_txz.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # See LICENSE.txt for license information
 6 | #
 7 | 
 8 | # To run from $BUILDDIR/
 9 | 
10 | BUILDDIR=`basename $PWD`
11 | 
12 | cd ..
13 | NCCL_MAJOR=${nccl:Major}
14 | NCCL_MINOR=${nccl:Minor}
15 | NCCL_PATCH=${nccl:Patch}
16 | NCCL_SUFFIX=${nccl:Suffix}
17 | CUDA_MAJOR=${cuda:Major}
18 | CUDA_MINOR=${cuda:Minor}
19 | PKG_REVISION=${pkg:Revision}
20 | PKG_ARCH=${pkg:Arch}
21 | 
22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
23 | 
24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
25 | 


--------------------------------------------------------------------------------
/rtest.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <testset>
 3 |     <!-- Typically run with environment variables NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 -->
 4 |     <var name="GTEST_FILTER">rccl-UnitTests  --gtest_color=yes --gtest_filter=</var>
 5 |     <test sets="psdb">
 6 |         <run name="all-psdb">{GTEST_FILTER}*sum_float32* --gtest_output=xml:output_psdb.xml </run>
 7 |     </test>
 8 |     <test sets="osdb">
 9 |         <run name="all-osdb">{GTEST_FILTER}* --gtest_output=xml:output_osdb.xml </run>
10 |     </test>
11 | </testset>
12 | 


--------------------------------------------------------------------------------
/src/device/common.cu:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "device.h"
 8 | #include "collectives.h"
 9 | #include "common.h"
10 | 
11 | __shared__ ncclShmemData ncclShmem;
12 | #if __CUDA_ARCH__ < 700
13 |   __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
14 | #endif
15 | 
16 | struct RunWorkNop {
17 |   __device__ void run() {}
18 | };
19 | 
20 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
21 |   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
22 | }
23 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
24 |   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
25 | }
26 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
27 |   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
28 | }
29 | #ifdef ENABLE_COLLTRACE
30 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
31 |   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
32 | }
33 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
34 |   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
35 | }
36 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
37 |   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&args4K.args);
38 | }
39 | #endif
40 | 
41 | #ifdef USE_INDIRECT_FUNCTION_CALL
42 | __device__ void ncclDevFunc_Nop();
43 | #else
44 | __device__ __attribute__((noinline)) void ncclDevFunc_Nop();
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/device/network/unpack/unpack_defs.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, Google LLC.  All rights reserved.
 3 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | #ifndef NET_DEVICE_UNPACK_DEFS_H
 8 | #define NET_DEVICE_UNPACK_DEFS_H
 9 | 
10 | #include <stdint.h>
11 | 
12 | #include "device.h"
13 | 
14 | #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16
15 | 
16 | union alignas(16) loadMeta {
17 |   uint64_t r64[2];
18 |   struct {
19 |     uint32_t src_off;
20 |     uint32_t len;
21 |     uint64_t dst_off;
22 |   };
23 | };
24 | static_assert(sizeof(union loadMeta) == 16, "Must be 16-byte aligned");
25 | 
26 | /****** global memory ******/
27 | 
28 | #define NET_UNPACK_MAX_QUEUE_DEPTH 16  // MAX_REQUESTS
29 | #define NET_UNPACK_MAX_SLICE_SIZE 4194304  // 4MB per Irecv call
30 | #define SLICE_PAGE_SIZE 4096
31 | #define NET_UNPACK_MAX_SLICE_PAGES \
32 |   (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2)  // * 2 for slack, wasteful..
33 | 
34 | struct netUnpackMeta {
35 |   loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES];
36 |   uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH];
37 | };
38 | 
39 | struct unpackNetDeviceHandle {
40 |   struct netUnpackMeta *meta;  // mapped
41 |   void* bounce_buf;
42 |   uint64_t head;
43 | };
44 | 
45 | /****** shared memory ******/
46 | 
47 | #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h
48 | #define NET_UNPACK_MAX_NPEERS 2  // The most you should have is 2 network peers per-group (indexed by index)
49 | #define WARP_SHM_PAGE_CNT 4
50 | #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta))
51 | struct unpackShmem {
52 |   void* bounce_buf;
53 | };
54 | 
55 | struct unpackGroupShmem {
56 |   int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
57 |   uint64_t head[NET_UNPACK_MAX_NPEERS];
58 |   struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
59 | };
60 | 
61 | #endif // NET_DEVICE_UNPACK_DEFS_H
62 | 


--------------------------------------------------------------------------------
/src/enhcompat.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
 8 | 
 9 | enum cudaError_t { cudaErrorStubLibrary = 34 };
10 | 
11 | extern "C" {
12 | 
13 | cudaError_t cudaStreamGetCaptureInfo_v2(...)         __attribute__((visibility("hidden"))) __attribute((weak));
14 | cudaError_t cudaStreamGetCaptureInfo_v2(...)         { return cudaErrorStubLibrary; }
15 | 
16 | cudaError_t cudaUserObjectCreate(...)                __attribute__((visibility("hidden"))) __attribute((weak));
17 | cudaError_t cudaUserObjectCreate(...)                { return cudaErrorStubLibrary; }
18 | 
19 | cudaError_t cudaGraphRetainUserObject(...)           __attribute__((visibility("hidden"))) __attribute((weak));
20 | cudaError_t cudaGraphRetainUserObject(...)           { return cudaErrorStubLibrary; }
21 | 
22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
24 | 
25 | cudaError_t cudaGetDriverEntryPoint(...)             __attribute__((visibility("hidden"))) __attribute((weak));
26 | cudaError_t cudaGetDriverEntryPoint(...)             { return cudaErrorStubLibrary; }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/src/graph/rings.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "core.h"
 8 | 
 9 | void dumpLine(int* values, int nranks, const char* prefix) {
10 |   // Render "<prefix> v0 v1 ..." into a fixed-size buffer and log it. Each
11 |   // value is padded to the printed width of the largest index (nranks-1).
12 |   constexpr int kBufLen = 128;
13 |   char buf[kBufLen];
14 |   const int fieldWidth = snprintf(nullptr, 0, "%d", nranks-1);  // size query only, writes nothing
15 |   int used = snprintf(buf, kBufLen, "%s", prefix);
16 |   // "used" accumulates the length snprintf wanted to write, so it may end
17 |   // up past kBufLen-1; the loop guard keeps it from being used as an index.
18 |   for (int idx = 0; idx < nranks && used < kBufLen-1; ++idx)
19 |     used += snprintf(buf + used, kBufLen - used, " %*d", fieldWidth, values[idx]);
20 |   if (used >= kBufLen) {
21 |     // The output was truncated: overwrite the tail with "..." to show it
22 |     // (kBufLen is assumed to be at least 4).
23 |     snprintf(buf + kBufLen - 4, 4, "...");
24 |   }
25 |   INFO(NCCL_INIT, "%s", buf);
26 | }
27 | 
28 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
29 |   // For each ring, walk the "next" table starting at this rank and store
30 |   // the visiting order in rings[r*nranks .. r*nranks+nranks-1].
31 |   // Returns ncclInternalError if a walk does not come back to "rank" after
32 |   // nranks hops, or if some rank in [0, nranks) is absent from a ring.
33 |   // NOTE: "prev" is not read by this function.
34 |   for (int ringIdx = 0; ringIdx < nrings; ++ringIdx) {
35 |     int* ring = rings + ringIdx*nranks;
36 |     const int* succ = next + ringIdx*nranks;
37 |     // Follow successors for nranks hops, recording each rank visited.
38 |     int cursor = rank;
39 |     for (int step = 0; step < nranks; ++step) {
40 |       ring[step] = cursor;
41 |       cursor = succ[cursor];
42 |     }
43 |     char label[40];
44 |     snprintf(label, sizeof(label), "Channel %02d/%02d :", ringIdx, nrings);
45 |     if (rank == 0) dumpLine(ring, nranks, label);
46 |     if (cursor != rank) {
47 |       WARN("Error : ring %d does not loop back to start (%d != %d)", ringIdx, cursor, rank);
48 |       return ncclInternalError;
49 |     }
50 |     // Every rank must occur somewhere in this ring.
51 |     for (int wanted = 0; wanted < nranks; ++wanted) {
52 |       bool present = false;
53 |       for (int pos = 0; pos < nranks && !present; ++pos) {
54 |         present = (ring[pos] == wanted);
55 |       }
56 |       if (!present) {
57 |         WARN("Error : ring %d does not contain rank %d", ringIdx, wanted);
58 |         return ncclInternalError;
59 |       }
60 |     }
61 |   }
62 |   return ncclSuccess;
63 | }
64 | 


--------------------------------------------------------------------------------
/src/graph/rings.h:
--------------------------------------------------------------------------------
1 | /*************************************************************************
2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * See LICENSE.txt for license information
5 |  ************************************************************************/
6 | 
7 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
8 | 


--------------------------------------------------------------------------------
/src/graph/rome_models.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 3 | Copyright (c) 2024 GigaIO Networks, Inc. All rights reserved.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | */
23 | #ifndef RCCL_ROME_MODELS_H_
24 | #define RCCL_ROME_MODELS_H_
25 | 
26 | ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int* net_map, int reverse);
27 | ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map);
28 | ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph2, const char *ringBase);
29 | ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
30 | ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
31 | ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
32 | ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, const char *ringBase);
33 | ncclResult_t parseGIOTopos(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/src/include/alt_rsmi.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef __ALT_RSMI_H__
 8 | #define __ALT_RSMI_H__
 9 | 
10 | /*
11 | ** This is a light-weight implementation of the RSMI functionality used in RCCL
12 | ** The code is based on the actual rocm_smi_library code, but extracted to contain only
13 | ** the bits actually required by RCCL.
14 | */
15 | 
16 | #include <stdio.h>
17 | #include <dirent.h>
18 | #include <sys/types.h>
19 | #include <sys/stat.h>
20 | #include <unistd.h>
21 | 
22 | #include <iostream>
23 | #include <fstream>
24 | #include <sstream>
25 | #include <cstring>
26 | #include <map>
27 | #include <cassert>
28 | #include <algorithm>
29 | #include <iomanip>
30 | 
31 | /**
32 |  ** This is an exact copy of the IO Link types from rocm_smi.h.
33 |  ** The definitions are duplicated here because this code may be compiled
34 |  ** without including rocm_smi.h. The values must nevertheless remain
35 |  ** identical to those in rocm_smi.h.
36 |  */
37 | typedef enum _ARSMI_IO_LINK_TYPE {
38 |   ARSMI_IOLINK_TYPE_UNDEFINED      = 0,          //!< unknown type.
39 |   ARSMI_IOLINK_TYPE_PCIEXPRESS,                  //!< PCI Express
40 |   ARSMI_IOLINK_TYPE_XGMI,                        //!< XGMI
41 |   ARSMI_IOLINK_TYPE_NUMIOLINKTYPES,              //!< Number of IO Link types
42 |   ARSMI_IOLINK_TYPE_SIZE           = 0xFFFFFFFF  //!< Max of IO Link types
43 | } ARSMI_IO_LINK_TYPE;
44 | 
45 | struct ARSMI_linkInfo {
46 |     uint32_t src_node;
47 |     uint32_t dst_node;
48 |     uint64_t hops;
49 |     ARSMI_IO_LINK_TYPE type;
50 |     uint64_t weight;
51 |     uint64_t min_bandwidth;
52 |     uint64_t max_bandwidth;
53 | };
54 | typedef struct ARSMI_linkInfo ARSMI_linkInfo;
55 | 
56 | int ARSMI_init (void);
57 | int ARSMI_get_num_devices (uint32_t *num_devices);
58 | int ARSMI_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
59 | int ARSMI_topo_get_link_info(uint32_t dv_ind_src, uint32_t dv_ind_dst,
60 |                              ARSMI_linkInfo *info);
61 | 
62 | #endif
63 | 


--------------------------------------------------------------------------------
/src/include/archinfo.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #ifndef ARCHINFO_H_
24 | #define ARCHINFO_H_
25 | 
26 | #include <string.h>
27 | 
28 | /*
29 | #include <hip/hip_runtime_api.h>
30 | #include <hip/hip_runtime.h>
31 | */
32 | 
33 | void GcnArchNameFormat(char *gcnArchName, char* out);
34 | void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName);
35 | int GetGcnArchName(int deviceId, char* out);
36 | double GetDeviceWallClockRateInKhz(int deviceId);
37 | bool IsArchMatch(char const* arch, char const* target);
38 | 
39 | #endif // ARCHINFO_H_
40 | 


--------------------------------------------------------------------------------
/src/include/argcheck.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ARGCHECK_H_
 8 | #define NCCL_ARGCHECK_H_
 9 | 
10 | #include "core.h"
11 | #include "info.h"
12 | 
13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
14 | ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname);
15 | ncclResult_t ArgsCheck(struct ncclInfo* info);
16 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/include/bootstrap.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_BOOTSTRAP_H_
 8 | #define NCCL_BOOTSTRAP_H_
 9 | 
10 | #include "nccl.h"
11 | #include "comm.h"
12 | 
13 | struct ncclBootstrapHandle {  // must not be larger than ncclUniqueId; the static_assert after the struct enforces this
14 |   uint64_t magic;
15 |   union ncclSocketAddress addr;
16 | };
17 | static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
18 | 
19 | ncclResult_t bootstrapNetInit();
20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
22 | ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
23 | ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
25 | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
27 | ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag);
28 | ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size);
29 | ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
30 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
31 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
32 | ncclResult_t bootstrapClose(void* commState);
33 | ncclResult_t bootstrapAbort(void* commState);
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/include/channel.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CHANNEL_H_
 8 | #define NCCL_CHANNEL_H_
 9 | #include "comm.h"
10 | #include "utils.h"
11 | 
12 | #include <algorithm>
13 | 
14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid);
15 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
16 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
17 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
18 | 
19 | inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) {
20 |   // Single-node communicators use the round number directly (truncated to 8 bits).
21 |   if (!(comm->nNodes > 1)) return p2pRound & 0xff;
22 |   // Multi-node: every whole multiple of maxLocalRanks in the round advances the base by
23 |   // divUp(maxLocalRanks, batch); the remainder selects a batch-sized slot after that.
24 |   int nodePart = p2pRound/comm->maxLocalRanks;
25 |   nodePart *= divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
26 |   int localPart = p2pRound%comm->maxLocalRanks;
27 |   localPart /= NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
28 |   return (nodePart + localPart) & 0xff;
29 | }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/include/core.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_CORE_H_
 9 | #define NCCL_CORE_H_
10 | 
11 | #include <pthread.h>
12 | #include <unistd.h>
13 | #include <stdlib.h>
14 | #include <stdint.h>
15 | #include <algorithm> // For std::min/std::max
16 | #include "nccl.h"
17 | 
18 | #ifdef PROFAPI  // PROFAPI build: also declare p<func> as an alias of func, and mark func itself weak
19 | #define NCCL_API(ret, func, args...)        \
20 |     __attribute__ ((visibility("default"))) \
21 |     __attribute__ ((alias(#func)))          \
22 |     ret p##func (args);                     \
23 |     extern "C"                              \
24 |     __attribute__ ((visibility("default"))) \
25 |     __attribute__ ((weak))                  \
26 |     ret func(args)
27 | #else  // default build: only declare func with C linkage and default visibility
28 | #define NCCL_API(ret, func, args...)        \
29 |     extern "C"                              \
30 |     __attribute__ ((visibility("default"))) \
31 |     ret func(args)
32 | #endif // end PROFAPI
33 | 
34 | #include "debug.h"
35 | #include "checks.h"
36 | #include "rocmwrap.h"
37 | #include "alloc.h"
38 | #include "utils.h"
39 | #include "param.h"
40 | #ifdef NVTX_NO_IMPL
41 | #include "nvtx_stub.h"
42 | #else
43 | #include "nvtx.h"
44 | #endif
45 | 
46 | #endif // end include guard
47 | 


--------------------------------------------------------------------------------
/src/include/cpuset.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_CPUSET_H_
 8 | #define NCCL_CPUSET_H_
 9 | 
10 | // Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
11 | 
12 | static int hexToInt(char c) {
13 |   // Decode one lowercase hexadecimal digit ('0'-'9', 'a'-'f'); returns -1 otherwise.
14 |   // Explicit range checks: computing 10 + c - 'a' for every c > '9' would also map
15 |   // 'W'..'`' onto 0..9 and accept them as digits.
16 |   if (c >= '0' && c <= '9') return c - '0';
17 |   return (c >= 'a' && c <= 'f') ? 10 + (c - 'a') : -1;
18 | }
19 | 
20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
21 | 
22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
23 |   // Parse comma-separated 32-bit hex groups (first group = highest word) into mask.
24 |   uint32_t cpumasks[CPU_SET_N_U32];
25 |   int m = CPU_SET_N_U32-1;  // groups are stored from the last slot downwards
26 |   cpumasks[m] = 0;
27 |   for (const char* p = str; *p != '\0'; p++) {
28 |     if (*p == ',') {
29 |       // More groups than cpu_set_t can hold: reject instead of writing before cpumasks[0].
30 |       if (m == 0) return ncclInvalidArgument;
31 |       cpumasks[--m] = 0;
32 |     } else {
33 |       int v = hexToInt(*p);
34 |       if (v == -1) break;  // parsing stops at the first character that is not a hex digit
35 |       cpumasks[m] = (cpumasks[m] << 4) + v;
36 |     }
37 |   }
38 |   // Copy so the last parsed group lands in word 0; words beyond the parsed groups are not written.
39 |   for (int a=0; m<(int)CPU_SET_N_U32; a++,m++) {
40 |     memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
41 |   }
42 |   return ncclSuccess;
43 | }
44 | 
45 | static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
46 |   // Emit the mask as hex bytes, highest-addressed byte first, with ',' after every byte index divisible by 4.
47 |   const uint8_t* bytes = (const uint8_t*)mask;
48 |   int len = 0;
49 |   for (int idx = (int)sizeof(cpu_set_t) - 1; idx >= 0; idx--) {
50 |     const bool leadingZero = (len == 0) && (bytes[idx] == 0);  // nothing printed yet and byte is 0
51 |     if (!leadingZero) {
52 |       len += sprintf(str + len, "%02x", bytes[idx]);
53 |       const bool groupBoundary = (idx != 0) && (idx % 4 == 0);
54 |       if (groupBoundary) len += sprintf(str + len, ",");
55 |     }
56 |   }
57 |   str[len] = '\0';
58 |   return ncclSuccess;
59 | }
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/src/include/debug.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INT_DEBUG_H_
 8 | #define NCCL_INT_DEBUG_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_common.h"
12 | #include <stdio.h>
13 | 
14 | #include <pthread.h>
15 | 
16 | // Conform to pthread and NVTX standard
17 | #define NCCL_THREAD_NAMELEN 16
18 | 
19 | extern int ncclDebugLevel;
20 | extern FILE *ncclDebugFile;
21 | 
22 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
23 | 
24 | // Let code temporarily downgrade WARN into INFO
25 | extern thread_local int ncclDebugNoWarn;
26 | extern char ncclLastError[];
27 | 
28 | #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)  // logged under NCCL_ALL, tagged with __FILE__
29 | #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)  // logged under NCCL_ALL, tagged with __FILE__
30 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)  // caller picks the subsystem flags; tagged with __func__
31 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)  // always compiled in, fixed to the NCCL_CALL subsystem
32 | 
33 | #ifdef ENABLE_TRACE
34 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
35 | #else
36 | #define TRACE(...)  // expands to nothing when ENABLE_TRACE is not defined
37 | #endif
38 | 
39 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
40 | 
41 | void ncclResetDebugInit();
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/src/include/enqueue.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ENQUEUE_H_
 8 | #define NCCL_ENQUEUE_H_
 9 | 
10 | #include "comm.h"
11 | #include "group.h"
12 | #include "collectives.h"
13 | #include "utils.h"
14 | 
15 | #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t)
16 | #define NCCL_LL128_ALIGNMENT_PER_WARP 480
17 | #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
18 | #define NCCL_BYTES_ALIGNMENT 16
19 | 
20 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize);
21 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
22 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
23 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
24 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
25 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
26 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
27 | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
28 | ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm);
29 | 
30 | static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) {
31 |   return (func != ncclFuncReduceScatter) ? count : nRanks*count;  // only ReduceScatter scales count by nRanks
32 | }
33 | static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) {
34 |   return (func != ncclFuncAllGather) ? count : nRanks*count;  // only AllGather scales count by nRanks
35 | }
36 | rccl_static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
37 |   return (func != ncclFuncAllGather && func != ncclFuncReduceScatter) ? count : nRanks*count;  // larger of the send/recv counts
38 | }
39 | 
40 | #endif // End include guard
41 | 


--------------------------------------------------------------------------------
/src/include/git_version.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef RCCL_GIT_VERSION_H_
 8 | #define RCCL_GIT_VERSION_H_
 9 | 
10 | extern const char *rcclGitHash;
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/include/hip_rocm_version_info.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #ifndef RCCL_HIP_ROCM_VERSION_INFO_H_
24 | #define RCCL_HIP_ROCM_VERSION_INFO_H_
25 | 
26 | #define STR2(v) #v
27 | #define STR(v) STR2(v)
28 | 
29 | // HIP version info retrieval
30 | #if ROCM_VERSION >= 50000
31 |    #define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH) "-" HIP_VERSION_GITHASH
32 | // HIP Githash info not available in older ROCm versions < 5.0
33 | #elif ROCM_VERSION >= 40000
34 |    #define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH)
35 | #else
36 |    #define HIP_BUILD_INFO "Unknown"
37 | #endif
38 | 
39 | // ROCm version info retrieval  
40 | #if ROCM_VERSION >= 60000
41 |    // rocm_version.h moved to rocm/include/rocm-core from ROCm 6.0
42 |    #include <rocm-core/rocm_version.h>
43 | #else
44 |    // rocm-core/rocm_version.h not present in some ROCm versions < 6.0. 
45 |    // So, including it from rocm/include/rocm_version.h
46 |    #if ROCM_VERSION >= 50000
47 |       #include <rocm_version.h>
48 |       //ROCM_BUILD_INFO not defined in ROCm Versions < 5.50
49 |       #ifndef ROCM_BUILD_INFO
50 |          #define ROCM_BUILD_INFO STR(ROCM_VERSION_MAJOR) "." STR(ROCM_VERSION_MINOR) "." STR(ROCM_VERSION_PATCH)
51 |       #endif
52 |    //ROCm version info not available for ROCm versions < 5.0
53 |    #else
54 |       #define ROCM_BUILD_INFO "Unknown"
55 |    #endif
56 | #endif
57 | 
58 | #endif


--------------------------------------------------------------------------------
/src/include/info.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_INFO_H_
 9 | #define NCCL_INFO_H_
10 | 
11 | #include "nccl.h"
12 | #include "collectives.h"
13 | #include "core.h"
14 | #include "utils.h"
15 | 
16 | // Used to pass NCCL call information between functions
17 | struct ncclInfo {
18 |   ncclFunc_t coll;  // which operation this call is (an ncclFunc_t value)
19 |   const char* opName;  // presumably a printable name of the operation -- TODO confirm usage
20 |   // NCCL Coll Args
21 |   const void* sendbuff;
22 |   void* recvbuff;
23 |   size_t count;
24 |   ncclDataType_t datatype;
25 |   ncclRedOp_t op;
26 |   int root; // peer for p2p operations
27 |   ncclComm_t comm;
28 |   cudaStream_t stream;
29 |   // Algorithm details
30 |   int chunkSteps;
31 |   int sliceSteps;
32 | };
33 | 
34 | #endif


--------------------------------------------------------------------------------
/src/include/ipcsocket.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See COPYRIGHT for license information
 5 |  */
 6 | 
 7 | #ifndef NCCL_IPCSOCKET_H
 8 | #define NCCL_IPCSOCKET_H
 9 | 
10 | #include "nccl.h"
11 | #include <stdio.h>
12 | #include <fcntl.h>
13 | #include <sys/mman.h>
14 | #include <unistd.h>
15 | #include <errno.h>
16 | #include <sys/wait.h>
17 | #include <sys/types.h>
18 | #include <sys/socket.h>
19 | #include <memory.h>
20 | #include <sys/un.h>
21 | #include <inttypes.h>
22 | 
23 | #define NCCL_IPC_SOCKNAME_LEN 64
24 | 
25 | struct ncclIpcSocket {  // handle type taken by the ncclIpcSocket* functions declared in this header
26 |   int fd;
27 |   char socketName[NCCL_IPC_SOCKNAME_LEN];  // fixed-size buffer of NCCL_IPC_SOCKNAME_LEN (64) bytes
28 |   volatile uint32_t* abortFlag;  // same pointer type as the abortFlag argument of ncclIpcSocketInit()
29 | };
30 | 
31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
33 | ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
34 | 
35 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
36 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
37 | 
38 | ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash);
39 | ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd);
40 | 
41 | #endif /* NCCL_IPCSOCKET_H */
42 | 


--------------------------------------------------------------------------------
/src/include/mnnvl.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_MNNVL_H_
 8 | #define NCCL_MNNVL_H_
 9 | 
10 | #include "nccl.h"
11 | #include "comm.h"
12 | 
13 | ncclResult_t ncclMnnvlCheck(struct ncclComm* comm);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_lifecycle.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_LIFECYCLE_H_
 7 | #define MSCCL_LIFECYCLE_H_
 8 | 
 9 | #include "enqueue.h"
10 | 
11 | #include "msccl/msccl_struct.h"
12 | 
13 | bool mscclEnabled();
14 | bool mscclForceEnabled();
15 | 
16 | void mscclSetIsCallerFlag();
17 | void mscclClearIsCallerFlag();
18 | bool mscclIsCaller();
19 | 
20 | /**
21 |  * @brief mscclAvailable() is used to determine if msccl functionality is available
22 |  * @param comm is an optional rccl communicator; if provided, the mscclStatus
23 |  * from a global map<comm -> mscclStatus> is used to determine if msccl is available. If comm is not
24 |  * in the map, this invocation inserts a new key-value pair into the global map.
25 |  * If comm == nullptr, the first invocation initializes a static thread-local variable
26 |  * mscclStatus, and subsequent calls from the same thread with comm == nullptr reuse that object.
27 |  */
28 | bool mscclAvailable(const ncclComm_t comm = nullptr);
29 | 
30 | ncclResult_t mscclSchedulerInit(ncclComm_t comm, int* numChannelsRequired);
31 | 
32 | ncclResult_t mscclInit(ncclComm_t comm);
33 | 
34 | ncclResult_t mscclGroupStart();
35 | 
36 | ncclResult_t mscclEnqueueCheck(
37 |     const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
38 |     void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
39 |     size_t count, ncclDataType_t datatype, int root, int peer, ncclRedOp_t op,
40 |     mscclFunc_t mscclFunc, ncclComm_t comm, hipStream_t stream);
41 | 
42 | ncclResult_t mscclGroupEnd();
43 | 
44 | ncclResult_t mscclTeardown(const ncclComm_t comm);
45 | 
46 | size_t mscclKernMaxLocalSize();
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_scheduler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_SCHEDULER_H_
 7 | #define MSCCL_SCHEDULER_H_
 8 | 
 9 | typedef enum { mscclFuncReduce             =  0,
10 |                mscclFuncBroadcast          =  1,
11 |                mscclFuncAllReduce          =  2,
12 |                mscclFuncReduceScatter      =  3,
13 |                mscclFuncAllGather          =  4,
14 |                mscclFuncSend               =  5,
15 |                mscclFuncRecv               =  6,
16 |                mscclFuncGather             =  7,
17 |                mscclFuncScatter            =  8,
18 |                mscclFuncAllToAll           =  9,
19 |                mscclFuncAllToAllv          =  10,
20 |                mscclNumFuncs               =  11 } mscclFunc_t;  // mscclNumFuncs is the count of the ids above (0..10)
21 | 
22 | struct mscclSchedulerParam {  // passed by pointer to mscclSchedulerInterface::selectAlgo
23 |   const void* sendBuff;
24 |   const size_t* sendCounts;
25 |   const size_t* sDisPls;  // presumably per-peer send displacements -- TODO confirm
26 |   void* recvBuff;
27 |   const size_t* recvCounts;
28 |   const size_t* rDisPls;  // presumably per-peer receive displacements -- TODO confirm
29 |   size_t count;
30 |   ncclDataType_t dataType;  // NOTE(review): this header includes nothing; the includer must provide ncclDataType_t and friends
31 |   int root;
32 |   int peer;
33 |   ncclRedOp_t op;
34 |   mscclFunc_t func;
35 |   int rank;
36 |   int nRanks;
37 |   bool scheduled;  // presumably set by the scheduler when an algorithm was selected -- TODO confirm
38 |   mscclAlgoHandle_t handle;
39 |   uint64_t opCount;
40 | };
41 | 
42 | typedef struct {  // table of function pointers describing one scheduler
43 |   // Name of the scheduler (mainly for logs)
44 |   const char* name;
45 |   // Load all algorithms
46 |   ncclResult_t (*init)();
47 |   // Select an algorithm
48 |   ncclResult_t (*selectAlgo)(struct mscclSchedulerParam* param);  // param is non-const, so the callee may write to it
49 |   // Unload all algorithms
50 |   ncclResult_t (*teardown)();
51 | } mscclSchedulerInterface;
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_setup.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_SETUP_H_
 7 | #define MSCCL_SETUP_H_
 8 | 
 9 | #include <hip/hip_runtime.h>
10 | 
11 | #include "comm.h"
12 | #include "msccl/msccl_struct.h"
13 | 
14 | ncclResult_t mscclGetCaptureStatus(const ncclComm_t comm, hipStream_t stream);
15 | 
16 | ncclResult_t mscclSetupScratch(struct mscclAlgo* hostAlgo, hipStream_t stream);
17 | 
18 | ncclResult_t mscclSetupSyncFlags(const ncclComm_t comm, hipStream_t stream);
19 | 
20 | ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo,const ncclComm_t comm);
21 | 
22 | ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType);
23 | 
24 | ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, hipStream_t stream);
25 | 
26 | ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count,
27 |     ncclDataType_t dataType, ncclRedOp_t op, struct mscclAlgo* hostAlgo, struct mscclAlgo* devAlgo,
28 |     ncclComm_t comm, hipStream_t stream);
29 | 
30 | ncclResult_t mscclInitWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus);
31 | 
32 | ncclResult_t mscclDestroyWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus);
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/include/msccl/msccl_status.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef MSCCL_STATUS_H_
 7 | #define MSCCL_STATUS_H_
 8 | 
 9 | #include "msccl/msccl_struct.h"
10 | 
11 | bool mscclInitialized(const ncclComm_t comm);
12 | 
13 | void mscclSetInitialized(const ncclComm_t comm, bool initialized = true);
14 | 
15 | void mscclRemoveRank(const ncclComm_t comm);
16 | 
17 | mscclStatus& mscclGetStatus(const ncclComm_t comm);
18 | 
19 | mscclSavedProxyArgs& mscclGetSavedProxyArgs(const ncclComm_t comm);
20 | 
21 | mscclThreadLocalStatus& mscclGetThreadLocalStatus();
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/include/mscclpp/mscclpp_nccl.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt and NOTICES.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef MSCCLPP_NCCL_H_
 8 | #define MSCCLPP_NCCL_H_
 9 | 
10 | #include "nccl.h"
11 | #include <unordered_map>
12 | #include <unordered_set>
13 | 
14 | typedef struct mscclppComm* mscclppComm_t;
15 | 
16 | typedef ncclUniqueId mscclppUniqueId;
17 | 
18 | /* A ncclUniqueId and a mscclppUniqueId will always be created together and used alternatively. This maps between them. */
19 | extern std::unordered_map<ncclUniqueId, mscclppUniqueId> mscclpp_uniqueIdMap;
20 | extern std::unordered_map<mscclppUniqueId, std::unordered_set<ncclUniqueId>> mscclpp_uniqueIdReverseMap;
21 | extern std::unordered_map<mscclppComm_t, mscclppUniqueId> mscclpp_commToUniqueIdMap;
22 | extern std::unordered_map<ncclComm_t, ncclUniqueId> ncclCommToUniqueIdMap;
23 | 
24 | extern "C" {
25 |   /* See ncclGetUniqueId. */
26 |   ncclResult_t  mscclpp_ncclGetUniqueId(mscclppUniqueId* uniqueId);
27 | 
28 |   /* See ncclCommInitRank. */
29 |   ncclResult_t  mscclpp_ncclCommInitRank(mscclppComm_t* comm, int nranks, mscclppUniqueId commId, int rank);
30 | 
31 |   /* See ncclCommDestroy. */
32 |   ncclResult_t  mscclpp_ncclCommDestroy(mscclppComm_t comm);
33 | 
34 |   /* See ncclAllReduce. */
35 |   ncclResult_t  mscclpp_ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
36 |       ncclDataType_t datatype, ncclRedOp_t op, mscclppComm_t comm, hipStream_t stream);
37 | 
38 |   /* See ncclAllGather. */
39 |   ncclResult_t  mscclpp_ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
40 |       ncclDataType_t datatype, mscclppComm_t comm, hipStream_t stream);
41 | 
42 |   ncclResult_t mscclpp_ncclCommRegister(mscclppComm_t comm, void* buff, size_t size, void** handle);
43 | 
44 |   ncclResult_t mscclpp_ncclCommDeregister(mscclppComm_t comm, void* handle);
45 | 
46 |   bool mscclpp_BuffIsRegistered(mscclppComm_t comm, const void* buff);
47 | 
48 |   size_t mscclpp_BufferSize(mscclppComm_t comm, void* handle);
49 | 
50 |   ncclResult_t mscclpp_ncclMemAlloc(void** ptr, size_t size);
51 | 
52 |   ncclResult_t mscclpp_ncclMemFree(void* ptr);
53 | }
54 | 
55 | namespace std {
56 |   template <>
57 |   struct hash<ncclUniqueId> {  // lets ncclUniqueId be used as a key in the unordered_map/unordered_set declared in this header
58 |     size_t operator ()(const ncclUniqueId& uniqueId) const noexcept;  // only declared here; no definition in this header
59 |   };
60 | }
61 | 
62 | bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b);
63 | 
64 | bool mscclppCommCompatible(ncclComm_t comm);
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/src/include/nccl_common.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_DEBUG_H_
 8 | #define NCCL_DEBUG_H_
 9 | 
10 | typedef enum {  // Log levels; this is the type of the first argument of ncclDebugLogger_t.
11 |   NCCL_LOG_NONE = 0,
12 |   NCCL_LOG_VERSION = 1,
13 |   NCCL_LOG_WARN = 2,
14 |   NCCL_LOG_INFO = 3,
15 |   NCCL_LOG_ABORT = 4,
16 |   NCCL_LOG_TRACE = 5
17 | } ncclDebugLogLevel;
18 | 
19 | typedef enum {  // Subsystem identifiers; each named subsystem is a distinct single bit, so values can be OR-ed together.
20 |   NCCL_INIT = 0x1,
21 |   NCCL_COLL = 0x2,
22 |   NCCL_P2P = 0x4,
23 |   NCCL_SHM = 0x8,
24 |   NCCL_NET = 0x10,
25 |   NCCL_GRAPH = 0x20,
26 |   NCCL_TUNING = 0x40,
27 |   NCCL_ENV = 0x80,
28 |   NCCL_ALLOC = 0x100,
29 |   NCCL_CALL = 0x200,
30 |   NCCL_PROXY = 0x400,
31 |   NCCL_NVLS = 0x800,
32 |   NCCL_BOOTSTRAP = 0x1000,
33 |   NCCL_REG = 0x2000,
34 |   NCCL_PROFILE = 0x4000,
35 |   NCCL_RAS = 0x8000,
36 |   NCCL_VERBS = 0x10000,
37 |   NCCL_ALL = ~0  // every bit set
38 | } ncclDebugLogSubSys;
39 | 
40 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);  // Logging callback type; fmt plus varargs is presumably printf-style, and flags presumably carries ncclDebugLogSubSys bits — confirm.
41 | 
42 | #define NCCL_NUM_ONERANK 12
43 | #define FUNC_INDEX_TOTAL (656 + NCCL_NUM_ONERANK) // parenthesized so the sum stays intact inside larger expressions (e.g. FUNC_INDEX_TOTAL * 2)
44 | 
45 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now; covers only entries 0..4 (Broadcast..AllReduce) below
46 | typedef enum {
47 |   ncclFuncBroadcast = 0,
48 |   ncclFuncReduce = 1,
49 |   ncclFuncAllGather = 2,
50 |   ncclFuncReduceScatter = 3,
51 |   ncclFuncAllReduce = 4,
52 |   ncclFuncSendRecv = 5,
53 |   ncclFuncSend = 6,
54 |   ncclFuncRecv = 7,
55 |   ncclFuncAllToAllPivot = 8,
56 |   ncclNumFuncs = 9  // count of all entries above, unlike NCCL_NUM_FUNCTIONS (5)
57 | } ncclFunc_t;
58 | 
59 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/NVLS*/PAT
60 | #define NCCL_ALGO_UNDEF -1
61 | #define NCCL_ALGO_TREE 0
62 | #define NCCL_ALGO_RING 1
63 | #define NCCL_ALGO_COLLNET_DIRECT 2
64 | #define NCCL_ALGO_COLLNET_CHAIN 3
65 | #define NCCL_ALGO_NVLS 4
66 | #define NCCL_ALGO_NVLS_TREE 5
67 | #define NCCL_ALGO_PAT 6
68 | 
69 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
70 | #define NCCL_PROTO_UNDEF -1
71 | #define NCCL_PROTO_LL 0
72 | #define NCCL_PROTO_LL128 1
73 | #define NCCL_PROTO_SIMPLE 2
74 | 
75 | #define NCCL_ALGO_PROTO_IGNORE -1.0
76 | 
77 | #define NCCL_NUM_UNROLLS 3 // 1/2/4
78 | #define NCCL_UNROLL_1 0
79 | #define NCCL_UNROLL_2 1
80 | #define NCCL_UNROLL_4 2
81 | 
82 | #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
83 | #endif
84 | 


--------------------------------------------------------------------------------
/src/include/net.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_INT_NET_H_
 8 | #define NCCL_INT_NET_H_
 9 | 
10 | #include "nccl.h"
11 | #include "nccl_net.h"
12 | #include "comm.h"
13 | #include "checks.h"
14 | 
15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
16 | 
17 | ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
18 | ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
19 | ncclResult_t ncclNetInit(struct ncclComm* comm);
20 | ncclResult_t ncclNetFinalize(struct ncclComm* comm);
21 | int ncclNetVersion(struct ncclComm* comm);
22 | 
23 | // Test whether the current GPU supports GPU Direct RDMA.
24 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
25 | 
26 | extern ncclNet_t ncclNetIb;
27 | extern ncclNet_t ncclNetSocket;
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/src/include/net_device.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_NET_DEVICE_H_
 8 | #define NCCL_NET_DEVICE_H_
 9 | 
10 | #define NCCL_NET_DEVICE_INVALID_VERSION      0x0
11 | #define NCCL_NET_MTU_SIZE                    4096
12 | 
13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
16 | 
17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
18 | 
19 | typedef struct {  // Device-offload handle layout (v7); the v8, v9 and unversioned handle types in this header alias it unchanged.
20 |   ncclNetDeviceType netDeviceType; // Network offload type
21 |   int netDeviceVersion;            // Version number for network offload
22 |   void* handle;
23 |   size_t size;
24 |   int needsProxyProgress;
25 | } ncclNetDeviceHandle_v7_t;
26 | 
27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/include/npkit/npkit.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef NPKIT_H_
 7 | #define NPKIT_H_
 8 | 
 9 | #include <string>
10 | #include <thread>
11 | 
12 | #include <hip/hip_runtime.h>
13 | 
14 | #include "npkit/npkit_event.h"
15 | #include "npkit/npkit_struct.h"
16 | #include "common.h"
17 | 
18 | #define NPKIT_GET_GPU_TIMESTAMP wall_clock64
19 | #define NPKIT_GET_CPU_TIMESTAMP_FROM_BLOCK \
20 |   __atomic_load_n(reinterpret_cast<uint64_t*>((uint8_t *)ncclShmem.comm.cpuTimestamp + 128*blockIdx.x), __ATOMIC_RELAXED)
21 | 
22 | 
23 | class NpKit {  // All-static NPKit event-collection interface; apart from the inline CollectGpuEvent, members are only declared here.
24 |  public:
25 |   static const uint64_t kNumGpuEventBuffers = 1024;
26 | 
27 |   static const uint64_t kNumCpuEventBuffers = 64;
28 | 
29 |   static ncclResult_t Init(int rank);
30 | 
31 |   static ncclResult_t Dump(const std::string& dump_dir);
32 | 
33 |   static ncclResult_t Shutdown();
34 | 
35 |   static NpKitEventCollectContext* GetGpuEventCollectContexts();
36 | 
37 |   static inline __device__ void CollectGpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp,
38 |                                                 NpKitEventCollectContext* ctx) {
39 |     uint64_t event_buffer_head = ctx->event_buffer_head;
40 |     if (event_buffer_head < kMaxNumGpuEventsPerBuffer) {  // once the head reaches the per-buffer limit, further events are dropped
41 |       NpKitEvent& event = ctx->event_buffer[event_buffer_head];
42 |       event.fields.type = type;
43 |       event.fields.size = size < 0 ? 0 : size;  // negative sizes are recorded as 0
44 |       event.fields.rsvd = rsvd;
45 |       event.fields.timestamp = timestamp;
46 |       ctx->event_buffer_head++;  // plain (non-atomic) increment of the write position
47 |     }
48 |   }
49 | 
50 |   static void CollectCpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);
51 | 
52 |   static uint64_t *GetCpuTimestamp();
53 | 
54 |  private:
55 |   static void CpuTimestampUpdateThread();
56 | 
57 |   // 64K * 512 * 16B = 512MB per GPU (NOTE(review): kNumGpuEventBuffers is 1024, not 512 — with 1024 buffers this would be 1GB; confirm)
58 |   static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16;
59 | 
60 |   // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU (NOTE(review): kNumCpuEventBuffers is 64, not 32 — confirm)
61 |   static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
62 | 
63 |   static NpKitEvent** gpu_event_buffers_;
64 |   static NpKitEvent** cpu_event_buffers_;
65 | 
66 |   static NpKitEventCollectContext* gpu_collect_contexts_;
67 |   static NpKitEventCollectContext* cpu_collect_contexts_;
68 |   static uint64_t* cpu_timestamp_;
69 | 
70 |   static uint64_t rank_;
71 | 
72 |   static std::thread* cpu_timestamp_update_thread_;
73 |   static volatile bool cpu_timestamp_update_thread_should_stop_;
74 | };
75 | 
76 | #endif
77 | 


--------------------------------------------------------------------------------
/src/include/npkit/npkit_struct.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Microsoft Corporation.
 3 |  * Licensed under the MIT License.
 4 |  ************************************************************************/
 5 | 
 6 | #ifndef NPKIT_STRUCT_H_
 7 | #define NPKIT_STRUCT_H_
 8 | 
 9 | #include <cstdint>
10 | 
11 | #pragma pack(push, 1)
12 | 
13 | union NpKitEvent {  // One event record; bits[] views the same storage as two raw 64-bit words.
14 |   uint64_t bits[2];
15 |   struct {
16 |     uint64_t type : 8;
17 |     uint32_t size : 32;  // holds at most 32 bits of the reported size
18 |     uint64_t rsvd : 24;  // type + size + rsvd add up to 64 bits
19 |     uint64_t timestamp;
20 |   } fields;
21 | };
22 | 
23 | struct NpKitEventCollectContext {  // Pairs an event buffer with its current write position.
24 |   NpKitEvent* event_buffer;
25 |   uint64_t event_buffer_head;  // index of the next slot to write in event_buffer
26 | };
27 | 
28 | #pragma pack(pop)
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/include/nvtx3/nvToolsExtSemanticsScope.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2024  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /**
10 |  * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
11 |  */
12 | 
13 | #ifndef NVTX_SEMANTIC_ID_SCOPE_V1
14 | #define NVTX_SEMANTIC_ID_SCOPE_V1 1
15 | 
16 | /**
17 |  * \brief Specify the NVTX scope for a payload entry.
18 |  *
19 |  * This allows the scope to be set for a specific value or counter in a payload.
20 |  * The scope must be known at schema registration time.
21 |  */
22 | typedef struct nvtxSemanticsScope_v1
23 | {
24 |     struct nvtxSemanticsHeader_v1 header;
25 | 
26 |     /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */
27 |     uint64_t scopeId;
28 | } nvtxSemanticsScope_t;
29 | 
30 | #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2023  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | #ifndef NVTX_EXT_HELPER_MACROS_H
10 | #define NVTX_EXT_HELPER_MACROS_H
11 | 
12 | /* Combine tokens */
13 | #define _NVTX_EXT_CONCAT(a, b) a##b
14 | #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b)
15 | 
16 | /* Resolves to the number of arguments passed (valid for 1 to 15 arguments). */
17 | #define NVTX_EXT_NUM_ARGS(...) \
18 |     NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway)
19 | #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16
20 | 
21 | /* Cast argument(s) to void to prevent unused variable warnings. */
22 | #define _NVTX_EXT_VOIDIFY1(a1) (void)a1;
23 | #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2;
24 | #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3;
25 | #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4;
26 | 
27 | /* Mark function arguments as unused (1 to 4 arguments; only _NVTX_EXT_VOIDIFY1..4 are defined). */
28 | #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \
29 |     NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
30 | 
31 | #endif /* NVTX_EXT_HELPER_MACROS_H */


--------------------------------------------------------------------------------
/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2021  NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions.
 5 | * See https://llvm.org/LICENSE.txt for license information.
 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 7 | */
 8 | 
 9 | /* This header defines types which are used by the internal implementation
10 | *  of NVTX and callback subscribers.  API clients do not use these types,
11 | *  so they are defined here instead of in nvToolsExt.h to clarify they are
12 | *  not part of the NVTX client API. */
13 | 
14 | #ifndef NVTXEXTTYPES_H
15 | #define NVTXEXTTYPES_H
16 | 
17 | #ifndef NVTX_EXT_TYPES_GUARD
18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
19 | #endif
20 | 
21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
22 | 
23 | typedef struct nvtxExtModuleSegment_t
24 | {
25 |     size_t segmentId;
26 |     size_t slotCount;
27 |     intptr_t* functionSlots;
28 | } nvtxExtModuleSegment_t;
29 | 
30 | typedef struct nvtxExtModuleInfo_t
31 | {
32 |     uint16_t nvtxVer;
33 |     uint16_t structSize;
34 |     uint16_t moduleId;
35 |     uint16_t compatId;
36 |     size_t segmentsCount;
37 |     nvtxExtModuleSegment_t* segments;
38 |     NvtxExtGetExportFunction_t getExportFunction;
39 |     const void* extInfo;
40 | } nvtxExtModuleInfo_t;
41 | 
42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
43 | 
44 | #endif /* NVTXEXTTYPES_H */


--------------------------------------------------------------------------------
/src/include/nvtx_stub.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_NVTX_STUB_H_
 8 | #define NCCL_NVTX_STUB_H_
 9 | 
10 | #include <nvtx3/nvToolsExtPayload.h>
11 | 
12 | struct nccl_domain{static constexpr char const* name{"NCCL"};};
13 | 
14 | #define NVTX3_FUNC_RANGE_IN(domain)  // stub: expands to nothing
15 | #define nvtxNameOsThreadA(syscall, thread)  // stub: expands to nothing
16 | #define NVTX3_FUNC_WITH_PARAMS(N, T, P)  // stub: expands to nothing
17 | #define NVTX3_PAYLOAD(...) __VA_ARGS__  // passes its arguments through unchanged
18 | #define NVTX3_RANGE(T)  // stub: expands to nothing
19 | #define NVTX3_RANGE_ADD_PAYLOAD(N, S, P)  // stub: expands to nothing
20 | 
21 | #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/include/p2p.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <stdlib.h>
 8 | 
 9 | #ifndef NCCL_P2P_H_
10 | #define NCCL_P2P_H_
11 | 
12 | #include <cuda.h>
13 | #include <cuda_runtime.h>
14 | 
15 | #include "core.h"
16 | 
17 | #if CUDART_VERSION < 12030
18 | // MNNVL: FABRIC handle support lifted from CUDA 12.3
19 | #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128)
20 | #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
21 | #define CU_IPC_HANDLE_SIZE 64
22 | typedef struct CUmemFabricHandle_st {
23 |     unsigned char data[CU_IPC_HANDLE_SIZE];
24 | } CUmemFabricHandle_v1;
25 | typedef CUmemFabricHandle_v1 CUmemFabricHandle;
26 | #endif
27 | 
28 | typedef union {
29 |   uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
30 |   CUmemFabricHandle handle;
31 | } ncclCuDesc;
32 | 
33 | typedef union {  // One IPC descriptor: the legacy handle and the cuMem pair occupy the same storage.
34 |   // Legacy CUDA IPC
35 |   cudaIpcMemHandle_t devIpc;
36 |   // cuMem API support
37 |   struct {
38 |     ncclCuDesc cuDesc;
39 |     CUmemGenericAllocationHandle memHandle;
40 |   };
41 | } ncclIpcDesc;
42 | 
43 | enum ncclIpcRegType {
44 |   NCCL_IPC_SENDRECV = 0,
45 |   NCCL_IPC_COLLECTIVE = 1
46 | };
47 | 
48 | struct ncclIpcImpInfo {
49 |   void* rmtRegAddr;
50 |   bool legacyIpcCap;
51 |   uintptr_t offset;
52 | };
53 | 
54 | struct ncclIpcRegInfo {
55 |   int peerRank;
56 |   void* baseAddr;
57 |   struct ncclProxyConnector* ipcProxyconn;
58 |   struct ncclIpcImpInfo impInfo;
59 | };
60 | 
61 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
62 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
63 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
64 | ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
65 | ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
66 | 
67 | ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/src/include/param.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_PARAM_H_
 9 | #define NCCL_PARAM_H_
10 | 
11 | #include <stdint.h>
12 | 
13 | const char* userHomeDir();
14 | void setEnvFile(const char* fileName);
15 | void initEnv();
16 | const char *ncclGetEnv(const char *name);
17 | 
18 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);  // Called by the NCCL_PARAM/RCCL_PARAM accessors below while *cache still equals `uninitialized`.
19 | 
20 | #define NCCL_PARAM(name, env, deftVal) \
21 |   int64_t ncclParam##name() { \
22 |     constexpr int64_t uninitialized = INT64_MIN; \
23 |     static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
24 |     static int64_t cache = uninitialized; \
25 |     if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
26 |       ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \
27 |     } \
28 |     return cache; \
29 |   }
30 | 
31 | #define RCCL_PARAM_DECLARE(name) \
32 | int64_t rcclParam##name()
33 | 
34 | #define RCCL_PARAM(name, env, deftVal) \
35 | pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
36 | int64_t rcclParam##name() { \
37 |     constexpr int64_t uninitialized = INT64_MIN; \
38 |     static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
39 |     static int64_t cache = uninitialized; \
40 |     if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
41 |       ncclLoadParam("RCCL_" env, deftVal, uninitialized, &cache); \
42 |     } \
43 |     return cache; \
44 |   }  // NOTE(review): rcclParamMutex##name is defined but never used inside this macro, and this header does not include <pthread.h> itself.
45 | 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/src/include/profiler.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef PROFILER_H_
 8 | #define PROFILER_H_
 9 | 
10 | #include <cuda_runtime.h>
11 | #include "nccl_profiler.h"
12 | 
13 | struct ncclProxyArgs;
14 | struct ncclKernelPlan;
15 | struct ncclTaskColl;
16 | struct ncclTaskP2p;
17 | struct ncclInfo;
18 | struct ncclComm;
19 | struct ncclProxyOp;
20 | 
21 | // Plugin Init/Finalize Wrappers
22 | ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
23 | ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
24 | 
25 | // Profiler Start/Stop Group Wrappers
26 | ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
27 | ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
28 | 
29 | // Profiler Start/Stop Task Events Wrappers
30 | ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
31 | ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
32 | 
33 | // Proxy Op Start/Stop Event Wrappers
34 | ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
35 | ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
36 | ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
37 | 
38 | // Proxy Step Start/Stop Event Wrappers
39 | ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
40 | ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
41 | ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
42 | 
43 | // Proxy Control Start/Stop Events Wrappers
44 | ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
45 | ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
46 | 
47 | // Record Event Wrappers
48 | ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
49 | ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
50 | ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
51 | 
52 | // Profiler utility functions
53 | ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/src/include/ras.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_RAS_H_
 8 | #define NCCL_RAS_H_
 9 | 
10 | #include "socket.h"
11 | 
12 | // Structure used to communicate data about NCCL ranks from NCCL threads to RAS.
13 | struct rasRankInit {
14 |   union ncclSocketAddress addr;
15 |   pid_t pid;
16 |   int cudaDev;
17 |   int nvmlDev;
18 | };
19 | 
20 | ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank);
21 | ncclResult_t ncclRasCommFini(const struct ncclComm* comm);
22 | ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks);
23 | 
24 | #endif // !NCCL_RAS_H_
25 | 


--------------------------------------------------------------------------------
/src/include/rccl_vars.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #ifndef RCCL_VARS_H_
24 | #define RCCL_VARS_H_
25 | 
26 | #include "param.h"
27 | 
28 | RCCL_PARAM_DECLARE(EnableHipGraph);  // Opt-in environment variable for enabling hipGraph
29 | 
30 | #ifdef RCCL_EXPOSE_STATIC
31 | #define rccl_static
32 | #define rccl_static_inline
33 | #else
34 | #define rccl_static static
35 | #define rccl_static_inline static inline
36 | #endif
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/include/register.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_REGISTER_H_
 2 | #define NCCL_REGISTER_H_
 3 | 
 4 | #include "device.h"
 5 | 
 6 | #include <cuda.h>
 7 | #include <stdint.h>
 8 | 
 9 | int64_t ncclParamLocalRegister();
10 | int64_t ncclParamGraphRegister();
11 | 
12 | enum {
13 |   NET_REG_COMPLETE = 0x01,
14 |   NVLS_REG_COMPLETE = 0x02,
15 |   NVLS_REG_POSSIBLE = 0x04,
16 |   NVLS_REG_NO_SUPPORT = 0x08,
17 |   COLLNET_REG_COMPLETE = 0x10,
18 |   IPC_REG_COMPLETE = 0x20
19 | };
20 | 
21 | struct ncclPeerRegIpcAddr {
22 |   uintptr_t* devPeerRmtAddrs;
23 |   uintptr_t* hostPeerRmtAddrs;
24 | };
25 | 
26 | struct ncclRegNetHandles {
27 |   void* handle;
28 |   struct ncclProxyConnector* proxyConn;
29 |   struct ncclRegNetHandles* next;
30 | };
31 | 
32 | struct ncclReg {  // One registration record; fields are grouped by registration kind (see the section comments).
33 |   // common attributes
34 |   size_t pages;
35 |   int localRefs;
36 |   int graphRefs;
37 |   uintptr_t addr;
38 |   uint32_t state;
39 |   // net reg
40 |   struct ncclRegNetHandles* netHandleHead;
41 |   // nvls reg
42 |   uintptr_t baseAddr;
43 |   size_t baseSize;
44 |   CUdeviceptr regAddr;
45 |   size_t regSize;
46 |   int dev;
47 |   CUmemGenericAllocationHandle mcHandle;
48 |   uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* used to check if NVLS buffers match among intra-node ranks */
49 |   // collnet reg
50 |   void* collnetHandle;
51 |   struct ncclProxyConnector* collnetProxyconn;
52 |   // general ipc reg
53 |   struct ncclPeerRegIpcAddr regIpcAddrs;
54 |   struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
55 | };
56 | 
57 | struct ncclRegCache {
58 |   struct ncclReg **slots;
59 |   int capacity, population;
60 |   uintptr_t pageSize;
61 | };
62 | 
63 | ncclResult_t ncclRegCleanup(struct ncclComm* comm);
64 | ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
65 | ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
66 | ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
67 | ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/src/include/rocm_smi_wrap.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #ifndef ROCM_SMI_WRAP_H_
24 | #define ROCM_SMI_WRAP_H_
25 | 
26 | #include "rocm_smi/rocm_smi.h"
27 | #ifdef USE_ROCM_SMI64CONFIG
28 | #include "rocm_smi/rocm_smi64Config.h"
29 | #endif
30 | #include "nccl.h"
31 | 
32 | ncclResult_t rocm_smi_init();
33 | ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs);
34 | ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
35 | ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
36 | ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count);
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/include/shm.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCCL_SHM_H_
 2 | #define NCCL_SHM_H_
 3 | 
 4 | #include "comm.h"
 5 | 
 6 | struct shmLegacyIpc {
 7 |   char shmSuffix[7];
 8 |   ncclShmHandle_t handle;
 9 |   size_t shmSize;
10 | };
11 | 
12 | struct shmCuIpc {
13 |   union {
14 |     CUmemFabricHandle handle;
15 |     CUmemGenericAllocationHandle data;
16 |   };
17 |   int tpProxyRank;
18 |   void *ptr;
19 |   size_t size;
20 | };
21 | 
22 | struct shmIpcDesc {  // Shared-memory IPC descriptor holding either the legacy or the cuMem variant in one union.
23 |   union
24 |   {
25 |     struct shmLegacyIpc shmli;
26 |     struct shmCuIpc shmci;
27 |   };
28 |   bool legacy;  // NOTE(review): presumably selects which union member is valid (true -> shmli) — confirm
29 | };
30 | 
31 | typedef struct shmIpcDesc ncclShmIpcDesc_t;
32 | 
33 | ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
34 | ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
35 | ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/src/include/shmutils.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_SHMUTILS_H_
 8 | #define NCCL_SHMUTILS_H_
 9 | 
10 | #include "nccl.h"
11 | 
12 | typedef void* ncclShmHandle_t;
13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle);
15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
16 | 
17 | struct ncclShmemCollBuff {
18 |   volatile size_t *cnt[2];
19 |   volatile void *ptr[2];
20 |   int round;
21 |   size_t maxTypeSize;
22 | };
23 | 
24 | ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/include/signals.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef RCCL_SIGNALS_H_
 8 | #define RCCL_SIGNALS_H_
 9 | 
10 | void RegisterSignalHandlers();
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/include/timer.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_TIMER_H_
 8 | #define NCCL_TIMER_H_
 9 | #if ENABLE_TIMER
10 | #include <unistd.h>
11 | #include <sys/time.h>
12 | #include <x86intrin.h>
// TSC ticks per microsecond as measured by calibrate(); -1 means "not calibrated yet".
static double freq = -1;

// Estimates the TSC rate by comparing __rdtsc() against gettimeofday() across a
// short busy loop and stores the result (ticks per microsecond) in `freq`.
static void calibrate() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  uint64_t timeCycles = __rdtsc();
  double time = - tv.tv_sec*1E6 - tv.tv_usec;
  // Busy loop: its only purpose is to let some wall-clock time elapse.
  uint64_t total = 0ULL;
  for (int i=0; i<10000; i++) total += __rdtsc();
  gettimeofday(&tv, NULL);
  timeCycles = __rdtsc() - timeCycles;
  time += tv.tv_sec*1E6 + tv.tv_usec;
  freq = timeCycles/time;
}

// Returns the current TSC value converted to microseconds, calibrating on first use.
static inline double gettime() {
  if (freq == -1) calibrate();
  return __rdtsc()/freq;
}

// Eight independent timer slots, addressed by the `index` argument of the macros below.
static uint64_t counts[8];      // number of TIME_START calls not cancelled since the last TIME_PRINT
static double times[8];         // accumulated elapsed time (microseconds) since the last TIME_PRINT
static double startTimes[8];    // timestamp recorded by the most recent TIME_START

// Starts a measurement on slot `index` and counts it.
#define TIME_START(index) do { \
  counts[index]++; \
  startTimes[index] = gettime(); \
} while (0)

// Ends the measurement on slot `index` and adds the elapsed time to its total.
#define TIME_STOP(index) do { \
  times[index] += gettime() - startTimes[index]; \
} while (0)

// Undoes the count taken by TIME_START when the measurement is abandoned.
#define TIME_CANCEL(index) do { \
  counts[index]--; \
} while (0)

// Prints "total/count = average" for every slot that was used, then resets the
// slot. Both the count and the accumulated time are cleared so that the next
// report covers only the new interval (clearing the count alone would divide
// stale time by a fresh count). The count is printed through an explicit
// unsigned long long cast so the format matches uint64_t on every platform.
#define TIME_PRINT(name) do { \
  printf("%s stats", name); \
  for (int i=0; i<8; i++) { \
    if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); \
    counts[i] = 0; \
    times[i] = 0; \
  } \
  printf("\n"); \
} while (0)
54 | #else
55 | #define TIME_START(index) do {} while(0)
56 | #define TIME_STOP(index) do {} while(0)
57 | #define TIME_CANCEL(index) do {} while(0)
58 | #define TIME_PRINT(name)
59 | #endif
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/include/trees.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_TREES_H_
 8 | #define NCCL_TREES_H_
 9 | 
10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/include/tuner.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 3 |  * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef NCCL_INT_TUNER_H_
 9 | #define NCCL_INT_TUNER_H_
10 | 
11 | #include "nccl_tuner.h"
12 | #include "comm.h"
13 | 
14 | // Tuning plugin to override NCCL's default algorithm/protocol tuning.
15 | 
16 | // Attempts to load NCCL tuner from environmental variable.
17 | // Returns ncclSuccess if the correct tuner symbol has been found and
18 | // successfully loaded.  Otherwise returns an error and also logs the error.
19 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm);
20 | 
21 | // Cleans up NCCL tuner plugin.
22 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/init_nvtx.cc:
--------------------------------------------------------------------------------
 1 | #include "nccl.h"
 2 | #include "nvtx.h"
 3 | 
 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
 5 |   {"Sum", ncclSum, 0},
 6 |   {"Product", ncclProd, 0},
 7 |   {"Max", ncclMax, 0},
 8 |   {"Min", ncclMin, 0},
 9 |   {"Avg", ncclAvg, 0}
10 | };
11 | 
// Must be called before the first call to any reduction operation.
//
// Registers the NvtxEnumRedSchema table (names for the ncclRedOp_t values
// listed in it) as an NVTX payload enum with schema id
// NVTX_PAYLOAD_ENTRY_NCCL_REDOP in the nccl_domain NVTX domain.
// Compiles to an empty function when NVTX_NO_IMPL is defined.
void initNvtxRegisteredEnums() {
#ifndef NVTX_NO_IMPL
  // Register schemas and strings
  constexpr const nvtxPayloadEnumAttr_t eAttr {
    // fieldMask lists which of the attribute fields below are populated.
    .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES |
      NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID,
    .name = NULL,
    .entries = NvtxEnumRedSchema,
    // Element count of the NvtxEnumRedSchema array.
    .numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
    .sizeOfEnum = sizeof(ncclRedOp_t),
    .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP,
    .extension = nullptr
  };

  nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);
#endif
}
30 | 


--------------------------------------------------------------------------------
/src/misc/api_trace.c:
--------------------------------------------------------------------------------
 1 | //
 2 | // This file just ensures that api_trace.h is C-compatible
 3 | //
 4 | 
 5 | #if defined(__cplusplus)
 6 | #    error "C source file compiling as C++"
 7 | #endif
 8 | 
 9 | #include "api_trace.h"
10 | 


--------------------------------------------------------------------------------
/src/misc/mscclpp/mscclpp_nccl.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt and NOTICES.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "mscclpp/mscclpp_nccl.h"
 8 | 
 9 | std::unordered_map<ncclUniqueId, mscclppUniqueId> mscclpp_uniqueIdMap;
10 | std::unordered_map<mscclppUniqueId, std::unordered_set<ncclUniqueId>> mscclpp_uniqueIdReverseMap;
11 | std::unordered_map<mscclppComm_t, mscclppUniqueId> mscclpp_commToUniqueIdMap;
12 | std::unordered_map<ncclComm_t, ncclUniqueId> ncclCommToUniqueIdMap;
13 | 


--------------------------------------------------------------------------------
/src/misc/mscclpp/mscclpp_nccl_syms.txt:
--------------------------------------------------------------------------------
 1 | # > ${PROJECT_BINARY_DIR}/mscclpp_nccl_syms.txt; 
 2 | # for sym in $(nm -fjust-symbols ${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a | grep "^nccl"); do 
 3 | #     echo $sym mscclpp_$sym>> ${PROJECT_BINARY_DIR}/mscclpp_nccl_syms.txt;
 4 | # done
 5 | ncclAllGather mscclpp_ncclAllGather
 6 | ncclAllReduce mscclpp_ncclAllReduce
 7 | ncclAllToAll mscclpp_ncclAllToAll
 8 | ncclBcast mscclpp_ncclBcast
 9 | ncclBroadcast mscclpp_ncclBroadcast
10 | ncclCommAbort mscclpp_ncclCommAbort
11 | ncclCommCount mscclpp_ncclCommCount
12 | ncclCommCuDevice mscclpp_ncclCommCuDevice
13 | ncclCommDestroy mscclpp_ncclCommDestroy
14 | ncclCommFinalize mscclpp_ncclCommFinalize
15 | ncclCommGetAsyncError mscclpp_ncclCommGetAsyncError
16 | ncclCommInitAll mscclpp_ncclCommInitAll
17 | ncclCommInitRank mscclpp_ncclCommInitRank
18 | ncclCommInitRankConfig mscclpp_ncclCommInitRankConfig
19 | ncclCommSplit mscclpp_ncclCommSplit
20 | ncclCommUserRank mscclpp_ncclCommUserRank
21 | ncclGetErrorString mscclpp_ncclGetErrorString
22 | ncclGetLastError mscclpp_ncclGetLastError
23 | ncclGetUniqueId mscclpp_ncclGetUniqueId
24 | ncclGetVersion mscclpp_ncclGetVersion
25 | ncclGroupEnd mscclpp_ncclGroupEnd
26 | ncclGroupStart mscclpp_ncclGroupStart
27 | ncclRecv mscclpp_ncclRecv
28 | ncclRedOpCreatePreMulSum mscclpp_ncclRedOpCreatePreMulSum
29 | ncclRedOpDestroy mscclpp_ncclRedOpDestroy
30 | ncclReduce mscclpp_ncclReduce
31 | ncclReduceScatter mscclpp_ncclReduceScatter
32 | ncclSend mscclpp_ncclSend
33 | ncclCommRegister mscclpp_ncclCommRegister
34 | ncclCommDeregister mscclpp_ncclCommDeregister
35 | ncclMemAlloc mscclpp_ncclMemAlloc
36 | ncclMemFree mscclpp_ncclMemFree
37 | 


--------------------------------------------------------------------------------
/src/misc/nvmlwrap_stub.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #include "nvmlwrap.h"
 9 | 
// Stub implementations of the ncclNvml* wrappers. None of them talks to an
// NVML library: each one either reports success with a fixed value or returns
// ncclSystemError.

// Stub: always succeeds.
ncclResult_t ncclNvmlSymbols(void) {
  return ncclSuccess;
}

// Stub: always succeeds.
ncclResult_t ncclNvmlInit(void) {
  return ncclSuccess;
}

// Stub: always succeeds.
ncclResult_t ncclNvmlShutdown(void) {
  return ncclSuccess;
}

// Stub: always fails with ncclSystemError; *device is not written.
ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
  return ncclSystemError;
}

// Stub: reports index 0 for any device and succeeds.
ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
  *index  = 0;
  return ncclSuccess;
}

// Stub: always fails with ncclSystemError; *pci is not written.
ncclResult_t ncclNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
  return ncclSystemError;
}

// Stub: reports minor number 0 for any device and succeeds.
ncclResult_t ncclNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
  *minorNumber = 0;
  return ncclSuccess;
}

// Stub: always fails with ncclSystemError; *isActive is not written.
ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
  return ncclSystemError;
}

// Stub: always fails with ncclSystemError; *pci is not written.
ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
  return ncclSystemError;
}

// Stub: always fails with ncclSystemError; *capResult is not written.
ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
    nvmlNvLinkCapability_t capability, unsigned int *capResult) {
  return ncclSystemError;
}

// Stub: reports compute capability 1.1 (major = minor = 1) and succeeds.
ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  *major = *minor = 1;
  return ncclSuccess;
}
57 | 


--------------------------------------------------------------------------------
/src/misc/signals.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifdef HAVE_BFD
 8 | #include "BfdBacktrace.hpp"
 9 | #endif
10 | 
11 | #include <unistd.h>
12 | #include <signal.h>
13 | #include <execinfo.h>
14 | #include <string.h>
15 | #include "param.h"
16 | #include "debug.h"
17 | #include <vector>
18 | 
/**
 * Signal handler installed by RegisterSignalHandlers().
 *
 * Prints the process id and the received signal, then a backtrace of the
 * current stack. For SIGUSR2 it returns afterwards so the process keeps
 * running; for every other signal it terminates the process with exit(-1).
 *
 * NOTE: printf(), backtrace_symbols() and exit() are not async-signal-safe;
 * this handler is a best-effort debugging aid.
 *
 * @param signum  Number of the signal being handled.
 */
void sig_handler(int signum)
{
  printf("\n[Process: %d] Inside handler function signal: %s (%d)\n", getpid(), strsignal(signum), signum);

#ifdef HAVE_BFD
  void *addresses[BACKTRACE_MAX];
  int num_addresses = backtrace(addresses, BACKTRACE_MAX);
  struct backtrace_file file;
  backtrace_line line;
  backtrace_h bckt;
  bckt.size = 0;

  // Resolve every return address to file/function/line entries, appending them to bckt.lines.
  for (int i = 0; i < num_addresses; ++i)
  {
    file.dl.address = (unsigned long)addresses[i];
    if (dl_lookup_address(&file.dl) && load_file(&file))
    {
      bckt.size += get_line_info(&file, 1,
                                 bckt.lines + bckt.size,
                                 BACKTRACE_MAX - bckt.size);
      unload_file(&file);
    }
  }

  // Only the first bckt.size entries were filled in above; bckt is an
  // uninitialized local, so anything past that index is garbage and must not be read.
  for (int i = 0; i < BACKTRACE_MAX && i < (int)bckt.size; i++)
  {
    if ((char*)bckt.lines[i].address == NULL) break;
    printf("%p %s : %s line %u\n", (char*)bckt.lines[i].address,
           bckt.lines[i].file, bckt.lines[i].function, bckt.lines[i].lineno);
  }
#else
#define BT_BUF_SIZE 1024
  void *buffer[BT_BUF_SIZE];

  int nptrs = backtrace(buffer, BT_BUF_SIZE);
  char **strings = backtrace_symbols(buffer, nptrs);
  if (strings != NULL)
  {
    for (int j = 0; j < nptrs; j++)
      printf("%s\n", strings[j]);
    free (strings);
  }
  else
  {
    // backtrace_symbols() allocates its result and returns NULL on failure.
    // Fall back to the variant that writes straight to a file descriptor
    // without allocating, so the trace is still emitted.
    fflush(stdout);
    backtrace_symbols_fd(buffer, nptrs, STDOUT_FILENO);
  }
#endif

  // SIGUSR2 is used to request a backtrace on demand: do not terminate.
  if (signum == SIGUSR2) {
    return;
  }

  exit (-1);
}
67 | 
68 | RCCL_PARAM(EnableSignalHandler, "ENABLE_SIGNALHANDLER", 0); // Opt-in environment variable for enabling custom signal handler
69 | 
70 | void RegisterSignalHandlers()
71 | {
72 |   if (rcclParamEnableSignalHandler())
73 |   {
74 |     INFO(NCCL_INIT, "Enabling custom signal handler");
75 | 
76 |     std::vector<int> signalsToCatch = {SIGILL, SIGBUS, SIGFPE, SIGSEGV, SIGUSR2};
77 | 
78 |     for (auto signum : signalsToCatch)
79 |     {
80 |       if (signal(signum, sig_handler) == SIG_ERR)
81 |       {
82 |         INFO(NCCL_INIT, "Unable to register signal handler for %s\n", strsignal(signum));
83 |       }
84 |     }
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/nccl.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=${nccl:Prefix}
 2 | exec_prefix=${prefix}
 3 | libdir=${exec_prefix}/lib
 4 | includedir=${prefix}/include
 5 | 
 6 | Name: nccl
 7 | Description: Optimized primitives for collective multi-GPU communication
 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
 9 | Libs: -L${libdir} -lnccl
10 | Cflags: -I${includedir}
11 | 


--------------------------------------------------------------------------------
/src/register/sendrecv_reg.cc:
--------------------------------------------------------------------------------
 1 | #include "register.h"
 2 | #include "transport.h"
 3 | 
 4 | ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
 5 |   ncclResult_t ret = ncclSuccess;
 6 | 
 7 |   *regFlag = 0;
 8 |   if (comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) {
 9 |     if (comm->planner.persistent && ncclParamGraphRegister()) {
10 |       ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL);
11 |     }
12 |     if (*regFlag == 0 && ncclParamLocalRegister()) {
13 |       ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle);
14 |     }
15 |   }
16 |   return ret;
17 | }
18 | 
19 | ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
20 |   ncclResult_t ret = ncclSuccess;
21 |   uintptr_t offset = 0;
22 |   uintptr_t* peerRmtAddrs = NULL;
23 | 
24 |   *regFlag = 0;
25 |   if (comm->planner.persistent && ncclParamGraphRegister()) {
26 |     ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
27 |   }
28 |   if (*regFlag == 0 && ncclParamLocalRegister()) {
29 |     ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs);
30 |   }
31 | 
32 |   if (*regFlag)
33 |     *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset);
34 |   return ret;
35 | }
36 | 


--------------------------------------------------------------------------------
/test/common/CallCollectiveForked.hpp:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef CALLCOLLECTIVEFORKED_H
 8 | #define CALLCOLLECTIVEFORKED_H
 9 | 
10 | #include <vector>
11 | 
12 | namespace RcclUnitTesting
13 | {
14 |     void callCollectiveForked(int nranks, int collID, const std::vector<int>& sendBuff, std::vector<int>& recvBuff, const std::vector<int>& expected, bool use_managed_mem = false);
15 | }
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/test/common/PrepDataFuncs.hpp:
--------------------------------------------------------------------------------
 1 |  /*************************************************************************
 2 |  * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | #pragma once
 7 | #include "ErrCode.hpp"
 8 | 
 9 | namespace RcclUnitTesting
10 | {
11 |   class CollectiveArgs;
12 | 
13 |   // Checks that enough memory has been allocated
14 |   ErrCode CheckAllocation(CollectiveArgs const& collArgs);
15 | 
16 |   // Default PrepareData functions
17 |   // PrepareData functions are responsible for setting up input / expected for the given collArgs
18 |   ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs);
19 |   ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs);
20 |   ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce);
21 |   ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather);
22 |   ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
23 |   ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
24 |   ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
25 |   ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs);
26 |   ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs);
27 |   ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs);
28 | }
29 | 


--------------------------------------------------------------------------------
/test/common/RcclMockFuncs.hpp:
--------------------------------------------------------------------------------
1 | #include "info.h"
2 | #include "comm.h"
3 | 
// Mock of ncclDebugLog: accepts the same arguments and discards them.
void ncclDebugLog(ncclDebugLogLevel, unsigned long, char const*, int, char const*, ...) {};
// Mock of getHostName: returns ncclSuccess without writing anything to `hostname`.
ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
  return ncclSuccess;
}
8 | 


--------------------------------------------------------------------------------
/test/common/StandaloneUtils.cpp:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | #include "CollectiveArgs.hpp"
 7 | #include "StandaloneUtils.hpp"
 8 | #include <iostream>
 9 | #include <regex>
10 | 
11 | 
12 | namespace RcclUnitTesting
13 | {
14 | 
// Runs `cmd` through popen() and returns everything the command wrote to its
// standard output.
//
// Returns an empty string (after logging to std::cerr) when the pipe cannot
// be opened. The command's exit status is not reported.
std::string executeCommand(const char* cmd) {
    std::string result;
    FILE* pipe = popen(cmd, "r");

    if (!pipe) {
        std::cerr << "Error executing command: " << cmd << std::endl;
        return result;
    }

    // Loop on fgets() itself rather than on feof(): fgets() returns NULL on
    // end-of-file AND on a read error, whereas feof() stays false after an
    // error and would make the loop spin forever.
    char buffer[128];
    while (fgets(buffer, sizeof(buffer), pipe) != NULL) {
        result += buffer;
    }

    pclose(pipe);
    return result;
}
34 | 
// Splits `str` at every occurrence of `delimiter` and returns the pieces in order.
//
// Empty pieces between consecutive delimiters (and before a leading delimiter)
// are kept; a single trailing delimiter does not produce a trailing empty
// piece, and an empty input yields an empty vector.
std::vector<std::string> splitString(const std::string& str, char delimiter) {
    std::vector<std::string> pieces;

    std::string::size_type begin = 0;
    while (begin < str.size()) {
        const std::string::size_type end = str.find(delimiter, begin);
        if (end == std::string::npos) {
            // Last piece: everything up to the end of the input.
            pieces.push_back(str.substr(begin));
            break;
        }
        pieces.push_back(str.substr(begin, end - begin));
        begin = end + 1;
    }

    return pieces;
}
46 | 
47 | 
48 | ArchInfo parseMetadata(const std::vector<std::string>& list) {
49 |     ArchInfo archInfo;
50 |     KernelInfo currKernelInfo;
51 |     
52 |     std::regex amdhsaTargetRegex("amdhsa.target:\\s+(?:'?)amdgcn-amd-amdhsa--(\\w+)(?:'?)");
53 |     std::regex kernelNameRegex("\\.name:\\s+(\\w+)");
54 |     std::regex privateSegmentSizeRegex("\\.private_segment_fixed_size:\\s+(\\d+)");
55 |     
56 |     for (const auto& line : list) {
57 |         std::smatch match;
58 | 
59 |         if (std::regex_search(line, match, amdhsaTargetRegex)) {
60 |             archInfo.archName = match[1];
61 |         } else if (std::regex_search(line, match, kernelNameRegex)) {
62 |             currKernelInfo.name = match[1];
63 |         } else if (std::regex_search(line, match, privateSegmentSizeRegex)) {
64 |             currKernelInfo.privateSegmentFixedSize = std::stoi(match[1]);
65 |         }
66 |         
67 |         if (!currKernelInfo.name.empty() && currKernelInfo.privateSegmentFixedSize != 0) {
68 |             archInfo.kernels.push_back(currKernelInfo);
69 |             currKernelInfo = {}; // Empty kernelInfo
70 |         }
71 |     }
72 |     
73 |     return archInfo;
74 | }
75 | 
76 | }
77 | 


--------------------------------------------------------------------------------
/test/common/StandaloneUtils.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef STANDALONE_UTILS_H
 2 | #define STANDALONE_UTILS_H
 3 | 
 4 | #include <cstdio>
 5 | #include <vector>
 6 | #include <string>
 7 | #include <rccl/rccl.h>
 8 | 
 9 | #define HIPCALL(cmd)                                                                          \
10 |     do {                                                                                      \
11 |         hipError_t error = (cmd);                                                             \
12 |         if (error != hipSuccess)                                                              \
13 |         {                                                                                     \
14 |             printf("Encountered HIP error (%s) at line %d in file %s\n",                      \
15 |                                   hipGetErrorString(error), __LINE__, __FILE__);              \
16 |             exit(-1);                                                                         \
17 |         }                                                                                     \
18 |     } while (0)
19 | 
20 | #define NCCLCHECK(cmd) do {                                     \
21 |     ncclResult_t res = cmd;                                     \
22 |     if (res != ncclSuccess) {                                   \
23 |          printf("NCCL failure %s:%d '%s'\n",                    \
24 |             __FILE__,__LINE__,ncclGetErrorString(res));         \
25 |     }                                                           \
26 | } while(0)
27 | 
28 | #define MAX_STACK_SIZE 570
29 | 
30 | #ifdef ENABLE_LL128
31 | #define MAX_STACK_SIZE_gfx90a 360
32 | #else
33 | #define MAX_STACK_SIZE_gfx90a MAX_STACK_SIZE
34 | #endif
35 | 
// Helpers for standalone tests that inspect kernel metadata.
namespace RcclUnitTesting
{
    // One kernel found in the metadata.
    struct KernelInfo {
        std::string name;                 // kernel name taken from a ".name:" entry
        int privateSegmentFixedSize = 0;  // value of ".private_segment_fixed_size:"; 0 until one is seen
    };

    // Architecture name and the kernels collected for it.
    struct ArchInfo {
        std::string archName;
        std::vector<KernelInfo> kernels;
    };

    // Runs `cmd` through popen() and returns what it wrote to standard output.
    std::string executeCommand(const char* cmd);

    // Splits `str` at each `delimiter` and returns the pieces in order.
    std::vector<std::string> splitString(const std::string& str, char delimiter);

    // Builds an ArchInfo from metadata text lines.
    ArchInfo parseMetadata(const std::vector<std::string>& list);
}
54 | #endif
55 | 


--------------------------------------------------------------------------------
/test/common/main.cpp:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include <gtest/gtest.h>
 8 | #include "EnvVars.hpp"
 9 | #include "TestBed.hpp"
10 | int main(int argc, char **argv)
11 | {
12 |   ::testing::InitGoogleTest(&argc, argv);
13 |   RcclUnitTesting::EnvVars ev;
14 |   ev.ShowConfig();
15 |   int retCode = RUN_ALL_TESTS();
16 |   printf("[ INFO     ] Total executed cases: %d\n", RcclUnitTesting::TestBed::NumTestsRun());
17 | 
18 |   // Show timing information
19 | 
20 |   if (ev.showTiming)
21 |   {
22 |     size_t totalTimeMsec = 0;
23 |     fflush(stdout);
24 |     printf("[ TIMING   ] %-20s: %-20s: %10s ms (%s)\n", "TEST SUITE", "TEST NAME", "TIME", "STATUS");
25 |     auto unitTest = ::testing::UnitTest::GetInstance();
26 |     for (int i = 0; i < unitTest->total_test_suite_count(); i++)
27 |     {
28 |       auto suiteInfo = unitTest->GetTestSuite(i);
29 |       if (!suiteInfo->should_run()) continue;
30 | 
31 |       for (int j = 0; j < suiteInfo->total_test_count(); j++)
32 |       {
33 |         auto testInfo = suiteInfo->GetTestInfo(j);
34 |         if (!testInfo->should_run()) continue;
35 |         auto testResult = testInfo->result();
36 |         if (testResult->Skipped()) continue;
37 |         printf("[ TIMING   ] %-20s: %-20s: %10.2f sec (%4s)\n", testInfo->test_suite_name(), testInfo->name(), testResult->elapsed_time() / 1000.0, testResult->Passed() ? "PASS" : "FAIL");
38 |       }
39 |       printf("[ TIMING   ] %-20s: %-20s: %10.2f sec (%4s)\n", suiteInfo->name(), "TOTAL", suiteInfo->elapsed_time() / 1000.0, suiteInfo->Passed() ? "PASS" : "FAIL");
40 |       totalTimeMsec += suiteInfo->elapsed_time();
41 |     }
42 |     printf("[ TIMING   ] Total time: %10.2f minutes\n", totalTimeMsec / (60 * 1000.0));
43 |   }
44 |   return retCode;
45 | }
46 | 


--------------------------------------------------------------------------------
/toolchain-linux.cmake:
--------------------------------------------------------------------------------
 1 | 
 2 | if (DEFINED ENV{ROCM_PATH})
 3 |   set(rocm_bin "$ENV{ROCM_PATH}/bin")
 4 | else()
 5 |   set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
 6 |   set(rocm_bin "/opt/rocm/bin")
 7 | endif()
 8 | 
 9 | if (NOT DEFINED ENV{CXX})
10 |   set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
11 | else()
12 |   set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
13 | endif()
14 | 
15 | if (NOT DEFINED ENV{CXXFLAGS})
16 |   set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
17 |   set(CMAKE_CXX_FLAGS_RELEASE "-O3")
18 | endif()
19 | 
20 | if (NOT DEFINED ENV{CC})
21 |   set(CMAKE_C_COMPILER "${rocm_bin}/amdclang" CACHE PATH "Path to the C compiler")
22 | else()
23 |   set(CMAKE_C_COMPILER "$ENV{CC}" CACHE PATH "Path to the C compiler")
24 | endif()
25 | 
26 | if (NOT DEFINED ENV{CFLAGS})
27 |   set(CMAKE_C_FLAGS_DEBUG "-g -O1")
28 |   set(CMAKE_C_FLAGS_RELEASE "-O3")
29 | endif()
30 | 


--------------------------------------------------------------------------------
/tools/EmptyKernelTest/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | ROCM_PATH ?= /opt/rocm
 3 | CUDA_PATH ?= /usr/local/cuda
 4 | 
 5 | HIPCC=$(ROCM_PATH)/bin/hipcc
 6 | NVCC=$(CUDA_PATH)/bin/nvcc
 7 | 
 8 | # Compile EmptyKernelTestCuda if nvcc detected
 9 | ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
10 | 	EXE=./EmptyKernelTestCuda
11 | else
12 | 	EXE=./EmptyKernelTest
13 | endif
14 | 
15 | all: $(EXE)
16 | 
17 | ./EmptyKernelTest:  EmptyKernelTest.cpp
18 | 	$(HIPCC) EmptyKernelTest.cpp -o EmptyKernelTest
19 | 
20 | ./EmptyKernelTestCuda: EmptyKernelTest.cpp
21 | 	$(NVCC) EmptyKernelTest.cpp -x cu -o EmptyKernelTestCuda
22 | 
23 | 
24 | clean:
25 | 	rm -f ./EmptyKernelTest ./EmptyKernelTestCuda
26 | 
27 | 


--------------------------------------------------------------------------------
/tools/GraphBench/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 2 | 
 3 | # Set to where RCCL is installed
 4 | RCCL_INSTALL=../../build/release
 5 | 
 6 | HIP_PATH?= $(wildcard /opt/rocm)
 7 | ifeq (,$(HIP_PATH))
 8 | HIP_PATH=../../..
 9 | endif
10 | HIPCC=$(HIP_PATH)/bin/hipcc
11 | 
12 | EXE=GraphBench
13 | CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL)/include -L$(RCCL_INSTALL) -lrccl
14 | 
15 | all: $(EXE)
16 | 
17 | $(EXE): $(EXE).cpp $(shell find -regex ".*\.\hpp")
18 | 	$(HIPCC) $(CXXFLAGS) $< -o $@
19 | 
20 | test: $(EXE)
21 | 	LD_LIBRARY_PATH=$(RCCL_INSTALL) RCCL_ENABLE_HIPGRAPH=1 ./$(EXE)
22 | 
23 | testInfo: $(EXE)
24 | 	NCCL_DEBUG=INFO LD_LIBRARY_PATH=$(RCCL_INSTALL) RCCL_ENABLE_HIPGRAPH=1 ./$(EXE)
25 | clean:
26 | 	rm -f *.o $(EXE)
27 | 


--------------------------------------------------------------------------------
/tools/HelloRccl/HelloRccl.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #ifndef HELLORCCL_HPP
24 | #define HELLORCCL_HPP
25 | #include <cstdlib>
26 | #include <iostream>
27 | 
28 | // Error-checking wrappers used by the HelloRccl sample.
29 | // This header does not include the HIP or RCCL headers itself: the including
30 | // file must already declare hipError_t / hipSuccess / hipGetErrorString and
31 | // ncclResult_t / ncclSuccess / ncclGetErrorString.
32 | 
33 | // Evaluates `cmd` once. If the result is not hipSuccess, prints the HIP error
34 | // string together with the line and file of the call site to stderr and exits
35 | // with status -1. The status variable has a macro-specific name so that a
36 | // `cmd` expression mentioning a caller variable named `error` is not shadowed.
37 | #define HIP_CALL(cmd)                                                   \
38 |   do {                                                                  \
39 |     hipError_t hipCallStatus_ = (cmd);                                  \
40 |     if (hipCallStatus_ != hipSuccess)                                   \
41 |     {                                                                   \
42 |       std::cerr << "Encountered HIP error (" << hipGetErrorString(hipCallStatus_) << ") at line " \
43 |                 << __LINE__ << " in file " << __FILE__ << "\n";         \
44 |       std::exit(-1);                                                    \
45 |     }                                                                   \
46 |   } while (0)
47 | 
48 | // Same contract as HIP_CALL, for calls returning ncclResult_t: anything other
49 | // than ncclSuccess is reported on stderr and terminates the process with -1.
50 | #define NCCL_CALL(cmd)                                                  \
51 |   do {                                                                  \
52 |     ncclResult_t ncclCallStatus_ = (cmd);                               \
53 |     if (ncclCallStatus_ != ncclSuccess)                                 \
54 |     {                                                                   \
55 |       std::cerr << "Encountered NCCL error (" << ncclGetErrorString(ncclCallStatus_) << ") at line " \
56 |                 << __LINE__ << " in file " << __FILE__ << "\n";         \
57 |       std::exit(-1);                                                    \
58 |     }                                                                   \
59 |   } while (0)
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/tools/HelloRccl/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
 2 | 
 3 | # Set to where RCCL is installed
 4 | RCCL_INSTALL=../../build/release
 5 | 
 6 | # Use /opt/rocm when it exists; otherwise fall back to a path relative to this directory
 7 | HIP_PATH?= $(wildcard /opt/rocm)
 8 | ifeq (,$(HIP_PATH))
 9 | HIP_PATH=../../..
10 | endif
11 | HIPCC=$(HIP_PATH)/bin/hipcc
12 | 
13 | EXE=HelloRccl
14 | CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL) -L$(RCCL_INSTALL) -lrccl
15 | 
16 | # Headers in this directory; editing any of them rebuilds the executable.
17 | # (Replaces a `find -regex` call whose pattern contained a stray backslash.)
18 | HEADERS = $(wildcard *.hpp)
19 | 
20 | # all and clean name actions, not files
21 | .PHONY: all clean
22 | 
23 | all: $(EXE)
24 | 
25 | # Only the first prerequisite ($<, the .cpp file) is handed to the compiler
26 | $(EXE): $(EXE).cpp $(HEADERS)
27 | 	$(HIPCC) $(CXXFLAGS) $< -o $@
28 | 
29 | clean:
30 | 	rm -f *.o $(EXE)
31 | 


--------------------------------------------------------------------------------
/tools/HelloRccl/runTest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs HelloRccl once as a single process and once as four separate processes.
 3 | # NOTE(review): the arguments look like "<number of ranks> [<rank of this process>]";
 4 | # confirm against HelloRccl.cpp.
 5 | RCCL_INSTALL=../../build/release
 6 | EXE=$PWD/HelloRccl
 7 | LDPATH=$LD_LIBRARY_PATH:$RCCL_INSTALL
 8 | 
 9 | echo "Single process:"
10 | NCCL_DEBUG=INFO LD_LIBRARY_PATH=$LDPATH "$EXE" 4
11 | 
12 | echo "Multi-process:"
13 | # The first three processes run in the background, the last one in the foreground;
14 | # all of them are given the same NCCL_COMM_ID.
15 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH "$EXE" 4 0 &
16 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH "$EXE" 4 1 &
17 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH "$EXE" 4 2 &
18 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH "$EXE" 4 3
19 | 
20 | # Do not return to the caller while background processes are still running
21 | wait
22 | 


--------------------------------------------------------------------------------
/tools/JitterBench/Common.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | #pragma once
24 | #include <cstdlib>
25 | #include <iostream>
26 | 
27 | // Evaluates `cmd` once. If the result is not hipSuccess, prints the HIP error
28 | // string plus the line and file of the call site to stdout and exits with -1.
29 | // The HIP runtime header must be included by the file that uses this macro.
30 | // The status variable has a macro-specific name so that a `cmd` expression
31 | // mentioning a caller variable named `error` is not shadowed.
32 | #define HIP_CALL(cmd)                                                                   \
33 |     do {                                                                                \
34 |         hipError_t hipCallStatus_ = (cmd);                                              \
35 |         if (hipCallStatus_ != hipSuccess)                                               \
36 |         {                                                                               \
37 |             std::cout << "Encountered HIP error (" << hipGetErrorString(hipCallStatus_) \
38 |                       << ") at line " << __LINE__ << " in file " << __FILE__ << "\n";   \
39 |             std::exit(-1);                                                              \
40 |         }                                                                               \
41 |     } while (0)
42 | 
43 | // Macro for collecting HW_REG_XCC_ID
44 | // When compiling for gfx942 or gfx950 the register is read with s_getreg_b32
45 | // into `val`; for every other target `val` is set to 0.
46 | // NOTE: the asm variant ends in ';' while the fallback does not, so always
47 | // terminate the call site with its own ';'.
48 | #if defined(__gfx942__) || defined(__gfx950__)
49 | #define GetXccId(val) \
50 |   asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
51 | #else
52 | #define GetXccId(val) \
53 |   val = 0
54 | #endif
55 | 
56 | // Macro for collecting HW_REG_HW_ID
57 | // For gfx1100/gfx1101/gfx1102 and when building with NVCC `val` is set to 0;
58 | // otherwise the register is read with s_getreg_b32 into `val`.
59 | #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
60 | #define GetHwId(val) \
61 |   val = 0
62 | #else
63 | #define GetHwId(val) \
64 |   asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (val));
65 | #endif
66 | 


--------------------------------------------------------------------------------
/tools/JitterBench/Makefile:
--------------------------------------------------------------------------------
 1 | ROCM_PATH ?= /opt/rocm
 2 | CUDA_PATH ?= /usr/local/cuda
 3 | HIPCC     = $(ROCM_PATH)/bin/hipcc
 4 | NVCC      = $(CUDA_PATH)/bin/nvcc
 5 | 
 6 | CCFLAGS   = -O3 -lhsa-runtime64 -fopenmp -lnuma
 7 | NVFLAGS   = -O3  -x cu -lnuma -Xcompiler -fopenmp -gencode=arch=compute_90,code=sm_90
 8 | 
 9 | # A non-empty MPI_DIR defines MPI_SUPPORT and adds the MPI include/library paths and -lmpi
10 | ifneq ("$(MPI_DIR)", "")
11 | MPIFLAGS = -DMPI_SUPPORT -I$(MPI_DIR)/include -L$(MPI_DIR)/lib -lmpi
12 | else
13 | MPIFLAGS =
14 | endif
15 | 
16 | # all and clean name actions, not files
17 | .PHONY: all clean
18 | 
19 | all: JitterBench
20 | 
21 | # nvcc is used when a file exists at $(NVCC), hipcc otherwise. Only the .cpp file ($<)
22 | # is compiled; the headers are listed so that editing them triggers a rebuild.
23 | JitterBench: JitterBench.cpp Common.hpp Timeline.hpp
24 | ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
25 | 	$(NVCC) $(NVFLAGS) $(MPIFLAGS) $< -o $@
26 | else
27 | 	$(HIPCC) $(CCFLAGS) $(MPIFLAGS) $< -o $@
28 | endif
29 | 
30 | clean:
31 | 	rm -f ./JitterBench
32 | 


--------------------------------------------------------------------------------
/tools/JitterBench/Timeline.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cinttypes>
 3 | #include <cstdint>
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | #include <string>
 7 | #include <vector>
 8 | 
 9 | // One bar of the generated timeline chart.
10 | struct TimelineData
11 | {
12 |   std::string rowLabel;   // written as the first column of the row
13 |   std::string barLabel;   // written as the second column
14 |   std::string toolTip;    // written as the third (tooltip) column
15 |   uint64_t    startTime;  // written as the 'Start' column
16 |   uint64_t    stopTime;   // written as the 'End' column
17 | };
18 | 
19 | // Writes an HTML page to outputFilename that draws timelineData with the
20 | // Google Charts "timeline" package (the page loads the chart library from
21 | // www.gstatic.com when it is opened).
22 | //
23 | //   outputFilename - file to create or overwrite
24 | //   rowLabelName   - id given to the first string column
25 | //   barLabelName   - id given to the second string column
26 | //   timelineData   - one chart row per element, in order
27 | //
28 | // If the file cannot be opened, a message is printed to stderr and nothing is
29 | // written. An empty timelineData still produces a well-formed script.
30 | // NOTE: labels and tooltips are written verbatim inside single-quoted
31 | // JavaScript strings; a value containing ' or \ will break the page.
32 | // Declared inline because this header contains the definition.
33 | inline void ExportToTimeLine(std::string outputFilename,
34 |                              std::string rowLabelName,
35 |                              std::string barLabelName,
36 |                              std::vector<TimelineData> const& timelineData)
37 | {
38 |   FILE *fp = fopen(outputFilename.c_str(), "w");
39 |   if (fp == NULL)
40 |   {
41 |     fprintf(stderr, "[ERROR] Unable to open %s for writing\n", outputFilename.c_str());
42 |     return;
43 |   }
44 | 
45 |   fprintf(fp, "<script type=\"text/javascript\" src=\"https://www.gstatic.com/charts/loader.js\"></script>\n");
46 |   fprintf(fp, "<script type=\"text/javascript\">\n");
47 |   fprintf(fp, "google.charts.load(\"current\", {packages:[\"timeline\"]});\n");
48 |   fprintf(fp, "google.charts.setOnLoadCallback(drawChart);\n");
49 |   fprintf(fp, "\n");
50 |   fprintf(fp, "function drawChart() {\n");
51 |   fprintf(fp, "  var container = document.getElementById('myTimeline');\n");
52 |   fprintf(fp, "  var chart = new google.visualization.Timeline(container);\n");
53 |   fprintf(fp, "  var dataTable = new google.visualization.DataTable();\n");
54 |   fprintf(fp, "\n");
55 |   fprintf(fp, "  dataTable.addColumn({ type: 'string', id:   '%s' });\n", rowLabelName.c_str());
56 |   fprintf(fp, "  dataTable.addColumn({ type: 'string', id:   '%s' });\n", barLabelName.c_str());
57 |   fprintf(fp, "  dataTable.addColumn({ type: 'string', role: 'tooltip'});\n");
58 |   fprintf(fp, "  dataTable.addColumn({ type: 'number', id:   'Start' });\n");
59 |   fprintf(fp, "  dataTable.addColumn({ type: 'number', id:   'End' });\n");
60 |   fprintf(fp, "  dataTable.addRows([\n");
61 | 
62 |   size_t const numRows = timelineData.size();
63 |   for (size_t i = 0; i < numRows; i++)
64 |   {
65 |     TimelineData const& t = timelineData[i];
66 |     // Every row ends with ',' except the last one, which closes the addRows([ call
67 |     fprintf(fp, "   [ '%s', '%s', '%s', %" PRIu64 ", %" PRIu64 " ]%s\n", t.rowLabel.c_str(),
68 |             t.barLabel.c_str(), t.toolTip.c_str(), t.startTime, t.stopTime, i + 1 == numRows ? "]);" : ",");
69 |   }
70 | 
71 |   // With no rows the loop above never closes the addRows([ call, so do it here
72 |   if (numRows == 0)
73 |     fprintf(fp, "  ]);\n");
74 | 
75 |   fprintf(fp, "  chart.draw(dataTable);\n");
76 |   fprintf(fp, "}\n");
77 |   fprintf(fp, "</script>\n");
78 |   fprintf(fp, "<div id=\"myTimeline\" style=\"width: 100%%; height: 100%%;\"></div>\n");
79 |   fclose(fp);
80 | }
81 | 


--------------------------------------------------------------------------------
/tools/JitterBench/runSweep.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs the benchmark once for every combination of the four parameters below and
 3 | # stores stdout+stderr of each run in output.<numBlocks>.<blockSize>.<numTimers>.<useNuma>.txt
 4 | #
 5 | # The Makefile in this directory builds ./JitterBench, so that is the default binary
 6 | # (the script used to call ./LaunchBench, which this Makefile does not build).
 7 | # Set BENCH to run a different executable.
 8 | # NOTE(review): confirm JitterBench accepts these four positional arguments.
 9 | BENCH=${BENCH:-./JitterBench}
10 | 
11 | for numBlocks in 1 2 4 8 16 32; do
12 | 		for blockSize in 64 128 256; do
13 | 				for numTimers in 0 1; do
14 | 						for useNuma in 0 1; do
15 | 								echo "numBlocks=$numBlocks blockSize=$blockSize numTimers=$numTimers useNuma=$useNuma";
16 | 								"$BENCH" $numBlocks $blockSize $numTimers $useNuma &> output.$numBlocks.$blockSize.$numTimers.$useNuma.txt
17 | 						done;
18 | 				done;
19 | 		done;
20 | done;
21 | 


--------------------------------------------------------------------------------
/tools/RcclReplayer/Makefile:
--------------------------------------------------------------------------------
 1 | ROCM_DIR ?= /opt/rocm
 2 | RCCL_DIR ?= ../../build/release
 3 | MPI_DIR  ?= /opt/ompi
 4 | 
 5 | INCLUDES = -I$(MPI_DIR)/include -I$(RCCL_DIR)/include -I$(RCCL_DIR)/hipify/src/include
 6 | LDFLAGS  = -L$(MPI_DIR)/lib -L$(RCCL_DIR) -lmpi -lrccl
 7 | 
 8 | EXE = rcclReplayer
 9 | 
10 | .PHONY: all main clean
11 | 
12 | all: $(EXE)
13 | 
14 | # `make main` keeps working as an alias for the previous target name
15 | main: $(EXE)
16 | 
17 | # The rule is named after the file it produces, so make can skip the build when
18 | # the executable is newer than the source (a target called `main` never exists
19 | # on disk and therefore rebuilt on every invocation).
20 | $(EXE): rcclReplayer.cpp
21 | 	$(ROCM_DIR)/bin/hipcc $< -O1 -g -o $@ $(INCLUDES) $(LDFLAGS)
22 | 
23 | clean:
24 | 	rm -f ./$(EXE)
25 | 


--------------------------------------------------------------------------------
/tools/TopoVisual/4_nodes.log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/rccl/e94b36024615b65ddcd372ac39d9e9f5f43f685f/tools/TopoVisual/4_nodes.log.png


--------------------------------------------------------------------------------
/tools/TopoVisual/README.md:
--------------------------------------------------------------------------------
 1 | # Topology Visualizer
 2 | Topology Visualizer extracts topology information from an RCCL log file and presents it graphically. Less-than-optimal connections between GPUs and nodes are highlighted in red for easy identification.
 3 | 
 4 | ## Requirements
 5 | The following packages are required to run Topology Visualizer:
 6 | 1. gawk
 7 | 2. graphviz
 8 | 
 9 | ## Usage
10 | Topology Visualizer accepts either RCCL log files or simulator output, i.e. output from [Topology Explorer](https://github.com/ROCm/rccl/tree/master/tools/topo_expl "Topology Explorer").
11 | 
12 | RCCL logs need to be collected with the NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=INIT,GRAPH environment variables set. Example command line:
13 | ```shell
14 | mpirun -np 4 -host rocm-framework-1,rocm-framework-3,rocm-framework-5,rocm-framework-6 \
15 |   -env HSA_FORCE_FINE_GRAIN_PCIE 1 -env NCCL_DEBUG INFO -env NCCL_DEBUG_SUBSYS INIT,GRAPH \
16 |   ~/rccl-tests/build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 | tee ~/4_nodes.log
17 | 
18 | ./topo_visual.sh -i 4_nodes.log
19 | ```
20 | 
21 | ## Legend
22 | 
23 | Solid lines: connections over P2P or shared memory
24 | 
25 | Dashed lines: connections over network
26 | 
27 | Green: P2P connections, network connections with GPU RDMA
28 | 
29 | Red: Connections over shared memory or without GPU RDMA
30 | 
31 | ## Example Output
32 | ![image info](./4_nodes.log.png)
33 | 
34 | ## Copyright
35 | All source code and accompanying documentation are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
36 | 


--------------------------------------------------------------------------------
/tools/TopoVisual/topo_visual.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | # THE SOFTWARE.
21 | 
22 | DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
23 | 
24 | # Print the usage line and terminate with status 1.
25 | exit_error() {
26 |   echo "Usage: $0 -i input_filename [ -o output_filename ]"
27 |   exit 1
28 | }
29 | 
30 | # Only a value given with -o selects the output file; ignore the environment
31 | OUTPUT_NAME=""
32 | 
33 | # The leading ':' in the option string selects silent error reporting: a
34 | # missing argument is reported as ':' and an unknown option as '?'.
35 | while getopts ":i:o:" options; do
36 |   case "${options}" in
37 |     i)
38 |       INPUT_NAME=${OPTARG}
39 |       ;;
40 |     o)
41 |       OUTPUT_NAME=${OPTARG}
42 |       ;;
43 |     :)
44 |       echo "Error: -${OPTARG} requires an argument."
45 |       exit_error
46 |       ;;
47 |     *)
48 |       exit_error
49 |       ;;
50 |   esac
51 | done
52 | 
53 | if [ -z "$INPUT_NAME" ]
54 | then
55 |   exit_error
56 | fi
57 | 
58 | if [ ! -r "$INPUT_NAME" ]
59 | then
60 |   echo "Error: cannot read input file '$INPUT_NAME'."
61 |   exit 1
62 | fi
63 | 
64 | # Default output name: the input name with .png appended
65 | if [ -z "$OUTPUT_NAME" ]
66 | then
67 |   OUTPUT_NAME="$INPUT_NAME.png"
68 | fi
69 | 
70 | # pipefail makes a failure of the awk script visible, not only a failure of dot
71 | set -o pipefail
72 | if ! "$DIR/extract_topo.awk" "$INPUT_NAME" | dot -Tpng -o "$OUTPUT_NAME"
73 | then
74 |   echo "Error: failed to extract topology from $INPUT_NAME"
75 |   exit 1
76 | fi
77 | echo "Extracted topology from $INPUT_NAME to $OUTPUT_NAME"
78 | 
79 | exit 0
80 | 


--------------------------------------------------------------------------------
/tools/TransferBench/README.md:
--------------------------------------------------------------------------------
1 | # TransferBench
2 | 
3 | TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs).
4 | TransferBench can now be found at: https://github.com/ROCm/TransferBench
5 | 
6 | ## Copyright
7 | 
8 | All source code and accompanying documentation is copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
9 | 


--------------------------------------------------------------------------------
/tools/ib-test/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 2 | HIP_PATH ?= $(wildcard /opt/rocm)
 3 | ifeq (,$(HIP_PATH))
 4 | HIP_PATH = ../../..
 5 | endif
 6 | HIPCC = $(HIP_PATH)/bin/hipcc
 7 | 
 8 | EXE = ib_test
 9 | CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/clique -DENABLE_TRACE -DRCCL_IB_TEST -ldl -lnuma
10 | 
11 | files = $(EXE).cpp utils.cpp ../../src/transport/net_ib.cc ../../src/misc/ibvwrap.cc ../../src/debug.cc
12 | 
13 | # all and clean name actions, not files
14 | .PHONY: all clean
15 | 
16 | all: $(EXE)
17 | 
18 | # All sources are compiled and linked in a single hipcc invocation
19 | $(EXE): $(files)
20 | 	$(HIPCC) $(CXXFLAGS) $^ -o $@
21 | 
22 | # Optional manual step, deliberately kept out of the recipe so that it is not
23 | # passed to the shell on every build: copy the binary to a remote host, e.g.
24 | #   scp ib_test rocm-framework-3:<this directory>
25 | 
26 | clean:
27 | 	rm -f *.o $(EXE)
28 | 


--------------------------------------------------------------------------------
/tools/p2p-latency-test/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT License.
 3 | HIP_PATH ?= $(wildcard /opt/rocm)
 4 | ifeq (,$(HIP_PATH))
 5 | HIP_PATH = ../../..
 6 | endif
 7 | HIPCC = $(HIP_PATH)/bin/hipcc
 8 | 
 9 | # Executables built by this Makefile; also the list that `clean` removes
10 | # (EXE was previously undefined, so `make clean` left both binaries behind)
11 | EXE = p2p_latency_test ll_latency_test
12 | CXXFLAGS = -g -O3
13 | 
14 | .PHONY: all clean
15 | 
16 | all: $(EXE)
17 | 
18 | p2p_latency_test: p2p_latency_test.cpp
19 | 	$(HIPCC) $(CXXFLAGS) $^ -o $@
20 | ll_latency_test: ll_latency_test.cpp
21 | 	$(HIPCC) $(CXXFLAGS) $^ -o $@
22 | 
23 | clean:
24 | 	rm -f *.o $(EXE)
25 | 


--------------------------------------------------------------------------------
/tools/p2p-latency-test/build_and_run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) Microsoft Corporation.
 3 | # Licensed under the MIT License.
 4 | 
 5 | # Stop if the build fails rather than running missing or stale binaries
 6 | make || exit 1
 7 | 
 8 | # Example run: test one-way latency between GPU 0 and GPU 1 in both directions.
 9 | export HSA_FORCE_FINE_GRAIN_PCIE=1
10 | 
11 | echo Running p2p_latency_test using GPU pair 0 1
12 | ./p2p_latency_test 0 1
13 | 
14 | sleep 1
15 | 
16 | echo Running p2p_latency_test using GPU pair 1 0
17 | ./p2p_latency_test 1 0
18 | 
19 | sleep 1
20 | 
21 | echo Running ll_latency_test using GPU pair 0 1
22 | ./ll_latency_test 0 1
23 | 
24 | sleep 1
25 | 
26 | echo Running ll_latency_test using GPU pair 1 0
27 | ./ll_latency_test 1 0
28 | 


--------------------------------------------------------------------------------
/tools/rccl-prim-test/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 2 | HIP_PATH?= $(wildcard /opt/rocm)
 3 | ifeq (,$(HIP_PATH))
 4 | HIP_PATH=../../..
 5 | endif
 6 | HIPCC=$(HIP_PATH)/bin/hipcc
 7 | 
 8 | EXE=rccl_prim_test
 9 | # rocRAND headers: look in the installation selected by HIP_PATH first and keep
10 | # the previously hard-coded /opt/rocm location as a fallback
11 | CXXFLAGS = -O3 -g -I$(HIP_PATH)/rocrand/include -I/opt/rocm/rocrand/include
12 | 
13 | .PHONY: all clean
14 | 
15 | all: $(EXE)
16 | 
17 | $(EXE): rccl_prim_test.cpp
18 | 	$(HIPCC) $(CXXFLAGS) $^ -o $@
19 | 
20 | clean:
21 | 	rm -f *.o $(EXE)
22 | 


--------------------------------------------------------------------------------
/tools/scripts/pytorch-all-reduce/README.md:
--------------------------------------------------------------------------------
 1 | Small benchmark utility for gpt-fast's all reduce. 
 2 | 
 3 | ### How to run 
 4 | Out-of-the-box run (this tries various sequence lengths and prints the perf results to the terminal):
 5 | ```
 6 | torchrun --nproc_per_node=8 all_reduce.py 
 7 | ```
 8 | 
 9 | To enable intra node all-reduce algorithms use:
10 | ```
11 | ENABLE_INTRA_NODE_COMM=1 torchrun --nproc_per_node=8 all_reduce.py
12 | ```
13 | 
14 | ### Rocprof trace script
15 | To create perfetto traces for each rank of each all reduce a bash script is provided. 
16 | ```
17 | ENABLE_INTRA_NODE_COMM=1 bash trace_runs.sh
18 | ```
19 | 
20 | ### Additional options:
21 | The tensor size is dependent on sequence_length and dim supplied in gpt fast. There are 4 different all-reduce calls in gpt-fast at runtime:
22 | - 1: [seq_len, dim]
23 | - 2: [seq_len, 2, dim]
24 | - 3: [1, dim]
25 | - 4: [1, 2, dim]
26 | ```
27 | --sequence_lengths (defaults to [50, 64, 128, 256, 512, 1024, 2048, 4096])
28 | --dim (defaults to 6144)
29 | --all_reduce (defaults to [0,1,2,3]) - Can be modified to only run a single all-reduce, mapping to the 4 all reduces listed above 
30 | --tracing - Enables tracing mode to skip CPU timers in recording 
31 | ```
32 | 
33 | 


--------------------------------------------------------------------------------
/tools/scripts/pytorch-all-reduce/trace_runs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | SEQUENCE_LENGTHS=(50 128 256 512 1024 2048 4096)
 4 | ALL_REDUCE_ALGOS=(1 2 3 4)
 5 | 
 6 | # Exported so that rocprofv3, torchrun and the processes they start inherit it;
 7 | # a plain assignment stays local to this shell and never reaches them.
 8 | export HIP_DEV_FORCE_KERNARG=1
 9 | 
10 | # One rocprofv3 run per (sequence length, all-reduce) pair; each run writes its
11 | # trace into its own directory below rocprof_trace/.
12 | for SEQ_LEN in "${SEQUENCE_LENGTHS[@]}"; do
13 | 	for ALGO in "${ALL_REDUCE_ALGOS[@]}"; do
14 | 		echo "Running sequence length $SEQ_LEN with intra-node all_reduce $ALGO"
15 | 		ENABLE_INTRA_NODE_COMM=1 rocprofv3 --hip-trace --kernel-trace --stats --output-format PFTRACE \
16 | 			-d "rocprof_trace/intranode_input${SEQ_LEN}_allreduce${ALGO}/" \
17 | 			-- torchrun --nproc_per_node=8 all_reduce.py --sequence_lengths "$SEQ_LEN" --all_reduce "$ALGO" --tracing
18 | 	done
19 | done
20 | 


--------------------------------------------------------------------------------
/tools/scripts/topo_val.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | # THE SOFTWARE.
21 | 
22 | DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
23 | # Both tools are located relative to this script; quoted so that a checkout
24 | # path containing spaces still works
25 | TOPO_EXPL="$DIR/../topo_expl/topo_expl"
26 | TOPO_VISUAL="$DIR/../TopoVisual/topo_visual.sh"
27 | 
28 | # Run topo_expl for every model index 0..88, save its output to topo_m<i>.log
29 | # and render that log with topo_visual.sh. A few indices need extra
30 | # environment variables.
31 | for i in {0..88}
32 | do
33 | 	LOG="topo_m$i.log"
34 | 	case $i in
35 | 		50|51)
36 | 			NCCL_COLLNET_ENABLE=1 "$TOPO_EXPL" -m $i > "$LOG"
37 | 			;;
38 | 		54)
39 | 			RCCL_ENABLE_MULTIPLE_SAT=1 NCCL_COLLNET_ENABLE=1 "$TOPO_EXPL" -m $i > "$LOG"
40 | 			;;
41 | 		*)
42 | 			"$TOPO_EXPL" -m $i > "$LOG"
43 | 			;;
44 | 	esac
45 | 	"$TOPO_VISUAL" -i "$LOG"
46 | done
47 | 


--------------------------------------------------------------------------------
/tools/time-trace/rccl-TimeTrace.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Directory that contains the build's .ninja_log
 4 | directory="../../build/release"
 5 | 
 6 | if command -v pip &>/dev/null; then
 7 |     echo "pip is already installed."
 8 | else
 9 |     echo "pip is not installed. Installing..."
10 |     sudo apt-get update
11 |     sudo apt install python3-pip
12 | fi
13 | 
14 | # Install a Python package with pip3 unless python3 can already import it.
15 | #   $1 - package name (used both as the import name and the pip name)
16 | ensure_python_library() {
17 |     local required_library=$1
18 |     if python3 -c "import $required_library" &> /dev/null; then
19 |         echo "$required_library is already installed."
20 |     else
21 |         echo "$required_library is not installed. Installing..."
22 |         pip3 install "$required_library"
23 |     fi
24 | }
25 | 
26 | ensure_python_library pandas
27 | ensure_python_library plotly
28 | 
29 | # Check if the file exists
30 | if [ ! -f "$directory/.ninja_log" ]; then
31 |   echo "File '$directory/.ninja_log' does not exist."
32 |   exit 1
33 | fi
34 | 
35 | # Keep only the first line for each distinct value of field 5 and store the
36 | # result as time_trace.log next to the original. The array lives inside awk,
37 | # so no shell variable is needed for it.
38 | if ! awk '!unique_values[$5]++' "$directory/.ninja_log" > "$directory/time_trace.log"; then
39 |   echo "Failed to write '$directory/time_trace.log'."
40 |   exit 1
41 | fi
42 | 
43 | # As before, .ninja_log itself is gone afterwards: only time_trace.log remains
44 | rm -f "$directory/.ninja_log"
45 | 
46 | # Run the python program
47 | python3 time_trace_generator.py --min_val 5 --include_linking


--------------------------------------------------------------------------------
/tools/topo_expl/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 2 | HIP_PATH ?= $(wildcard /opt/rocm)
 3 | ifeq (,$(HIP_PATH))
 4 | HIP_PATH = ../../..
 5 | endif
 6 | HIPCC = $(HIP_PATH)/bin/hipcc
 7 | 
 8 | EXE = topo_expl
 9 | CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread
10 | 
11 | files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
12 | 	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \
13 | 	hipify_rccl/collectives.cc hipify_rccl/register.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc
14 | 
15 | # all, hipify and clean name actions, not files
16 | .PHONY: all hipify clean
17 | 
18 | # Several entries of $(files) are produced by the hipify target, but no rule
19 | # says so. Serial execution keeps the prerequisite order of `all` (hipify
20 | # first, then the link) even when make is started with -j.
21 | .NOTPARALLEL:
22 | 
23 | all: hipify $(EXE)
24 | 
25 | $(EXE): $(files)
26 | 	$(HIPCC) $(CXXFLAGS) $^ -o $@
27 | 
28 | # Recreates hipify_rccl/ from ../../src, runs hipify-perl over the copies and
29 | # patches the copied common.h with the two sed commands below.
30 | hipify:
31 | 	rm -rf hipify_rccl
32 | 	mkdir -p hipify_rccl/device/include hipify_rccl/include/network/unpack
33 | 	cp -a ../../src/include/ hipify_rccl/
34 | 	cp -a ../../src/graph/ hipify_rccl/
35 | 	cp -a ../../src/device/*.h hipify_rccl/device/include
36 | 	cp -a ../../src/device/network/unpack/*.h hipify_rccl/include/network/unpack
37 | 	cp -a ../../src/enqueue.cc hipify_rccl/
38 | 	cp -a ../../src/register/register.cc hipify_rccl/
39 | 	cp -a ../../src/collectives.cc hipify_rccl/
40 | 	cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/
41 | 	hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
42 | 	hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h
43 | 	sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
44 | 	sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
45 | 	hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
46 | 	hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/*
47 | 	hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc
48 | 
49 | clean:
50 | 	rm -rf hipify_rccl
51 | 	rm -f *.o $(EXE)


--------------------------------------------------------------------------------
/tools/topo_expl/include/device_table.h:
--------------------------------------------------------------------------------
1 | #ifndef DEVICE_TABLE_COMPATIBILITY
2 | #define DEVICE_TABLE_COMPATIBILITY
3 | // No-op device-side stand-ins: each accepts a function index and ignores it.
4 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short /*funcIndex*/) noexcept { }
5 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short /*funcIndex*/) noexcept { }
6 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short /*funcIndex*/) noexcept { }
7 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short /*funcIndex*/) noexcept { }
8 | #endif
9 | 


--------------------------------------------------------------------------------
/tools/topo_expl/include/utils.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 4 |  *
 5 |  * See LICENSE.txt for license information
 6 |  ************************************************************************/
 7 | 
 8 | #ifndef UTILS_H_
 9 | #define UTILS_H_
10 | 
11 | // Plain-data record holding a handful of per-graph values.
12 | // NOTE(review): the field names suggest a subset of ncclTopoGraph - confirm
13 | // against the code that fills this struct.
14 | struct graphInfo {
15 |   int   pattern;
16 |   int   nChannels;
17 |   int   sameChannels;
18 |   float bwIntra;
19 |   float bwInter;
20 |   int   typeIntra;
21 |   int   typeInter;
22 | };
23 | 
24 | // Per-rank record: one graphInfo per algorithm (NCCL_NUM_ALGORITHMS entries),
25 | // the rank's ncclTopoRanks, a counter and three feature flags.
26 | // NCCL_NUM_ALGORITHMS and ncclTopoRanks must be declared before this header
27 | // is included.
28 | struct allGatherInfo {
29 |   struct graphInfo     graphInfo[NCCL_NUM_ALGORITHMS];
30 |   struct ncclTopoRanks topoRanks;
31 |   int                  nc;
32 |   bool                 pivotA2AEnabled;
33 |   bool                 ll128Enabled;
34 |   bool                 mscclEnabled;
35 | };
36 | 
37 | // The functions below are only declared here; their definitions live elsewhere.
38 | 
39 | void initCollNet();
40 | 
41 | ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile,
42 |                                struct ncclTopoSystem** system);
43 | 
44 | ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml,
45 |                                       struct ncclTopoSystem** topoSystem);
46 | 
47 | ncclResult_t fillInfo(struct ncclComm* comm,
48 |                       struct ncclPeerInfo* info,
49 |                       uint64_t commHash);
50 | 
51 | // `parent` is optional and defaults to NULL.
52 | ncclResult_t initTransportsRank_1(struct ncclComm* comm,
53 |                                   struct allGatherInfo *allGather3Data,
54 |                                   struct ncclTopoGraph& treeGraph,
55 |                                   struct ncclTopoGraph& ringGraph,
56 |                                   struct ncclTopoGraph& collNetGraph,
57 |                                   struct ncclTopoGraph& nvlsGraph,
58 |                                   struct ncclComm* parent = NULL);
59 | 
60 | ncclResult_t initTransportsRank_3(struct ncclComm* comm,
61 |                                   struct allGatherInfo *allGather3Data,
62 |                                   struct ncclTopoGraph& treeGraph,
63 |                                   struct ncclTopoGraph& ringGraph,
64 |                                   struct ncclTopoGraph& collNetGraph,
65 |                                   struct ncclTopoGraph& nvlsGraph);
66 | 
67 | // The timing macros expand to nothing in this build.
68 | #define TIME_START(index)
69 | #define TIME_STOP(index)
70 | #define TIME_CANCEL(index)
71 | #define TIME_PRINT(name)
72 | 
73 | #endif


--------------------------------------------------------------------------------
/tools/topo_expl/models/topo_3p_pcie.xml:
--------------------------------------------------------------------------------
 1 | <!--
 2 |  - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  - See LICENSE.txt for license information
 4 |  -->
 5 | <system version="2">
 6 |   <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
 7 |     <pci busid="0000:21:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
 8 |       <pci busid="0000:23:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
 9 |         <gpu dev="0" sm="104" gcn="910" arch="38911" rank="0" gdr="1"/>
10 |       </pci>
11 |     </pci>
12 |     <pci busid="0000:41:00.0" class="0x020700" vendor="0x15b3" device="0x101b" subsystem_vendor="0x15b3" subsystem_device="0x0007" link_speed="16.0 GT/s PCIe" link_width="16">
13 |       <nic>
14 |         <net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xad9c300039f59b8" maxconn="262144" gdr="1"/>
15 |       </nic>
16 |     </pci>
17 |   </cpu>
18 |   <cpu numaid="1" affinity="ffff0000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
19 |     <pci busid="0000:81:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
20 |       <pci busid="0000:83:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
21 |         <gpu dev="1" sm="104" gcn="910" arch="38911" rank="1" gdr="1"/>
22 |       </pci>
23 |     </pci>
24 |     <pci busid="0000:e2:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
25 |       <pci busid="0000:e4:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
26 |         <gpu dev="2" sm="104" gcn="910" arch="38911" rank="2" gdr="1"/>
27 |       </pci>
28 |     </pci>
29 |   </cpu>
30 | </system>
31 | 


--------------------------------------------------------------------------------
/tools/topo_expl/models/topo_3p_pcie_1.xml:
--------------------------------------------------------------------------------
 1 | <!--
 2 |  - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  - See LICENSE.txt for license information
 4 |  -->
 5 | <system version="2">
 6 |   <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
 7 |     <pci busid="0000:21:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
 8 |       <pci busid="0000:23:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
 9 |         <gpu dev="0" sm="104" gcn="910" arch="38911" rank="0" gdr="1"/>
10 |       </pci>
11 |     </pci>
12 |   </cpu>
13 |   <cpu numaid="1" affinity="ffff0000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
14 |     <pci busid="0000:81:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
15 |       <pci busid="0000:83:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
16 |         <gpu dev="1" sm="104" gcn="910" arch="38911" rank="1" gdr="1"/>
17 |       </pci>
18 |     </pci>
19 |     <pci busid="0000:e2:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
20 |       <pci busid="0000:e4:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
21 |         <gpu dev="2" sm="104" gcn="910" arch="38911" rank="2" gdr="1"/>
22 |       </pci>
23 |     </pci>
24 |     <pci busid="0000:a1:00.0" class="0x020700" vendor="0x15b3" device="0x101b" subsystem_vendor="0x15b3" subsystem_device="0x0007" link_speed="16.0 GT/s PCIe" link_width="16">
25 |       <nic>
26 |         <net name="mlx5_1" dev="0" speed="200000" port="1" guid="0x7657900003f6ceb8" maxconn="262144" gdr="1"/>
27 |       </nic>
28 |     </pci>
29 |   </cpu>
30 | </system>
31 | 


--------------------------------------------------------------------------------
/tools/topo_expl/models/topo_4p1h.xml:
--------------------------------------------------------------------------------
 1 | <!--
 2 |  - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  - See LICENSE.txt for license information
 4 |  -->
 5 | <system version="2">
 6 |   <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
 7 |     <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
 8 |       <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
 9 |         <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
10 |           <gpu dev="0" sm="64" gcn="906" arch="38911" rank="0" gdr="1">
11 |             <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
12 |             <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
13 |           </gpu>
14 |         </pci>
15 |       </pci>
16 |       <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
17 |         <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
18 |           <gpu dev="1" sm="64" gcn="906" arch="38911" rank="1" gdr="1">
19 |             <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
20 |             <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
21 |           </gpu>
22 |         </pci>
23 |       </pci>
24 |       <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
25 |         <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
26 |           <gpu dev="2" sm="64" gcn="906" arch="38911" rank="2" gdr="1">
27 |             <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
28 |             <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
29 |           </gpu>
30 |         </pci>
31 |       </pci>
32 |       <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
33 |         <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
34 |           <gpu dev="3" sm="64" gcn="906" arch="38911" rank="3" gdr="1">
35 |             <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
36 |             <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
37 |           </gpu>
38 |         </pci>
39 |       </pci>
40 |       <pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
41 |         <nic>
42 |           <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
43 |         </nic>
44 |       </pci>
45 |     </pci>
46 |   </cpu>
47 | </system>
48 | 


--------------------------------------------------------------------------------
/tools/topo_expl/models/topo_4p1h_1.xml:
--------------------------------------------------------------------------------
 1 | <!--
 2 |  - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 3 |  - See LICENSE.txt for license information
 4 |  -->
 5 | <system version="2">
 6 |   <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
 7 |     <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
 8 |       <pci busid="0000:1a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
 9 |         <gpu dev="0" sm="64" gcn="906" arch="38911" rank="0" gdr="0">
10 |           <xgmi target="0000:3d:00.0" count="1" tclass="0x038000"/>
11 |           <xgmi target="0000:b1:00.0" count="1" tclass="0x038000"/>
12 |         </gpu>
13 |       </pci>
14 |     </pci>
15 |     <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
16 |       <pci busid="0000:3d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
17 |         <gpu dev="1" sm="64" gcn="906" arch="38911" rank="1" gdr="0">
18 |           <xgmi target="0000:1a:00.0" count="1" tclass="0x038000"/>
19 |           <xgmi target="0000:88:00.0" count="1" tclass="0x038000"/>
20 |         </gpu>
21 |       </pci>
22 |     </pci>
23 |     <pci busid="0000:01:00.0" class="0x020000" link_speed="8 GT/s" link_width="4">
24 |       <nic>
25 |         <net name="eno1" dev="0" speed="10000" port="0" guid="0x0" maxconn="65536" gdr="0"/>
26 |       </nic>
27 |     </pci>
28 |     <nic>
29 |       <net name="virbr0" dev="1" speed="10000" port="0" guid="0x1" maxconn="65536" gdr="0"/>
30 |     </nic>
31 |   </cpu>
32 |   <cpu numaid="1" affinity="fffc00,0fffc000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
33 |     <pci busid="0000:86:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
34 |       <pci busid="0000:88:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
35 |         <gpu dev="2" sm="64" gcn="906" arch="38911" rank="2" gdr="0">
36 |           <xgmi target="0000:3d:00.0" count="1" tclass="0x038000"/>
37 |           <xgmi target="0000:b1:00.0" count="1" tclass="0x038000"/>
38 |         </gpu>
39 |       </pci>
40 |     </pci>
41 |     <pci busid="0000:af:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
42 |       <pci busid="0000:b1:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
43 |         <gpu dev="3" sm="64" gcn="906" arch="38911" rank="3" gdr="0">
44 |           <xgmi target="0000:1a:00.0" count="1" tclass="0x038000"/>
45 |           <xgmi target="0000:88:00.0" count="1" tclass="0x038000"/>
46 |         </gpu>
47 |       </pci>
48 |     </pci>
49 |   </cpu>
50 | </system>
51 | 


--------------------------------------------------------------------------------