├── .azuredevops ├── multinode-ci-nightly.yml ├── multinode-ci-pr.yml ├── rocm-ci.yml └── tests │ └── pytest │ └── HelloWorld.py ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md └── dependabot.yml ├── .gitignore ├── .gitmodules ├── .jenkins ├── common.groovy ├── extended.groovy ├── precheckin.groovy ├── staticanalysis.groovy └── staticlibrary.groovy ├── .readthedocs.yaml ├── CHANGELOG.md ├── CMakeLists.txt ├── CppCheckSuppressions.txt ├── LICENSE.txt ├── Makefile ├── NOTICES.txt ├── README.md ├── cmake ├── CheckSymbolExistsNoWarn.cmake ├── Dependencies.cmake ├── DownloadProject.CMakeLists.cmake.in ├── DownloadProject.cmake ├── FindIBVerbs.cmake ├── Findmscclpp_nccl.cmake ├── MSCCLPP.cmake └── scripts │ ├── add_faults.sh │ ├── add_unroll.sh │ ├── extract_metadata.cmake │ └── git_version.cmake ├── docker ├── Dockerfile.ubuntu └── README.md ├── docs ├── .gitignore ├── api-reference │ ├── api-library.rst │ └── library-specification.rst ├── attributions.rst ├── conf.py ├── data │ └── how-to │ │ └── rccl-usage-tips │ │ ├── in-place_allreduce.png │ │ └── out-of-place_allreduce.png ├── doxygen │ ├── Doxyfile │ └── mainpage.txt ├── how-to │ ├── rccl-usage-tips.rst │ ├── troubleshooting-rccl.rst │ ├── using-nccl.rst │ └── using-rccl-tuner-plugin-api.rst ├── index.rst ├── install │ ├── building-installing.rst │ ├── docker-install.rst │ └── installation.rst ├── license.rst ├── sphinx │ ├── _toc.yml.in │ ├── requirements.in │ └── requirements.txt └── what-is-rccl.rst ├── ext-net ├── README.md ├── example │ ├── Makefile │ ├── nccl │ │ ├── common.h │ │ ├── err.h │ │ ├── net.h │ │ ├── net_device.h │ │ ├── net_v2.h │ │ ├── net_v3.h │ │ ├── net_v4.h │ │ ├── net_v5.h │ │ ├── net_v6.h │ │ ├── net_v7.h │ │ ├── net_v8.h │ │ ├── net_v9.h │ │ └── types.h │ └── plugin.c └── google-fastsocket │ └── Makefile ├── ext-profiler ├── README.md └── example │ ├── Makefile │ ├── README.md │ ├── event.c │ ├── event.h │ ├── nccl │ ├── common.h │ ├── err.h │ ├── profiler.h │ ├── profiler_v1.h │ 
├── profiler_v2.h │ └── types.h │ ├── plugin.c │ ├── print_event.c │ └── print_event.h ├── ext-src ├── bf16-tuning.patch ├── check_ibv_access_relaxed_ordering.cc ├── cpx.patch ├── device-flag.patch ├── mem-reg.patch ├── mscclpp_ibv_access_relaxed_ordering.patch ├── no-cache.patch ├── non-multiple-128-fix.patch ├── read-allred.patch ├── reg-fix.patch └── remove-clip.patch ├── ext-tuner ├── README.md └── example │ ├── Makefile │ ├── nccl │ ├── common.h │ ├── err.h │ └── tuner.h │ └── plugin.c ├── install.sh ├── makefiles ├── common.mk ├── formatting.mk └── version.mk ├── pkg ├── Makefile ├── debian │ ├── .gitignore │ ├── Makefile │ ├── changelog.in │ ├── compat │ ├── control.in │ ├── copyright │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── libnccl2.install.in │ ├── rules │ └── source │ │ └── format ├── redhat │ ├── Makefile │ └── nccl.spec.in ├── srctxz │ ├── Makefile │ └── create_srctxz.sh.in └── txz │ ├── Makefile │ └── create_txz.sh.in ├── rtest.xml ├── src ├── Makefile ├── bootstrap.cc ├── channel.cc ├── collectives.cc ├── debug.cc ├── device │ ├── all_gather.h │ ├── all_reduce.h │ ├── alltoall_pivot.h │ ├── broadcast.h │ ├── common.cu │ ├── common.h │ ├── common_kernel.h │ ├── generate.py │ ├── msccl_kernel_impl.h │ ├── network │ │ └── unpack │ │ │ ├── unpack.h │ │ │ └── unpack_defs.h │ ├── onerank.cu │ ├── op128.h │ ├── primitives.h │ ├── prims_ll.h │ ├── prims_ll128.h │ ├── prims_simple.h │ ├── reduce.h │ ├── reduce_kernel.h │ ├── reduce_scatter.h │ └── sendrecv.h ├── enhcompat.cc ├── enqueue.cc ├── graph │ ├── connect.cc │ ├── paths.cc │ ├── rings.cc │ ├── rings.h │ ├── rome_models.cc │ ├── rome_models.h │ ├── search.cc │ ├── topo.cc │ ├── topo.h │ ├── trees.cc │ ├── tuning.cc │ ├── xml.cc │ └── xml.h ├── group.cc ├── include │ ├── BfdBacktrace.hpp │ ├── alloc.h │ ├── alt_rsmi.h │ ├── api_trace.h │ ├── archinfo.h │ ├── argcheck.h │ ├── bitops.h │ ├── bootstrap.h │ ├── channel.h │ ├── checks.h │ ├── coll_net.h │ ├── collectives.h │ ├── comm.h │ ├── core.h │ 
├── cpuset.h │ ├── cudawrap.h │ ├── debug.h │ ├── device.h │ ├── enqueue.h │ ├── gdrwrap.h │ ├── git_version.h │ ├── graph.h │ ├── group.h │ ├── hip_rocm_version_info.h │ ├── ibvcore.h │ ├── ibvsymbols.h │ ├── ibvwrap.h │ ├── info.h │ ├── ipcsocket.h │ ├── mnnvl.h │ ├── msccl │ │ ├── msccl_kernel.h │ │ ├── msccl_lifecycle.h │ │ ├── msccl_parser.h │ │ ├── msccl_scheduler.h │ │ ├── msccl_setup.h │ │ ├── msccl_status.h │ │ └── msccl_struct.h │ ├── mscclpp │ │ └── mscclpp_nccl.h │ ├── nccl_common.h │ ├── nccl_net.h │ ├── nccl_profiler.h │ ├── nccl_tuner.h │ ├── net.h │ ├── net_device.h │ ├── npkit │ │ ├── npkit.h │ │ ├── npkit_event.h │ │ └── npkit_struct.h │ ├── nvmlwrap.h │ ├── nvtx.h │ ├── nvtx3 │ │ ├── nvToolsExt.h │ │ ├── nvToolsExtCounters.h │ │ ├── nvToolsExtCuda.h │ │ ├── nvToolsExtCudaRt.h │ │ ├── nvToolsExtMem.h │ │ ├── nvToolsExtMemCudaRt.h │ │ ├── nvToolsExtOpenCL.h │ │ ├── nvToolsExtPayload.h │ │ ├── nvToolsExtPayloadHelper.h │ │ ├── nvToolsExtSemanticsCounters.h │ │ ├── nvToolsExtSemanticsScope.h │ │ ├── nvToolsExtSync.h │ │ ├── nvtx3.hpp │ │ └── nvtxDetail │ │ │ ├── nvtxExtHelperMacros.h │ │ │ ├── nvtxExtImpl.h │ │ │ ├── nvtxExtImplCounters_v1.h │ │ │ ├── nvtxExtImplMemCudaRt_v1.h │ │ │ ├── nvtxExtImplMem_v1.h │ │ │ ├── nvtxExtImplPayload_v1.h │ │ │ ├── nvtxExtInit.h │ │ │ ├── nvtxExtPayloadHelperInternal.h │ │ │ ├── nvtxExtPayloadTypeInfo.h │ │ │ ├── nvtxExtTypes.h │ │ │ ├── nvtxImpl.h │ │ │ ├── nvtxImplCore.h │ │ │ ├── nvtxImplCudaRt_v3.h │ │ │ ├── nvtxImplCuda_v3.h │ │ │ ├── nvtxImplOpenCL_v3.h │ │ │ ├── nvtxImplSync_v3.h │ │ │ ├── nvtxInit.h │ │ │ ├── nvtxInitDecls.h │ │ │ ├── nvtxInitDefs.h │ │ │ ├── nvtxLinkOnce.h │ │ │ └── nvtxTypes.h │ ├── nvtx_payload_schemas.h │ ├── nvtx_stub.h │ ├── p2p.h │ ├── param.h │ ├── profiler.h │ ├── proxy.h │ ├── ras.h │ ├── rccl_common.h │ ├── rccl_float8.h │ ├── rccl_vars.h │ ├── recorder.h │ ├── register.h │ ├── rocm_smi_wrap.h │ ├── rocmwrap.h │ ├── roctx.h │ ├── shm.h │ ├── shmutils.h │ ├── signals.h │ ├── 
socket.h │ ├── strongstream.h │ ├── timer.h │ ├── transport.h │ ├── trees.h │ ├── tuner.h │ └── utils.h ├── init.cc ├── init_nvtx.cc ├── misc │ ├── alt_rsmi.cc │ ├── api_trace.c │ ├── api_trace.cc │ ├── archinfo.cc │ ├── argcheck.cc │ ├── cudawrap.cc │ ├── gdrwrap.cc │ ├── ibvsymbols.cc │ ├── ibvwrap.cc │ ├── ipcsocket.cc │ ├── msccl │ │ ├── msccl_lifecycle.cc │ │ ├── msccl_parser.cc │ │ ├── msccl_setup.cc │ │ └── msccl_status.cc │ ├── mscclpp │ │ ├── mscclpp_nccl.cc │ │ └── mscclpp_nccl_syms.txt │ ├── npkit.cc │ ├── nvmlwrap.cc │ ├── nvmlwrap_stub.cc │ ├── param.cc │ ├── profiler.cc │ ├── recorder.cc │ ├── rocm_smi_wrap.cc │ ├── rocmwrap.cc │ ├── roctx.cc │ ├── shmutils.cc │ ├── signals.cc │ ├── socket.cc │ ├── strongstream.cc │ ├── tuner.cc │ └── utils.cc ├── mnnvl.cc ├── msccl.cc ├── nccl.h.in ├── nccl.pc.in ├── net.cc ├── proxy.cc ├── ras │ ├── client.cc │ ├── client_support.cc │ ├── collectives.cc │ ├── peers.cc │ ├── ras.cc │ ├── ras_internal.h │ └── rasnet.cc ├── rccl_wrap.cc ├── register │ ├── coll_reg.cc │ ├── register.cc │ └── sendrecv_reg.cc ├── transport.cc └── transport │ ├── coll_net.cc │ ├── generic.cc │ ├── net.cc │ ├── net_ib.cc │ ├── net_socket.cc │ ├── nvls.cc │ ├── p2p.cc │ └── shm.cc ├── test ├── AllGatherTests.cpp ├── AllReduceTests.cpp ├── AllToAllTests.cpp ├── AllToAllVTests.cpp ├── BroadcastTests.cpp ├── CMakeLists.txt ├── GatherTests.cpp ├── GroupCallTests.cpp ├── NonBlockingTests.cpp ├── ReduceScatterTests.cpp ├── ReduceTests.cpp ├── ScatterTests.cpp ├── SendRecvTests.cpp ├── StandaloneTests.cpp ├── _RecorderTests.cpp └── common │ ├── CallCollectiveForked.cpp │ ├── CallCollectiveForked.hpp │ ├── CollectiveArgs.cpp │ ├── CollectiveArgs.hpp │ ├── EnvVars.cpp │ ├── EnvVars.hpp │ ├── ErrCode.hpp │ ├── PrepDataFuncs.cpp │ ├── PrepDataFuncs.hpp │ ├── PtrUnion.cpp │ ├── PtrUnion.hpp │ ├── RcclMockFuncs.hpp │ ├── StandaloneUtils.cpp │ ├── StandaloneUtils.hpp │ ├── TestBed.cpp │ ├── TestBed.hpp │ ├── TestBedChild.cpp │ ├── TestBedChild.hpp │ └── 
main.cpp ├── toolchain-linux.cmake └── tools ├── EmptyKernelTest ├── EmptyKernelTest.cpp └── Makefile ├── GraphBench ├── GraphBench.cpp └── Makefile ├── HelloRccl ├── HelloRccl.cpp ├── HelloRccl.hpp ├── Makefile └── runTest.sh ├── JitterBench ├── Common.hpp ├── Compatibility.hpp ├── GetClosestNumaNode.hpp ├── JitterBench.cpp ├── Makefile ├── Timeline.hpp └── runSweep.sh ├── RcclReplayer ├── Makefile ├── README.md ├── rcclReplayer.cpp └── rcclReplayer.hpp ├── TopoVisual ├── 4_nodes.log.png ├── README.md ├── extract_topo.awk └── topo_visual.sh ├── TransferBench └── README.md ├── ib-test ├── Makefile ├── ib_test.cpp ├── include │ └── nccl.h └── utils.cpp ├── msccl-algorithms ├── allgather_16n_direct_0_3m_ll128.xml ├── allgather_16n_direct_0_3m_ll128_op.xml ├── allgather_32n_direct_0_6m_ll128.xml ├── allgather_32n_direct_0_6m_ll128_op.xml ├── allreduce-allpairs-8n-ll-32tb-op.xml ├── allreduce-allpairs-8n-ll-32tb.xml ├── allreduce-allpairs-8n-ll-64tb-op.xml ├── allreduce-allpairs-8n-ll-64tb.xml ├── allreduce-allpairs-8n-simple-op.xml ├── allreduce-allpairs-8n-simple.xml ├── alltoall-8n-0-9kb.xml ├── alltoall-8n-190kb-512kb.xml ├── alltoall-8n-512kb-7mb.xml ├── alltoall-8n-7mb-43mb.xml └── alltoall-8n-9kb-190kb.xml ├── msccl-unit-test-algorithms ├── all-reduce-ring-ll.xml ├── all-reduce-ring-ll128.xml └── all-reduce-ring-simple.xml ├── p2p-latency-test ├── Makefile ├── build_and_run.sh ├── ll_latency_test.cpp ├── ll_latency_test.cu └── p2p_latency_test.cpp ├── rccl-prim-test ├── Makefile ├── copy_kernel.h └── rccl_prim_test.cpp ├── scripts ├── npkit_trace_analysis.py ├── npkit_trace_generator.py ├── pytorch-all-reduce │ ├── README.md │ ├── all_reduce.py │ └── trace_runs.sh ├── pytorch-log-parser.py ├── rcclDiagnostics.py ├── rccl_bw_test.py ├── rocprof-log-parser.py ├── topo_val.sh └── ucx_ompi_rccl_rccltests_TB_script.sh ├── time-trace ├── rccl-TimeTrace.sh └── time_trace_generator.py └── topo_expl ├── Makefile ├── include ├── device_table.h ├── model.h ├── nccl.h └── 
utils.h ├── model.cpp ├── models ├── topo_16p1h.xml ├── topo_16p1h_vm.xml ├── topo_16p_gio-1s-1rp-cascade.xml ├── topo_16p_gio-3s-1rp-split-flat.xml ├── topo_3p_pcie.xml ├── topo_3p_pcie_1.xml ├── topo_4p1h.xml ├── topo_4p1h_1.xml ├── topo_4p2h.xml ├── topo_4p2h_1.xml ├── topo_4p2h_2nic.xml ├── topo_4p3l.xml ├── topo_4p3l_2h.xml ├── topo_4p3l_ia.xml ├── topo_4p3l_n2.xml ├── topo_4p3l_n2_1.xml ├── topo_4p3l_n4.xml ├── topo_4p4h.xml ├── topo_4p_942.xml ├── topo_8p1h.xml ├── topo_8p1h_1.xml ├── topo_8p1h_2.xml ├── topo_8p1h_3.xml ├── topo_8p1h_4.xml ├── topo_8p1h_5.xml ├── topo_8p1h_n1.xml ├── topo_8p6l.xml ├── topo_8p6l_1nic.xml ├── topo_8p6l_2nic.xml ├── topo_8p6l_3nic.xml ├── topo_8p6l_4nic.xml ├── topo_8p6l_5nic.xml ├── topo_8p6l_6nic.xml ├── topo_8p_4nics.xml ├── topo_8p_90a.xml ├── topo_8p_90a_1.xml ├── topo_8p_942.xml ├── topo_8p_942vm.xml ├── topo_8p_pcie.xml ├── topo_8p_pcie_1.xml ├── topo_8p_pcie_2nic.xml ├── topo_8p_rome.xml ├── topo_8p_rome_4n_1.xml ├── topo_8p_rome_4n_2.xml ├── topo_8p_rome_4nics.xml ├── topo_8p_rome_n2.xml ├── topo_8p_rome_n2_1.xml ├── topo_8p_rome_n2_2.xml ├── topo_8p_rome_n4.xml ├── topo_8p_rome_n4_1.xml ├── topo_8p_rome_pcie.xml ├── topo_8p_rome_vm1.xml ├── topo_8p_ts1.xml ├── topo_8p_ts1_1.xml ├── topo_8p_ts1_n4.xml ├── topo_8p_ts1_n4_1.xml ├── topo_8p_ts1_n4_2.xml ├── topo_collnet_n1.xml └── topo_collnet_n4.xml ├── topo_expl.cpp └── utils.cpp /.azuredevops/multinode-ci-nightly.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | repositories: 3 | - repository: pipelines_repo 4 | type: github 5 | endpoint: ROCm 6 | name: ROCm/ROCm 7 | 8 | variables: 9 | - group: common 10 | - template: /.azuredevops/variables-global.yml@pipelines_repo 11 | - name: pytestFolder 12 | value: '.azuredevops/tests/pytest' 13 | 14 | parameters: 15 | - name: pytestList 16 | type: object 17 | default: 18 | - HelloWorld 19 | 20 | trigger: none 21 | pr: none 22 | schedules: 23 | - cron: "0 5 * 11-3 *" # 11 PM 
CST (November - March) 24 | displayName: "Nightly Build (CST)" 25 | branches: 26 | include: 27 | - develop 28 | always: false 29 | 30 | - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October) 31 | displayName: "Nightly Build (CDT)" 32 | branches: 33 | include: 34 | - develop 35 | always: false 36 | 37 | jobs: 38 | - job: rccl 39 | timeoutInMinutes: 180 40 | pool: rocm-ci_rccl_pool 41 | workspace: 42 | clean: all 43 | steps: 44 | - task: DeleteFiles@1 45 | inputs: 46 | Contents: '**/*' 47 | - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo 48 | parameters: 49 | submoduleBehaviour: recursive 50 | - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo 51 | parameters: 52 | installEnabled: false 53 | printDiskSpace: false 54 | extraBuildFlags: >- 55 | -DCMAKE_BUILD_TYPE=Release 56 | -DBUILD_TESTS=ON 57 | -GNinja 58 | - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo 59 | parameters: 60 | componentName: rccl 61 | testDir: $(Build.SourcesDirectory)/build/test 62 | testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests' 63 | testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes' 64 | - ${{ each pytestScript in parameters.pytestList }}: 65 | - task: Bash@3 66 | displayName: Test ${{ pytestScript }} 67 | continueOnError: true 68 | inputs: 69 | targetType: inline 70 | workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder) 71 | script: pytest ${{ pytestScript }}.py 72 | -------------------------------------------------------------------------------- /.azuredevops/multinode-ci-pr.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | repositories: 3 | - repository: pipelines_repo 4 | type: github 5 | endpoint: ROCm 6 | name: ROCm/ROCm 7 | 8 | variables: 9 | - group: common 10 | - template: 
/.azuredevops/variables-global.yml@pipelines_repo 11 | - name: pytestFolder 12 | value: '.azuredevops/tests/pytest' 13 | 14 | parameters: 15 | - name: pytestList 16 | type: object 17 | default: 18 | - HelloWorld 19 | 20 | trigger: none 21 | pr: 22 | autoCancel: true 23 | branches: 24 | include: 25 | - develop 26 | paths: 27 | exclude: 28 | - .github 29 | - .jenkins 30 | - docs 31 | - '*.md' 32 | - LICENSE.txt 33 | - NOTICES.txt 34 | drafts: false 35 | 36 | stages: 37 | - stage: rcclStage 38 | displayName: 'RCCL develop PR' 39 | jobs: 40 | - deployment: rccl_pr_approval 41 | displayName: "CI Run Requires Approval" 42 | environment: rccl 43 | - job: rccl 44 | timeoutInMinutes: 180 45 | pool: rocm-ci_rccl_pool 46 | workspace: 47 | clean: all 48 | steps: 49 | - task: DeleteFiles@1 50 | inputs: 51 | Contents: '**/*' 52 | - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo 53 | parameters: 54 | submoduleBehaviour: recursive 55 | - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo 56 | parameters: 57 | installEnabled: false 58 | printDiskSpace: false 59 | extraBuildFlags: >- 60 | -DCMAKE_BUILD_TYPE=Release 61 | -DBUILD_TESTS=ON 62 | -DGPU_TARGETS=gfx942 63 | -GNinja 64 | - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo 65 | parameters: 66 | componentName: rccl 67 | testDir: $(Build.SourcesDirectory)/build/test 68 | testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests' 69 | testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes' 70 | - ${{ each pytestScript in parameters.pytestList }}: 71 | - task: Bash@3 72 | displayName: Test ${{ pytestScript }} 73 | continueOnError: true 74 | inputs: 75 | targetType: inline 76 | workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder) 77 | script: pytest ${{ pytestScript }}.py 78 | 
-------------------------------------------------------------------------------- /.azuredevops/rocm-ci.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | repositories: 3 | - repository: pipelines_repo 4 | type: github 5 | endpoint: ROCm 6 | name: ROCm/ROCm 7 | 8 | variables: 9 | - group: common 10 | - template: /.azuredevops/variables-global.yml@pipelines_repo 11 | 12 | trigger: 13 | batch: true 14 | branches: 15 | include: 16 | - develop 17 | - mainline 18 | paths: 19 | exclude: 20 | - .github 21 | - .jenkins 22 | - docs 23 | - '.*.y*ml' 24 | - '*.md' 25 | - LICENSE.txt 26 | - NOTICES.txt 27 | 28 | pr: 29 | autoCancel: true 30 | branches: 31 | include: 32 | - develop 33 | - mainline 34 | paths: 35 | exclude: 36 | - .github 37 | - .jenkins 38 | - docs 39 | - '.*.y*ml' 40 | - '*.md' 41 | - LICENSE.txt 42 | - NOTICES.txt 43 | drafts: false 44 | 45 | jobs: 46 | - template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo 47 | -------------------------------------------------------------------------------- /.azuredevops/tests/pytest/HelloWorld.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | def test_HelloWorld(): 4 | greeting = "Hello, World!" 5 | assert greeting == "Hello, World!" 
6 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @wenkaidu @gilbertlee-amd @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @BertanDogancay @rahulvaidya20 @isaki001 @PJAvinash @AbandiGa @Nikhil-Nunna @haripriya-amd # Documentation files 2 | docs/ @ROCm/rocm-documentation 3 | *.md @ROCm/rocm-documentation 4 | *.rst @ROCm/rocm-documentation 5 | .readthedocs.yaml @ROCm/rocm-documentation 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Details 2 | ___Do not mention proprietary info or link to internal work items in this PR.___ 3 | 4 | **Work item:** _"Internal", or link to GitHub issue (if applicable)._ 5 | 6 | **What were the changes?** 7 | _One sentence describing the work done._ 8 | 9 | **Why were the changes made?** 10 | _Explain the motivation behind the work. Provide any publicly-available historical context._ 11 | 12 | **How was the outcome achieved?** 13 | _Technical details behind the work. Explain any publicly-available hardware peculiarities._ 14 | 15 | **Additional Documentation:** 16 | _What else should the reviewer know?_ 17 | 18 | ## Approval Checklist 19 | ___Do not approve until these items are satisfied.___ 20 | - [ ] Verify the CHANGELOG has been updated, if 21 | - there are any NCCL API version changes, 22 | - any changes impact library users, and/or 23 | - any changes impact any other ROCm library. 
24 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/docs/sphinx" # Location of package manifests 10 | open-pull-requests-limit: 10 11 | schedule: 12 | interval: "daily" 13 | labels: 14 | - "dependencies" 15 | - "ci:docs-only" 16 | reviewers: 17 | - "samjwu" 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 2 | *.gcov 3 | /coverage/ 4 | build/ 5 | ext/ 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext-src/mscclpp"] 2 | path = ext-src/mscclpp 3 | url = https://github.com/microsoft/mscclpp.git 4 | ignore = dirty 5 | shallow = true 6 | [submodule "ext-src/json"] 7 | path = ext-src/json 8 | url = https://github.com/nlohmann/json.git 9 | ignore = dirty 10 | shallow = true 11 | -------------------------------------------------------------------------------- /.jenkins/common.groovy: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. 2 | // This file is for internal AMD use. 
3 | // If you are interested in running your own Jenkins, please raise a github issue for assistance. 4 | 5 | def runCompileCommand(platform, project, jobName) 6 | { 7 | project.paths.construct_build_prefix() 8 | 9 | def command = """#!/usr/bin/env bash 10 | set -x 11 | cd ${project.paths.project_build_prefix} 12 | ${project.paths.build_command} 13 | """ 14 | 15 | platform.runCommand(this,command) 16 | } 17 | 18 | def runTestCommand (platform, project, gfilter, envars) 19 | { 20 | String sudo = auxiliary.sudo(platform.jenkinsLabel) 21 | 22 | def command = """#!/usr/bin/env bash 23 | set -x 24 | export RUN_TEST_ROOT=\$(pwd) 25 | cd ${project.paths.project_build_prefix}/build/release/test 26 | ${sudo} ulimit -l unlimited 27 | ulimit -a 28 | ${sudo} ${envars} LD_LIBRARY_PATH=\${RUN_TEST_ROOT}/${project.paths.project_build_prefix}/build/release:\${LD_LIBRARY_PATH} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 UT_MULTITHREAD=1 UT_PROCESS_MASK=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes 29 | """ 30 | 31 | platform.runCommand(this, command) 32 | } 33 | 34 | def runPackageCommand(platform, project, jobName) 35 | { 36 | def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") 37 | 38 | platform.runCommand(this, packageHelper[0]) 39 | platform.archiveArtifacts(this, packageHelper[1]) 40 | } 41 | 42 | return this 43 | -------------------------------------------------------------------------------- /.jenkins/staticanalysis.groovy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env groovy 2 | // Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. 3 | // This shared library is available at https://github.com/ROCm/rocJENKINS/ 4 | @Library('rocJenkins@pong') _ 5 | 6 | // This file is for internal AMD use. 
7 | // If you are interested in running your own Jenkins, please raise a github issue for assistance. 8 | 9 | import com.amd.project.* 10 | import com.amd.docker.* 11 | import java.nio.file.Path 12 | 13 | def runCompileCommand(platform, project, jobName, boolean debug=false) 14 | { 15 | project.paths.construct_build_prefix() 16 | } 17 | 18 | def runCI = 19 | { 20 | nodeDetails, jobName-> 21 | 22 | def prj = new rocProject('rccl', 'StaticAnalysis') 23 | 24 | // Define test architectures, optional rocm version argument is available 25 | def nodes = new dockerNodes(nodeDetails, jobName, prj) 26 | 27 | boolean formatCheck = false 28 | boolean staticAnalysis = true 29 | 30 | def compileCommand = 31 | { 32 | platform, project-> 33 | 34 | runCompileCommand(platform, project, jobName, false) 35 | } 36 | 37 | buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) 38 | } 39 | 40 | ci: { 41 | String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) 42 | 43 | def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], 44 | "rocm-docker":[]] 45 | propertyList = auxiliary.appendPropertyList(propertyList) 46 | 47 | def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]] 48 | jobNameList = auxiliary.appendJobNameList(jobNameList) 49 | 50 | propertyList.each 51 | { 52 | jobName, property-> 53 | if (urlJobName == jobName) 54 | properties(auxiliary.addCommonProperties(property)) 55 | } 56 | 57 | jobNameList.each 58 | { 59 | jobName, nodeDetails-> 60 | if (urlJobName == jobName) 61 | stage(jobName) { 62 | runCI(nodeDetails, jobName) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: 
ubuntu-22.04 8 | tools: 9 | python: "3.10" 10 | 11 | sphinx: 12 | configuration: docs/conf.py 13 | 14 | formats: [htmlzip, pdf, epub] 15 | 16 | python: 17 | install: 18 | - requirements: docs/sphinx/requirements.txt 19 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Attributions 3 | 4 | Contains contributions from NVIDIA. 5 | 6 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 7 | Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. 8 | Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions 12 | are met: 13 | 14 | * Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | * Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 20 | Laboratory, the U.S. Department of Energy, nor the names of their 21 | contributors may be used to endorse or promote products derived 22 | from this software without specific prior written permission. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY 25 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 29 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 30 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 31 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 32 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 33 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 34 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | 36 | The U.S. Department of Energy funded the development of this software 37 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 38 | 39 | 40 | This code also includes files from the NVIDIA Tools Extension SDK project. 41 | 42 | See: 43 | 44 | https://github.com/NVIDIA/NVTX 45 | 46 | for more information and license details. 47 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /cmake/CheckSymbolExistsNoWarn.cmake: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile, 24 | # while ROCMChecks gives a warning if this variable is modified manually without a target. 25 | 26 | # We now choose to disable ROCMChecks for this one case. 27 | 28 | set(DISABLE_ROCM_CHECK OFF) 29 | 30 | function(rocm_check_toolchain_var var access value list_file) 31 | if(NOT DISABLE_ROCM_CHECK) 32 | _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}") 33 | endif() 34 | endfunction() 35 | 36 | macro(CHECK_SYMBOL_EXISTS) 37 | set(DISABLE_ROCM_CHECK ON) 38 | _check_symbol_exists(${ARGN}) 39 | set(DISABLE_ROCM_CHECK OFF) 40 | endmacro() 41 | -------------------------------------------------------------------------------- /cmake/DownloadProject.CMakeLists.cmake.in: -------------------------------------------------------------------------------- 1 | # Distributed under the OSI-approved MIT License. See accompanying 2 | # file LICENSE or https://github.com/Crascit/DownloadProject for details. 3 | 4 | cmake_minimum_required(VERSION 2.8.2) 5 | 6 | project(${DL_ARGS_PROJ}-download NONE) 7 | 8 | include(ExternalProject) 9 | ExternalProject_Add(${DL_ARGS_PROJ}-download 10 | ${DL_ARGS_UNPARSED_ARGUMENTS} 11 | SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" 12 | BUILD_IN_SOURCE TRUE 13 | TEST_COMMAND "" 14 | ) -------------------------------------------------------------------------------- /cmake/FindIBVerbs.cmake: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | find_path(IBVERBS_INCLUDE_DIRS 24 | NAMES infiniband/verbs.h 25 | HINTS 26 | ${IBVERBS_INCLUDE_DIR} 27 | ${IBVERBS_ROOT_DIR} 28 | ${IBVERBS_ROOT_DIR}/include) 29 | 30 | find_library(IBVERBS_LIBRARIES 31 | NAMES ibverbs 32 | HINTS 33 | ${IBVERBS_LIB_DIR} 34 | ${IBVERBS_ROOT_DIR} 35 | ${IBVERBS_ROOT_DIR}/lib) 36 | 37 | include(FindPackageHandleStandardArgs) 38 | find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES) 39 | mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES) # hide the cache entries created by find_path/find_library above 40 | -------------------------------------------------------------------------------- /cmake/Findmscclpp_nccl.cmake: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | find_path(MSCCLPP_INCLUDE_DIRS 24 | NAMES mscclpp/gpu.hpp 25 | HINTS 26 | ${MSCCLPP_ROOT}/include) 27 | 28 | find_library(MSCCLPP_LIBRARIES 29 | NAMES mscclpp_nccl 30 | HINTS 31 | ${MSCCLPP_ROOT}/lib) 32 | 33 | include (FindPackageHandleStandardArgs) 34 | find_package_handle_standard_args(mscclpp_nccl DEFAULT_MSG MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES) 35 | mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES) 36 | -------------------------------------------------------------------------------- /cmake/scripts/add_faults.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | HIP_FILE=$1 22 | 23 | if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then 24 | sed -i "s/__syncthreads()/__syncthreads(); insert_random_delay_per_warp()/" "$HIP_FILE" 25 | 26 | echo "Added fault injection to $HIP_FILE" 27 | fi -------------------------------------------------------------------------------- /cmake/scripts/add_unroll.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
20 | 21 | HIP_FILE=$1 22 | 23 | if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then 24 | perl -pi -e 's/(template/\1, int COLL_UNROLL\2>/g' "$HIP_FILE" 25 | perl -pi -e 's/(ProtoSimple<[^,]*?,[^,]+?)>/\1, COLL_UNROLL>/g' "$HIP_FILE" 26 | perl -pi -e 's/(runRing\()/\1, COLL_UNROLL\2/g' "$HIP_FILE" 27 | perl -pi -e 's/(runTreeUpDown\(/\1, COLL_UNROLL>(/' "$HIP_FILE" 28 | perl -pi -e 's/(runTreeSplit\(/\1, COLL_UNROLL>(/' "$HIP_FILE" 29 | sed -i "s/\\(struct RunWorkColl]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE" 30 | sed -i "s/\\(struct RunWorkBatch]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE" 31 | 32 | echo "Added COLL_UNROLL template argument to $HIP_FILE" 33 | fi -------------------------------------------------------------------------------- /cmake/scripts/extract_metadata.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | ## List the objects for each gfx architecture 22 | execute_process( COMMAND roc-obj-ls librccl.so 23 | RESULT_VARIABLE list_result 24 | OUTPUT_VARIABLE cmd_output 25 | ) 26 | 27 | if(list_result EQUAL 0) 28 | ## Convert cmd output to list of lines 29 | string(REGEX REPLACE "\n$" "" cmd_output "${cmd_output}") 30 | string(REPLACE "\n" ";" cmd_output "${cmd_output}") 31 | 32 | ## Extract file paths for the selected gfx archs 33 | foreach(line ${cmd_output}) 34 | if(line MATCHES "(gfx90a|gfx942|gfx950)") 35 | string(REGEX MATCH "\\file://(.*)" file_match ${line}) 36 | if(file_match) 37 | list(APPEND file_paths ${file_match}) 38 | endif() 39 | endif() 40 | endforeach() 41 | 42 | ## Extract objects from files 43 | foreach(file ${file_paths}) 44 | execute_process( 45 | COMMAND roc-obj-extract ${file} 46 | RESULT_VARIABLE extraction_result 47 | ) 48 | if(NOT extraction_result EQUAL 0) 49 | message(WARNING "Could not extract objects from ${file}") 50 | endif() 51 | endforeach() 52 | else() 53 | ## We don't want to stop building unit-tests if this command fails. 
54 | message(WARNING "Command failed with error code ${list_result}") 55 | endif() -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Using RCCL/RCCL-Tests in a docker environment 2 | 3 | ## Docker build 4 | 5 | Assuming you have docker installed on your system: 6 | 7 | ### To build the docker image : 8 | 9 | By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch), targeting `gfx942` GPUs. 10 | ```shell 11 | $ docker build -t rccl-tests -f Dockerfile.ubuntu --pull . 12 | ``` 13 | 14 | The base docker image, rccl repo, rccl-tests repo, and GPU targets can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image for the MI250 GPU: 15 | ```shell 16 | $ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull . 
17 | ``` 18 | 19 | ### To start an interactive docker container on a system with AMD GPUs : 20 | 21 | ```shell 22 | $ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash 23 | ``` 24 | 25 | ### To run rccl-tests (all\_reduce\_perf) on 8 AMD GPUs (inside the docker container) : 26 | 27 | If using ROCm 6.3.x or earlier 28 | ```shell 29 | $ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 30 | ``` 31 | 32 | If using ROCm 6.4.0 or later 33 | ```shell 34 | $ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 35 | ``` 36 | 37 | For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests. 38 | 39 | 40 | ## Copyright 41 | 42 | All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. 43 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | _doxygen/ 3 | doxygen/html 4 | doxygen/xml 5 | sphinx/_toc.yml 6 | -------------------------------------------------------------------------------- /docs/api-reference/api-library.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs 3 | :keywords: RCCL, ROCm, library, API 4 | 5 | .. _api-library: 6 | 7 | ============= 8 | API library 9 | ============= 10 | 11 | .. 
doxygenindex:: 12 | -------------------------------------------------------------------------------- /docs/attributions.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: RCCL attributions information 3 | :keywords: RCCL, ROCm, library, API, attributions 4 | 5 | .. toctree:: 6 | :maxdepth: 4 7 | :caption: Attributions 8 | 9 | Attributions 10 | ============ 11 | 12 | Contains contributions from NVIDIA. 13 | 14 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 15 | 16 | Redistribution and use in source and binary forms, with or without 17 | modification, are permitted provided that the following conditions 18 | are met: 19 | 20 | - Redistributions of source code must retain the above copyright 21 | notice, this list of conditions and the following disclaimer. 22 | - Redistributions in binary form must reproduce the above copyright 23 | notice, this list of conditions and the following disclaimer in the 24 | documentation and/or other materials provided with the distribution. 25 | - Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 26 | Laboratory, the U.S. Department of Energy, nor the names of their 27 | contributors may be used to endorse or promote products derived 28 | from this software without specific prior written permission. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY 31 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 33 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 34 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 35 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 36 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 37 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 38 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 39 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 40 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 41 | 42 | The U.S. Department of Energy funded the development of this software 43 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 44 | 45 | This code also includes files from the NVIDIA Tools Extension SDK project. 46 | 47 | For more information and license details, see `https://github.com/NVIDIA/NVTX <https://github.com/NVIDIA/NVTX>`_ 48 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | import subprocess 8 | 9 | from rocm_docs import ROCmDocs 10 | 11 | name = "RCCL" 12 | get_major = r'sed -n -e "s/^NCCL_MAJOR.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk' 13 | get_minor = r'sed -n -e "s/^NCCL_MINOR.*\([0-9]\{2,\}\).*/\1/p" ../makefiles/version.mk' 14 | get_patch = r'sed -n -e "s/^NCCL_PATCH.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk' 15 | major = subprocess.getoutput(get_major) 16 | minor = subprocess.getoutput(get_minor) 17 | patch = subprocess.getoutput(get_patch) 18 | version_number = f"{major}.{minor}.{patch}" 19 | 20 | # for PDF output on Read the Docs 21 | project = f"{name} Documentation" 22 | author = "Advanced Micro Devices, Inc." 
23 | copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." 24 | version = version_number 25 | release = version_number 26 | 27 | external_toc_path = "./sphinx/_toc.yml" 28 | 29 | docs_core = ROCmDocs(f"{name} {version_number} Documentation") 30 | docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") 31 | docs_core.setup() 32 | 33 | external_projects_current_project = "rccl" 34 | 35 | for sphinx_var in ROCmDocs.SPHINX_VARS: 36 | globals()[sphinx_var] = getattr(docs_core, sphinx_var) 37 | -------------------------------------------------------------------------------- /docs/data/how-to/rccl-usage-tips/in-place_allreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/rccl/e94b36024615b65ddcd372ac39d9e9f5f43f685f/docs/data/how-to/rccl-usage-tips/in-place_allreduce.png -------------------------------------------------------------------------------- /docs/data/how-to/rccl-usage-tips/out-of-place_allreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/rccl/e94b36024615b65ddcd372ac39d9e9f5f43f685f/docs/data/how-to/rccl-usage-tips/out-of-place_allreduce.png -------------------------------------------------------------------------------- /docs/doxygen/mainpage.txt: -------------------------------------------------------------------------------- 1 | /*! \mainpage RCCL Documentation 2 | 3 | \tableofcontents 4 | 5 | \section intro_sec Introduction 6 | 7 | RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. 
It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. 8 | 9 | The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API. 10 | 11 | \section API RCCL API Contents 12 | - @ref rccl_api_version 13 | - @ref rccl_result_code 14 | - @ref rccl_config_type 15 | - @ref rccl_api_communicator 16 | - @ref rccl_api_errcheck 17 | - @ref rccl_api_comminfo 18 | - @ref rccl_api_enumerations 19 | - @ref rccl_api_custom_redop 20 | - @ref rccl_collective_api 21 | - @ref rccl_group_api 22 | - @ref msccl_api 23 | 24 | \section Full RCCL API File 25 | - nccl.h.in 26 | 27 | */ 28 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs 3 | :keywords: RCCL, ROCm, library, API 4 | 5 | .. _index: 6 | 7 | ****************** 8 | RCCL documentation 9 | ****************** 10 | 11 | The ROCm Communication Collectives Library (RCCL) is a stand-alone library 12 | that provides multi-GPU and multi-node collective communication primitives 13 | optimized for AMD GPUs. It uses PCIe and xGMI high-speed interconnects. 14 | To learn more, see :doc:`what-is-rccl`. 15 | 16 | The RCCL public repository is located at `<https://github.com/ROCm/rccl>`_. 17 | 18 | .. grid:: 2 19 | :gutter: 3 20 | 21 | .. 
grid-item-card:: Install 22 | 23 | * :doc:`Installing RCCL using the install script <./install/installation>` 24 | * :doc:`Running RCCL using Docker <./install/docker-install>` 25 | * :doc:`Building and installing RCCL from source code <./install/building-installing>` 26 | 27 | .. grid-item-card:: How to 28 | 29 | * :doc:`Using the RCCL Tuner plugin <./how-to/using-rccl-tuner-plugin-api>` 30 | * :doc:`Using the NCCL Net plugin <./how-to/using-nccl>` 31 | * :doc:`Troubleshoot RCCL <./how-to/troubleshooting-rccl>` 32 | * :doc:`RCCL usage tips <./how-to/rccl-usage-tips>` 33 | 34 | 35 | .. grid-item-card:: Examples 36 | 37 | * `RCCL Tuner plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-tuner/example>`_ 38 | * `NCCL Net plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-net/example>`_ 39 | 40 | .. grid-item-card:: API reference 41 | 42 | * :ref:`Library specification` 43 | * :ref:`api-library` 44 | 45 | To contribute to the documentation, see 46 | `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_. 47 | 48 | You can find licensing information on the 49 | `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page. 50 | -------------------------------------------------------------------------------- /docs/install/docker-install.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Instructions on how to install the RCCL library for collective communication primitives using Docker 3 | :keywords: RCCL, ROCm, library, API, install, Docker 4 | 5 | .. _install-docker: 6 | 7 | ***************************************** 8 | Running RCCL using Docker 9 | ***************************************** 10 | 11 | To use Docker to run RCCL, Docker must already be installed on the system. 12 | To build the Docker image and run the container, follow these steps. 13 | 14 | #. Build the Docker image 15 | 16 | By default, the Dockerfile uses ``docker.io/rocm/dev-ubuntu-22.04:latest`` as the base Docker image. 17 | It then installs RCCL and rccl-tests (in both cases, it uses the version from the ``develop`` branch). 18 | 19 | Use this command to build the Docker image: 20 | 21 | .. 
code-block:: shell 22 | 23 | docker build -t rccl-tests -f Dockerfile.ubuntu --pull . 24 | 25 | The base Docker image, rccl repository, rccl-tests repository, and GPU targets can be modified 26 | by using ``--build-arg`` in the ``docker build`` command above. For example, to use a different base Docker image for the MI250 GPU, 27 | use this command: 28 | 29 | .. code-block:: shell 30 | 31 | docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull . 32 | 33 | #. Launch an interactive Docker container on a system with AMD GPUs: 34 | 35 | .. code-block:: shell 36 | 37 | docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash 38 | 39 | To run, for example, the ``all_reduce_perf`` test from rccl-tests on 8 AMD GPUs from inside the Docker container, use this command: 40 | 41 | If using ROCm 6.3.x or earlier 42 | .. code-block:: shell 43 | 44 | mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 45 | 46 | If using ROCm 6.4.0 or later 47 | .. code-block:: shell 48 | 49 | mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 50 | 51 | For more information on the rccl-tests options, see the `Usage guidelines <https://github.com/ROCm/rccl-tests#usage>`_ in the GitHub repository. 52 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: RCCL licensing information 3 | :keywords: RCCL, ROCm, library, API, license 4 | 5 | License 6 | ======= 7 | 8 | .. 
include:: ../LICENSE.txt 9 | -------------------------------------------------------------------------------- /docs/sphinx/_toc.yml.in: -------------------------------------------------------------------------------- 1 | root: index 2 | subtrees: 3 | 4 | - entries: 5 | - file: what-is-rccl.rst 6 | title: What is RCCL? 7 | 8 | - caption: Install 9 | entries: 10 | - file: install/installation 11 | title: Installation guide 12 | - file: install/docker-install 13 | title: Running RCCL using Docker 14 | - file: install/building-installing 15 | title: Building and installing from source 16 | 17 | - caption: How to 18 | entries: 19 | - file: how-to/using-rccl-tuner-plugin-api 20 | title: Using the RCCL Tuner plugin 21 | - file: how-to/using-nccl 22 | title: Using the NCCL Net plugin 23 | - file: how-to/troubleshooting-rccl 24 | title: Troubleshoot RCCL 25 | - file: how-to/rccl-usage-tips 26 | 27 | - caption: Examples 28 | entries: 29 | - url: https://github.com/ROCm/rccl/tree/develop/ext-tuner/example 30 | title: RCCL Tuner plugin examples 31 | - url: https://github.com/ROCm/rccl/tree/develop/ext-net/example 32 | title: NCCL Net plugin examples 33 | 34 | - caption: API reference 35 | entries: 36 | - file: api-reference/library-specification 37 | title: Library specification 38 | - file: api-reference/api-library 39 | 40 | - caption: About 41 | entries: 42 | - file: license 43 | - file: attributions 44 | -------------------------------------------------------------------------------- /docs/sphinx/requirements.in: -------------------------------------------------------------------------------- 1 | rocm-docs-core==1.18.2 2 | -------------------------------------------------------------------------------- /docs/what-is-rccl.rst: -------------------------------------------------------------------------------- 1 | .. 
meta:: 2 | :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs 3 | :keywords: RCCL, ROCm, library, API 4 | 5 | .. _what-is: 6 | 7 | ****************** 8 | What is RCCL? 9 | ****************** 10 | 11 | The ROCm Communication Collectives Library (RCCL) includes multi-GPU and 12 | multi-node collective communication primitives optimized for AMD GPUs. 13 | It implements routines such as ``all-reduce``, ``all-gather``, ``reduce``, 14 | ``broadcast``, ``reduce-scatter``, ``gather``, ``scatter``, ``all-to-allv``, 15 | and ``all-to-all``, as well as direct point-to-point (GPU-to-GPU) send 16 | and receive operations. It is optimized to achieve high bandwidth 17 | on platforms using PCIe and xGMI and networking using InfiniBand Verbs or TCP/IP 18 | sockets. RCCL supports an arbitrary number of GPUs installed in a single node 19 | or multiple nodes and can be used in either 20 | single- or multi-process (for example, MPI) applications. 21 | 22 | The collective operations are implemented using ring and tree algorithms and have been optimized 23 | for throughput and latency by leveraging topology awareness, high-speed interconnects, 24 | and RDMA-based collectives. For best performance, small operations can be either 25 | batched into larger operations or aggregated through the API. 26 | 27 | RCCL uses PCIe and xGMI high-speed interconnects for intra-node communication 28 | as well as InfiniBand, RoCE, and TCP/IP for inter-node communication. 29 | It supports an arbitrary number of GPUs installed in a single-node or 30 | multi-node platform and can easily integrate into 31 | single- or multi-process (for example, MPI) applications. -------------------------------------------------------------------------------- /ext-net/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. 
All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 12 | 13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /ext-net/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclInvalidUsage = 5, 15 | ncclRemoteError = 6 } ncclResult_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_H_ 6 | #define NCCL_NET_H_ 7 | 8 | #include 9 | #include 10 | 11 | #include "common.h" 12 | #include "err.h" 13 | 14 | #define NCCL_NET_HANDLE_MAXSIZE 128 15 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB 16 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 17 | 18 | #define NCCL_PTR_HOST 0x1 19 | #define NCCL_PTR_CUDA 0x2 20 | #define NCCL_PTR_DMABUF 0x4 21 | 22 | // Maximum number of requests per comm object 23 | #define NCCL_NET_MAX_REQUESTS 32 24 | 25 | #include "net_v9.h" 26 | #include "net_v8.h" 27 | #include "net_v7.h" 28 | #include "net_v6.h" 29 | #include "net_v5.h" 30 | #include "net_v4.h" 31 | #include "net_v3.h" 32 | #include "net_v2.h" 33 | 34 | #endif // end include guard 35 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_DEVICE_H_ 8 | #define NET_DEVICE_H_ 9 | 10 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 11 | #define NCCL_NET_MTU_SIZE 4096 12 | 13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 16 | 17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 18 | 19 | typedef struct { 20 | ncclNetDeviceType netDeviceType; // Network offload type 21 | int netDeviceVersion; // Version number for network offload 22 | void* handle; 23 | size_t size; 24 | int needsProxyProgress; 25 | } ncclNetDeviceHandle_v7_t; 26 | 27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; 29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_NET_V3_H_ 6 | #define NCCL_NET_V3_H_ 7 | 8 | #define NCCL_NET_MAX_REQUESTS_V3 16 9 | 10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; 11 | typedef struct { 12 | // Name of the network (mainly for logs) 13 | const char* name; 14 | // Initialize the network. 15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 16 | // Return the number of adapters. 17 | ncclResult_t (*devices)(int* ndev); 18 | // Get various device properties. 
19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); 20 | // Create a receiving object and provide a handle to connect to it. The 21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 22 | // between ranks to create a connection. 23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 24 | // Connect to a handle and return a sending comm object for that peer. 25 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 26 | // Finalize connection establishment after remote peer has called connectHandle 27 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 28 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 29 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 
43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v3_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-net/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_TYPES_H_ 6 | #define NCCL_TYPES_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | ncclFloat8e4m3 = 10, 20 | ncclFloat8e5m2 = 11, 21 | } ncclDataType_t; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /ext-net/google-fastsocket/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME?=/usr/local/cuda 2 | INC:=-I$(CUDA_HOME)/include 3 | PLUGIN_SO:=libnccl-net.so 4 | 5 | default: $(PLUGIN_SO) 6 | 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc 8 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 9 | 10 | nccl-fastsocket/*.cc: 11 | git clone https://github.com/google/nccl-fastsocket.git 12 | 13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) 14 | 15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) 16 | @printf "Grabbing %-35s > %s\n" $< $@ 17 | mkdir -p $(BUILDDIR)/lib 18 | install -m 644 $< $@ 19 | 20 | clean: 21 | rm -f $(PLUGIN_SO) 22 | rm -Rf nccl-fastsocket 23 | -------------------------------------------------------------------------------- /ext-profiler/example/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME := ../../build 7 | INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl 8 | PLUGIN_SO := libnccl-profiler.so 9 | 10 | default: $(PLUGIN_SO) 11 | 12 | $(PLUGIN_SO): plugin.c event.c print_event.c 13 | $(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 14 | 15 | clean: 16 | rm -f $(PLUGIN_SO) 17 | -------------------------------------------------------------------------------- /ext-profiler/example/event.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | #include "event.h" 9 | 10 | int taskEventQueueEmpty(struct group* g) { 11 | return g->eventHead == NULL; 12 | } 13 | 14 | void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) { 15 | event->next = NULL; 16 | if (g->eventHead) g->eventTail->next = event; 17 | else g->eventHead = event; 18 | g->eventTail = event; 19 | } 20 | 21 | struct taskEventBase* taskEventQueueHead(struct group* g) { 22 | return g->eventHead; 23 | } 24 | 25 | struct taskEventBase* taskEventQueueDequeue(struct group* g) { 26 | struct taskEventBase* tmp = g->eventHead; 27 | g->eventHead = g->eventHead->next; 28 | if (g->eventHead == NULL) g->eventTail = NULL; 29 | return tmp; 30 | } 31 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 12 | 13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ERR_H_ 8 | #define NCCL_ERR_H_ 9 | 10 | /* Error type for plugins */ 11 | typedef enum { ncclSuccess = 0, 12 | ncclUnhandledCudaError = 1, 13 | ncclSystemError = 2, 14 | ncclInternalError = 3, 15 | ncclInvalidArgument = 4, 16 | ncclInvalidUsage = 5, 17 | ncclRemoteError = 6 } ncclResult_t; 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include "common.h" 14 | #include "err.h" 15 | 16 | #include "profiler_v2.h" 17 | #include "profiler_v1.h" 18 | 19 | #endif // end include guard 20 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_TYPES_H_ 6 | #define NCCL_TYPES_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | } ncclDataType_t; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext-profiler/example/print_event.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PRINT_EVENT_H_ 8 | #define PRINT_EVENT_H_ 9 | 10 | void debugEvent(void* eHandle, const char* tag); 11 | void printEvent(FILE* fh, void* handle); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /ext-src/bf16-tuning.patch: -------------------------------------------------------------------------------- 1 | diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp 2 | index 7a2cd4a..a14dfbc 100644 3 | --- a/apps/nccl/src/allreduce.hpp 4 | +++ b/apps/nccl/src/allreduce.hpp 5 | @@ -850,7 +850,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< 6 | flag++); 7 | #endif 8 | } else { 9 | - int nBlocks = 5*(nRanksPerNode - 1); 10 | + int nBlocks = 8 * (nRanksPerNode - 1); 11 | int nThreadsPerBlock = 512; 12 | if (hieAllred && worldSize >= 8) { 13 | nBlocks = 20; 14 | diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp 15 | index ca2c272..a6056ea 100644 16 | --- a/apps/nccl/src/common.hpp 17 | +++ b/apps/nccl/src/common.hpp 18 | @@ -17,7 +17,7 @@ constexpr int NRANKS1_PER_NODE = 4; 19 | constexpr int NRANKS_PER_NODE = 8; 20 | constexpr int NPEERS = 7; 21 | 22 | -constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB 23 | +constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 112; // double buffer * 56 thread-blocks * 8 ranks * 256KB = 112MB 24 | 25 | __device__ mscclpp::DeviceSyncer deviceSyncer; 26 | 27 | -------------------------------------------------------------------------------- /ext-src/check_ibv_access_relaxed_ordering.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(void) { 5 | enum ibv_access_flags has_ibv_access_relaxed_ordering = IBV_ACCESS_RELAXED_ORDERING; 6 | 
printf("IBV_ACCESS_RELAXED_ORDERING: %d\n", has_ibv_access_relaxed_ordering); 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /ext-src/cpx.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/numa.cc b/src/numa.cc 2 | index d72c99e..16c903d 100644 3 | --- a/src/numa.cc 4 | +++ b/src/numa.cc 5 | @@ -26,6 +26,7 @@ namespace mscclpp { 6 | 7 | MSCCLPP_API_CPP int getDeviceNumaNode(int cudaDev) { 8 | std::string busId = getBusId(cudaDev); 9 | + busId[busId.length() - 1] = '0'; 10 | std::string file_str = "/sys/bus/pci/devices/" + busId + "/numa_node"; 11 | std::ifstream file(file_str); 12 | int numaNode; 13 | -------------------------------------------------------------------------------- /ext-src/mscclpp_ibv_access_relaxed_ordering.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | index a95a8e5..62b4f22 100644 3 | --- a/CMakeLists.txt 4 | +++ b/CMakeLists.txt 5 | @@ -96,6 +96,24 @@ include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake) 6 | 7 | # Find ibverbs and libnuma 8 | find_package(IBVerbs) 9 | + 10 | +# Check if IBV_ACCESS_RELAXED_ORDERING exists in infiniband/verbs.h 11 | +# Disable use of this symbol in mscclpp/src/ib.cc if it does not exist 12 | +if(IBVERBS_FOUND) 13 | + try_compile(HAS_IBV_ACCESS_RELAXED_ORDERING 14 | + ${CMAKE_BINARY_DIR} 15 | + "${CMAKE_CURRENT_SOURCE_DIR}/../check_ibv_access_relaxed_ordering.cc" 16 | + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${IBVERBS_INCLUDE_DIRS}" 17 | + OUTPUT_VARIABLE try_compile_output 18 | + ) 19 | + message(STATUS "try_compile_output: ${try_compile_output}") 20 | + if(NOT HAS_IBV_ACCESS_RELAXED_ORDERING) 21 | + message(WARNING "IBV_ACCESS_RELAXED_ORDERING does not exist in ${IBVERBS_INCLUDE_DIRS}/infiniband/verbs.h. 
Disabling this symbol in mscclpp/src/ib.cc.") 22 | + else() 23 | + message(STATUS "IBV_ACCESS_RELAXED_ORDERING exists in ${IBVERBS_INCLUDE_DIRS}/infiniband/verbs.h.") 24 | + endif() 25 | +endif() 26 | + 27 | find_package(NUMA REQUIRED) 28 | find_package(Threads REQUIRED) 29 | 30 | diff --git a/src/ib.cc b/src/ib.cc 31 | index d9d72d1..bddd4a8 100644 32 | --- a/src/ib.cc 33 | +++ b/src/ib.cc 34 | @@ -48,9 +48,17 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : buff(buff) { 35 | } 36 | uintptr_t addr = reinterpret_cast(buff) & -pageSize; 37 | std::size_t pages = (size + (reinterpret_cast(buff) - addr) + pageSize - 1) / pageSize; 38 | + 39 | +#if defined(HAS_IBV_ACCESS_RELAXED_ORDERING) 40 | this->mr = IBVerbs::ibv_reg_mr2(pd, reinterpret_cast(addr), pages * pageSize, 41 | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | 42 | IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC); 43 | +#else 44 | + this->mr = IBVerbs::ibv_reg_mr2(pd, reinterpret_cast(addr), pages * pageSize, 45 | + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | 46 | + IBV_ACCESS_REMOTE_ATOMIC); 47 | +#endif 48 | + 49 | if (this->mr == nullptr) { 50 | std::stringstream err; 51 | err << "ibv_reg_mr failed (errno " << errno << ")"; 52 | -------------------------------------------------------------------------------- /ext-src/non-multiple-128-fix.patch: -------------------------------------------------------------------------------- 1 | diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp 2 | index 76674ba..7a2cd4a 100644 3 | --- a/apps/nccl/src/allreduce.hpp 4 | +++ b/apps/nccl/src/allreduce.hpp 5 | @@ -368,7 +368,10 @@ __global__ void __launch_bounds__(512, 1) 6 | const size_t chanOffset = nPeer * blockIdx.x; 7 | // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) 8 | const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); 9 | - const size_t nInt4PerRank = nInt4 / worldSize; 10 | + size_t nInt4PerRank = 
nInt4 / worldSize; 11 | + if (nInt4 % worldSize) 12 | + nInt4PerRank = nInt4PerRank + 1; 13 | + 14 | auto smChans = smChannels + chanOffset; 15 | auto smOutChans = smOutChannels + chanOffset; 16 | 17 | -------------------------------------------------------------------------------- /ext-src/reg-fix.patch: -------------------------------------------------------------------------------- 1 | diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu 2 | index 5c19dc6..5fb99ef 100644 3 | --- a/apps/nccl/src/nccl.cu 4 | +++ b/apps/nccl/src/nccl.cu 5 | @@ -85,6 +85,7 @@ struct ncclComm { 6 | std::unordered_map channelInInfos; 7 | std::unordered_map channelOutInfos; 8 | std::unordered_map channelScratchInfos; 9 | + std::unordered_map regHandles; 10 | std::unordered_map handleKeys; 11 | std::shared_ptr scratchBuff; 12 | std::vector remoteScratchRegMemories; 13 | @@ -616,6 +617,11 @@ NCCL_API ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size, 14 | p->ipcHandle = ipcHandle; 15 | *handle = p; 16 | 17 | + auto regIt = comm->regHandles.find(buffKey); 18 | + if (regIt == comm->regHandles.end()) { 19 | + comm->regHandles[buffKey] = ipcHandle; 20 | + } 21 | + 22 | auto it = comm->handleKeys.find(*handle); 23 | if (it == comm->handleKeys.end()) { 24 | comm->handleKeys[*handle] = buffKey; 25 | @@ -642,6 +648,7 @@ NCCL_API ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle) { 26 | if (outIt != comm->channelOutInfos.end()) { 27 | comm->channelOutInfos.erase(outIt); 28 | } 29 | + comm->regHandles.erase(buffKey); 30 | comm->handleKeys.erase(handle); 31 | free(handle); 32 | } 33 | @@ -655,8 +662,8 @@ bool mscclpp_BuffIsRegistered(ncclComm_t comm, const void* buff){ 34 | CUdeviceptr buffBasePtr; 35 | MSCCLPP_CUTHROW(cuMemGetAddressRange(&buffBasePtr, &buffBytes, (CUdeviceptr)buff)); 36 | channelKey buffKey{(void*)buffBasePtr, buffBytes}; 37 | - auto buffIt = comm->channelScratchInfos.find(buffKey); 38 | - bool registered = buffIt != 
comm->channelScratchInfos.end(); 39 | + auto buffIt = comm->regHandles.find(buffKey); 40 | + bool registered = buffIt != comm->regHandles.end(); 41 | return registered; 42 | } 43 | size_t 44 | -------------------------------------------------------------------------------- /ext-src/remove-clip.patch: -------------------------------------------------------------------------------- 1 | diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp 2 | index fac105a..9ef93ce 100644 3 | --- a/apps/nccl/src/allreduce.hpp 4 | +++ b/apps/nccl/src/allreduce.hpp 5 | @@ -71,17 +71,29 @@ __forceinline__ __device__ __bfloat162 clip(__bfloat162 val) { 6 | 7 | template 8 | __forceinline__ __device__ T add_elements(T a, T b) { 9 | - return clip(a + b); 10 | + #ifdef MSCCLPP_CLIP_ENABLED 11 | + return clip(a + b); 12 | + #else 13 | + return a + b; 14 | + #endif 15 | } 16 | 17 | template <> 18 | __forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { 19 | - return clip(__hadd2(a, b)); 20 | + #ifdef MSCCLPP_CLIP_ENABLED 21 | + return clip(__hadd2(a, b)); 22 | + #else 23 | + return __hadd2(a, b); 24 | + #endif 25 | } 26 | 27 | template <> 28 | __forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { 29 | - return clip(__hadd2(a, b)); 30 | + #ifdef MSCCLPP_CLIP_ENABLED 31 | + return clip(__hadd2(a, b)); 32 | + #else 33 | + return __hadd2(a, b); 34 | + #endif 35 | } 36 | 37 | template 38 | @@ -558,7 +570,7 @@ __global__ void __launch_bounds__(512, 1) 39 | 40 | 41 | template 42 | -__global__ void __launch_bounds__(512, 1) 43 | +__global__ void __launch_bounds__(1024, 1) 44 | allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, 45 | mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, 46 | size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems) { 47 | @@ -1045,6 +1057,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< 48 | 
allreduce8Mod<<>>(buff, scratch, resultBuff, smScrChannels, 49 | channelScratchOffset, rank, nRanksPerNode, worldSize, nelems); 50 | } else { 51 | + nThreadsPerBlock = std::is_same::value ? 1024 : nThreadsPerBlock; 52 | allreduce8<<>>(buff, scratch, resultBuff, smScrChannels, 53 | smOutChannels, channelOutOffset, channelScratchOffset, rank, nRanksPerNode, 54 | worldSize, nelems); 55 | -------------------------------------------------------------------------------- /ext-tuner/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | RCCL_HOME:=../../build/release 7 | HIP_HOME:=/opt/rocm 8 | INC:= -I$(RCCL_HOME)/include/ -I$(HIP_HOME)/include/ -D__HIP_PLATFORM_AMD__ -Inccl 9 | PLUGIN_SO:=libnccl-tuner.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-tuner/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 12 | 13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /ext-tuner/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclInvalidUsage = 5, 15 | ncclRemoteError = 6 } ncclResult_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 
10 | 11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none 12 | ASTYLEDIR := $(BUILDDIR)/contrib 13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz 14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle 15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ 16 | ASTYLEVER := 3.1 17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" 18 | 19 | $(ASTYLEDIR) : 20 | @mkdir -p $(ASTYLEDIR) 21 | 22 | $(ASTYLETAR) : $(ASTYLEDIR) 23 | @wget -q -O $(ASTYLETAR) $(ASTYLEURL) 24 | 25 | $(ASTYLEBLD) : $(ASTYLETAR) 26 | @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) 27 | 28 | $(ASTYLEBIN) : $(ASTYLEBLD) 29 | ${MAKE} -C $(ASTYLEBLD) 30 | 31 | .PHONY : format 32 | format : $(ASTYLEBIN) 33 | @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) 34 | -------------------------------------------------------------------------------- /makefiles/version.mk: -------------------------------------------------------------------------------- 1 | ##### version 2 | NCCL_MAJOR := 2 3 | NCCL_MINOR := 25 4 | NCCL_PATCH := 1 5 | NCCL_SUFFIX := 6 | PKG_REVISION := 1 7 | -------------------------------------------------------------------------------- /pkg/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : build 9 | build : debian.build txz.build 10 | 11 | BUILDDIR ?= $(abspath ../build) 12 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 13 | TARGETS := debian txz 14 | all: ${TARGETS:%=%.build} 15 | prep: ${TARGETS:%=%.prep} 16 | build: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.prep: 20 | ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} 21 | 22 | %.build: 23 | ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} 24 | 25 | %.clean: 26 | ${MAKE} -C $* clean 27 | -------------------------------------------------------------------------------- /pkg/debian/.gitignore: -------------------------------------------------------------------------------- 1 | /*.debhelper.log 2 | /*.debhelper 3 | /*.substvars 4 | /tmp/ 5 | /files 6 | /libnccl1/ 7 | /libnccl-dev/ 8 | -------------------------------------------------------------------------------- /pkg/debian/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | DEBPREPDIR := $(BUILDDIR)/debian 11 | PKGDIR := $(BUILDDIR)/pkg/deb/ 12 | 13 | DEBGEN_IN := $(wildcard *.in) 14 | DEBGEN := $(DEBGEN_IN:.in=) 15 | DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) 16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) 20 | PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) 21 | 22 | prep : $(DEBTARGETS) 23 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 24 | 25 | build : prep 26 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 27 | @printf "Building Debian package\n" 28 | (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz) 29 | mkdir -p $(PKGDIR) 30 | mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ 31 | 32 | clean: 33 | rm -Rf $(DEBPREPDIR) $(PKGDIR) 34 | 35 | $(DEBPREPDIR)/% : %.in 36 | @printf "Generating %-35s > %s\n" $< $@ 37 | mkdir -p $(DEBPREPDIR) 38 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 39 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 40 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 41 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 42 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 43 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 44 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 45 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 46 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 47 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 48 | $< > $@ 49 | 50 | $(DEBPREPDIR)/% : % 51 | @printf "Grabbing %-35s > %s\n" $< $@ 52 | mkdir -p $(DEBPREPDIR) 53 | cp -f $< $@ 54 | -------------------------------------------------------------------------------- /pkg/debian/changelog.in: -------------------------------------------------------------------------------- 1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium 2 | 3 | * Automatic Debian package from build 4 | 5 | -- cudatools ${pkg:Timestamp} 6 | -------------------------------------------------------------------------------- /pkg/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /pkg/debian/control.in: -------------------------------------------------------------------------------- 1 | Source: nccl 2 | Section: libs 3 | Maintainer: cudatools 4 | Priority: optional 5 | Build-depends: debhelper(>=9) 6 | Standards-Version: 3.9.5 7 | 8 | Package: libnccl${nccl:Major} 9 | Section: libs 10 | Architecture: 
${pkg:Arch} 11 | Depends: ${misc:Depends}, ${shlibs:Depends} 12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | Package: libnccl-dev 21 | Section: libdevel 22 | Architecture: ${pkg:Arch} 23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) 24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files 25 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 26 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 27 | broadcast, and reduce-scatter. 28 | It has been optimized to achieve high bandwidth on any platform using PCIe, 29 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 30 | sockets. 
31 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | ../../LICENSE.txt -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | bin/ncclras /usr/bin 2 | include/nccl.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | -------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | 15 | override_dh_builddeb: 16 | dh_builddeb -- -Zxz 17 | -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | 
-------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. || exit 1 # parent of $BUILDDIR (expected to be the source tree root); abort rather than clean/archive the wrong directory 11 | NCCLDIR=$(basename "$PWD") # quoted so a path containing whitespace is not word-split 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommitted files ..." 17 | if [ -n "$(git status -s)" ]; then # any short-status output means uncommitted changes 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read -r 21 | fi 22 | 23 | cd .. || exit 1 # one level above the source tree, so the archive is created next to it
24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$NCCLDIR" 36 | -------------------------------------------------------------------------------- /pkg/txz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/txz 11 | PKGDIR := $(BUILDDIR)/pkg/txz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_ARCH := $(shell uname -m) 18 | 19 | prep: $(TXZTARGETS) 20 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 21 | 22 | build: prep 23 | $(MAKE) -C ../..
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 
13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /rtest.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rccl-UnitTests --gtest_color=yes --gtest_filter= 5 | 6 | {GTEST_FILTER}*sum_float32* --gtest_output=xml:output_psdb.xml 7 | 8 | 9 | {GTEST_FILTER}* --gtest_output=xml:output_osdb.xml 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/device/common.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "common.h" 10 | 11 | __shared__ ncclShmemData ncclShmem; 12 | #if __CUDA_ARCH__ < 700 13 | __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; 14 | #endif 15 | 16 | struct RunWorkNop { 17 | __device__ void run() {} 18 | }; 19 | 20 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 21 | ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args); 22 | } 23 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 24 | ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args); 25 | } 26 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 27 | ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args); 28 | } 29 | #ifdef ENABLE_COLLTRACE 30 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 31 | ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args); 32 | } 33 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 34 | ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args); 35 | } 36 | __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 37 | ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&args4K.args); 38 | } 39 | #endif 40 | 41 | #ifdef USE_INDIRECT_FUNCTION_CALL 42 | __device__ void 
ncclDevFunc_Nop(); 43 | #else 44 | __device__ __attribute__((noinline)) void ncclDevFunc_Nop(); 45 | #endif 46 | -------------------------------------------------------------------------------- /src/device/network/unpack/unpack_defs.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, Google LLC. All rights reserved. 3 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | #ifndef NET_DEVICE_UNPACK_DEFS_H 8 | #define NET_DEVICE_UNPACK_DEFS_H 9 | 10 | #include 11 | 12 | #include "device.h" 13 | 14 | #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16 15 | 16 | union alignas(16) loadMeta { 17 | uint64_t r64[2]; 18 | struct { 19 | uint32_t src_off; 20 | uint32_t len; 21 | uint64_t dst_off; 22 | }; 23 | }; 24 | static_assert(sizeof(union loadMeta) == 16, "Must be 16-byte aligned"); 25 | 26 | /****** global memory ******/ 27 | 28 | #define NET_UNPACK_MAX_QUEUE_DEPTH 16 // MAX_REQUESTS 29 | #define NET_UNPACK_MAX_SLICE_SIZE 4194304 // 4MB per Irecv call 30 | #define SLICE_PAGE_SIZE 4096 31 | #define NET_UNPACK_MAX_SLICE_PAGES \ 32 | (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2) // * 2 for slack, wasteful.. 
33 | 34 | struct netUnpackMeta { 35 | loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES]; 36 | uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH]; 37 | }; 38 | 39 | struct unpackNetDeviceHandle { 40 | struct netUnpackMeta *meta; // mapped 41 | void* bounce_buf; 42 | uint64_t head; 43 | }; 44 | 45 | /****** shared memory ******/ 46 | 47 | #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h 48 | #define NET_UNPACK_MAX_NPEERS 2 // The most you should have is 2 network peers per-group (indexed by index) 49 | #define WARP_SHM_PAGE_CNT 4 50 | #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta)) 51 | struct unpackShmem { 52 | void* bounce_buf; 53 | }; 54 | 55 | struct unpackGroupShmem { 56 | int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv 57 | uint64_t head[NET_UNPACK_MAX_NPEERS]; 58 | struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy 59 | }; 60 | 61 | #endif // NET_DEVICE_UNPACK_DEFS_H_ 62 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ 8 | 9 | enum cudaError_t { cudaErrorStubLibrary = 34 }; 10 | 11 | extern "C" { 12 | 13 | cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); 14 | cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } 15 | 16 | cudaError_t cudaUserObjectCreate(...) 
__attribute__((visibility("hidden"))) __attribute((weak)); 17 | cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } 18 | 19 | cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); 20 | cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } 21 | 22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); 23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } 24 | 25 | cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); 26 | cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | void dumpLine(int* values, int nranks, const char* prefix) { 10 | constexpr int line_length = 128; 11 | char line[line_length]; 12 | int num_width = snprintf(nullptr, 0, "%d", nranks-1); // safe as per "man snprintf" 13 | int n = snprintf(line, line_length, "%s", prefix); 14 | for (int i = 0; i < nranks && n < line_length-1; i++) { 15 | n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]); 16 | // At this point n may be more than line_length-1, so don't use it 17 | // for indexing into "line". 18 | } 19 | if (n >= line_length) { 20 | // Sprintf wanted to write more than would fit in the buffer. Assume 21 | // line_length is at least 4 and replace the end with "..." to 22 | // indicate that it was truncated. 
23 | snprintf(line+line_length-4, 4, "..."); 24 | } 25 | INFO(NCCL_INIT, "%s", line); 26 | } 27 | 28 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { 29 | for (int r=0; r 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | /** 32 | ** This is an exact copy of the IO Link types from rocm_smi.h 33 | ** These definitions are required since we do not know whether the 34 | ** code will also be compiled such that it includes the rocm_smi.h 35 | ** file or not. The values have to be identical however 36 | */ 37 | typedef enum _ARSMI_IO_LINK_TYPE { 38 | ARSMI_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type. 39 | ARSMI_IOLINK_TYPE_PCIEXPRESS, //!< PCI Express 40 | ARSMI_IOLINK_TYPE_XGMI, //!< XGMI 41 | ARSMI_IOLINK_TYPE_NUMIOLINKTYPES, //!< Number of IO Link types 42 | ARSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types 43 | } ARSMI_IO_LINK_TYPE; 44 | 45 | struct ARSMI_linkInfo { 46 | uint32_t src_node; 47 | uint32_t dst_node; 48 | uint64_t hops; 49 | ARSMI_IO_LINK_TYPE type; 50 | uint64_t weight; 51 | uint64_t min_bandwidth; 52 | uint64_t max_bandwidth; 53 | }; 54 | typedef struct ARSMI_linkInfo ARSMI_linkInfo; 55 | 56 | int ARSMI_init (void); 57 | int ARSMI_get_num_devices (uint32_t *num_devices); 58 | int ARSMI_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid); 59 | int ARSMI_topo_get_link_info(uint32_t dv_ind_src, uint32_t dv_ind_dst, 60 | ARSMI_linkInfo *info); 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/include/archinfo.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #ifndef ARCHINFO_H_ 24 | #define ARCHINFO_H_ 25 | 26 | #include 27 | 28 | /* 29 | #include 30 | #include 31 | */ 32 | 33 | void GcnArchNameFormat(char *gcnArchName, char* out); 34 | void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName); 35 | int GetGcnArchName(int deviceId, char* out); 36 | double GetDeviceWallClockRateInKhz(int deviceId); 37 | bool IsArchMatch(char const* arch, char const* target); 38 | 39 | #endif // ARCHINFO_H 40 | -------------------------------------------------------------------------------- /src/include/argcheck.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ARGCHECK_H_ 8 | #define NCCL_ARGCHECK_H_ 9 | 10 | #include "core.h" 11 | #include "info.h" 12 | 13 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); 14 | ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname); 15 | ncclResult_t ArgsCheck(struct ncclInfo* info); 16 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /src/include/bootstrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_BOOTSTRAP_H_ 8 | #define NCCL_BOOTSTRAP_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | struct ncclBootstrapHandle { 14 | uint64_t magic; 15 | union ncclSocketAddress addr; 16 | }; 17 | static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); 18 | 19 | ncclResult_t bootstrapNetInit(); 20 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); 21 | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); 22 | ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm); 23 | ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); 24 | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); 25 | ncclResult_t bootstrapSend(void* commState, 
int peer, int tag, void* data, int size); 26 | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); 27 | ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag); 28 | ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size); 29 | ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag); 30 | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); 31 | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); 32 | ncclResult_t bootstrapClose(void* commState); 33 | ncclResult_t bootstrapAbort(void* commState); 34 | #endif 35 | -------------------------------------------------------------------------------- /src/include/channel.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CHANNEL_H_ 8 | #define NCCL_CHANNEL_H_ 9 | #include "comm.h" 10 | #include "utils.h" 11 | 12 | #include 13 | 14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 15 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 16 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 17 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); 18 | 19 | inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { 20 | if (comm->nNodes > 1) { 21 | int nodeDelta = p2pRound/comm->maxLocalRanks; 22 | int localDelta = p2pRound%comm->maxLocalRanks; 23 | int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); 24 | base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; 25 | return base & 0xff; 26 | } else { 27 | return p2pRound & 0xff; 28 | } 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_CORE_H_ 9 | #define NCCL_CORE_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include // For std::min/std::max 16 | #include "nccl.h" 17 | 18 | #ifdef PROFAPI 19 | #define NCCL_API(ret, func, args...) 
\ 20 | __attribute__ ((visibility("default"))) \ 21 | __attribute__ ((alias(#func))) \ 22 | ret p##func (args); \ 23 | extern "C" \ 24 | __attribute__ ((visibility("default"))) \ 25 | __attribute__ ((weak)) \ 26 | ret func(args) 27 | #else 28 | #define NCCL_API(ret, func, args...) \ 29 | extern "C" \ 30 | __attribute__ ((visibility("default"))) \ 31 | ret func(args) 32 | #endif // end PROFAPI 33 | 34 | #include "debug.h" 35 | #include "checks.h" 36 | #include "rocmwrap.h" 37 | #include "alloc.h" 38 | #include "utils.h" 39 | #include "param.h" 40 | #ifdef NVTX_NO_IMPL 41 | #include "nvtx_stub.h" 42 | #else 43 | #include "nvtx.h" 44 | #endif 45 | 46 | #endif // end include guard 47 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t 11 | 12 | static int hexToInt(char c) { 13 | int v = c - '0'; 14 | if (v < 0) return -1; 15 | if (v > 9) v = 10 + c - 'a'; 16 | if ((v < 0) || (v > 15)) return -1; 17 | return v; 18 | } 19 | 20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) 21 | 22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { 23 | uint32_t cpumasks[CPU_SET_N_U32]; 24 | int m = CPU_SET_N_U32-1; 25 | cpumasks[m] = 0; 26 | for (int o=0; o=0; o--) { 49 | if (c == 0 && m8[o] == 0) continue; 50 | sprintf(str+c, "%02x", m8[o]); 51 | c+=2; 52 | if (o && o%4 == 0) { 53 | sprintf(str+c, ","); 54 | c++; 55 | } 56 | } 57 | str[c] = '\0'; 58 | return ncclSuccess; 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_DEBUG_H_ 8 | #define NCCL_INT_DEBUG_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_common.h" 12 | #include 13 | 14 | #include 15 | 16 | // Conform to pthread and NVTX standard 17 | #define NCCL_THREAD_NAMELEN 16 18 | 19 | extern int ncclDebugLevel; 20 | extern FILE *ncclDebugFile; 21 | 22 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 23 | 24 | // Let code temporarily downgrade WARN into INFO 25 | extern thread_local int ncclDebugNoWarn; 26 | extern char ncclLastError[]; 27 | 28 | #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 29 | #define WARN(...) 
ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 30 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 31 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) 32 | 33 | #ifdef ENABLE_TRACE 34 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 35 | #else 36 | #define TRACE(...) 37 | #endif 38 | 39 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 40 | 41 | void ncclResetDebugInit(); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | #include "utils.h" 14 | 15 | #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t) 16 | #define NCCL_LL128_ALIGNMENT_PER_WARP 480 17 | #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) 18 | #define NCCL_BYTES_ALIGNMENT 16 19 | 20 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize); 21 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 22 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); 23 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 24 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); 25 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 26 | ncclResult_t ncclLaunchFinish(struct ncclComm* 
comm); 27 | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); 28 | ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm); 29 | 30 | static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { 31 | return func == ncclFuncReduceScatter ? nRanks*count : count; 32 | } 33 | static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { 34 | return func == ncclFuncAllGather ? nRanks*count : count; 35 | } 36 | rccl_static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { 37 | return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; 38 | } 39 | 40 | #endif // End include guard 41 | -------------------------------------------------------------------------------- /src/include/git_version.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef RCCL_GIT_VERSION_H_ 8 | #define RCCL_GIT_VERSION_H_ 9 | 10 | extern const char *rcclGitHash; 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/include/hip_rocm_version_info.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #ifndef RCCL_HIP_ROCM_VERSION_INFO_H_ 24 | #define RCCL_HIP_ROCM_VERSION_INFO_H_ 25 | 26 | #define STR2(v) #v 27 | #define STR(v) STR2(v) 28 | 29 | // HIP version info retrieval 30 | #if ROCM_VERSION >= 50000 31 | #define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH) "-" HIP_VERSION_GITHASH 32 | // HIP Githash info not available in older ROCm versions < 5.0 33 | #elif ROCM_VERSION >= 40000 34 | #define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH) 35 | #else 36 | #define HIP_BUILD_INFO "Unknown" 37 | #endif 38 | 39 | // ROCm version info retrieval 40 | #if ROCM_VERSION >= 60000 41 | // rocm_version.h moved to rocm/include/rocm-core from ROCm 6.0 42 | #include 43 | #else 44 | // rocm-core/rocm_version.h not present in some ROCm versions < 6.0. 
45 | // So, including it from rocm/include/rocm_version.h 46 | #if ROCM_VERSION >= 50000 47 | #include 48 | //ROCM_BUILD_INFO not defined in ROCm Versions < 5.50 49 | #ifndef ROCM_BUILD_INFO 50 | #define ROCM_BUILD_INFO STR(ROCM_VERSION_MAJOR) "." STR(ROCM_VERSION_MINOR) "." STR(ROCM_VERSION_PATCH) 51 | #endif 52 | //ROCm version info not available for ROCm versions < 5.0 53 | #else 54 | #define ROCM_BUILD_INFO "Unknown" 55 | #endif 56 | #endif 57 | 58 | #endif -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INFO_H_ 9 | #define NCCL_INFO_H_ 10 | 11 | #include "nccl.h" 12 | #include "collectives.h" 13 | #include "core.h" 14 | #include "utils.h" 15 | 16 | // Used to pass NCCL call information between functions 17 | struct ncclInfo { 18 | ncclFunc_t coll; 19 | const char* opName; 20 | // NCCL Coll Args 21 | const void* sendbuff; 22 | void* recvbuff; 23 | size_t count; 24 | ncclDataType_t datatype; 25 | ncclRedOp_t op; 26 | int root; // peer for p2p operations 27 | ncclComm_t comm; 28 | cudaStream_t stream; 29 | // Algorithm details 30 | int chunkSteps; 31 | int sliceSteps; 32 | }; 33 | 34 | #endif -------------------------------------------------------------------------------- /src/include/ipcsocket.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See COPYRIGHT for license information 5 | */ 6 | 7 | #ifndef NCCL_IPCSOCKET_H 8 | #define NCCL_IPCSOCKET_H 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define NCCL_IPC_SOCKNAME_LEN 64 24 | 25 | struct ncclIpcSocket { 26 | int fd; 27 | char socketName[NCCL_IPC_SOCKNAME_LEN]; 28 | volatile uint32_t* abortFlag; 29 | }; 30 | 31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); 32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); 33 | ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); 34 | 35 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); 36 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); 37 | 38 | ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash); 39 | ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd); 40 | 41 | #endif /* NCCL_IPCSOCKET_H */ 42 | -------------------------------------------------------------------------------- /src/include/mnnvl.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_MNNVL_H_ 8 | #define NCCL_MNNVL_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | ncclResult_t ncclMnnvlCheck(struct ncclComm* comm); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_lifecycle.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License. 4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_LIFECYCLE_H_ 7 | #define MSCCL_LIFECYCLE_H_ 8 | 9 | #include "enqueue.h" 10 | 11 | #include "msccl/msccl_struct.h" 12 | 13 | bool mscclEnabled(); 14 | bool mscclForceEnabled(); 15 | 16 | void mscclSetIsCallerFlag(); 17 | void mscclClearIsCallerFlag(); 18 | bool mscclIsCaller(); 19 | 20 | /** 21 | * @brief mscclAvailable() is used to determine if msccl functionality is available 22 | * @param comm is an optional rccl communicator; if provided, uses the mscclStatus 23 | * from a global map (comm to mscclStatus) to determine if msccl is available. If not available 24 | * in the map, this invocation inserts a new key-value pair in the global map.
25 | * If comm == nullptr, on the first invocation it initializes a static thread-local variable 26 | * mscclStatus and uses the same object in subsequent calls from the same thread while comm is nullptr 27 | */ 28 | bool mscclAvailable(const ncclComm_t comm = nullptr); 29 | 30 | ncclResult_t mscclSchedulerInit(ncclComm_t comm, int* numChannelsRequired); 31 | 32 | ncclResult_t mscclInit(ncclComm_t comm); 33 | 34 | ncclResult_t mscclGroupStart(); 35 | 36 | ncclResult_t mscclEnqueueCheck( 37 | const void* sendbuff, const size_t sendcounts[], const size_t sdispls[], 38 | void* recvbuff, const size_t recvcounts[], const size_t rdispls[], 39 | size_t count, ncclDataType_t datatype, int root, int peer, ncclRedOp_t op, 40 | mscclFunc_t mscclFunc, ncclComm_t comm, hipStream_t stream); 41 | 42 | ncclResult_t mscclGroupEnd(); 43 | 44 | ncclResult_t mscclTeardown(const ncclComm_t comm); 45 | 46 | size_t mscclKernMaxLocalSize(); 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_scheduler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License.
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_SCHEDULER_H_ 7 | #define MSCCL_SCHEDULER_H_ 8 | /* Function kinds are numbered 0 to 10; mscclNumFuncs (11) is the number of kinds, not a kind itself */ 9 | typedef enum { mscclFuncReduce = 0, 10 | mscclFuncBroadcast = 1, 11 | mscclFuncAllReduce = 2, 12 | mscclFuncReduceScatter = 3, 13 | mscclFuncAllGather = 4, 14 | mscclFuncSend = 5, 15 | mscclFuncRecv = 6, 16 | mscclFuncGather = 7, 17 | mscclFuncScatter = 8, 18 | mscclFuncAllToAll = 9, 19 | mscclFuncAllToAllv = 10, 20 | mscclNumFuncs = 11 } mscclFunc_t; 21 | /* Argument block passed by pointer to the selectAlgo callback of mscclSchedulerInterface */ 22 | struct mscclSchedulerParam { 23 | const void* sendBuff; 24 | const size_t* sendCounts; 25 | const size_t* sDisPls; 26 | void* recvBuff; 27 | const size_t* recvCounts; 28 | const size_t* rDisPls; 29 | size_t count; 30 | ncclDataType_t dataType; 31 | int root; 32 | int peer; 33 | ncclRedOp_t op; 34 | mscclFunc_t func; 35 | int rank; 36 | int nRanks; 37 | bool scheduled; 38 | mscclAlgoHandle_t handle; 39 | uint64_t opCount; 40 | }; 41 | /* Table of entry points that a scheduler implementation fills in */ 42 | typedef struct { 43 | // Name of the scheduler (mainly for logs) 44 | const char* name; 45 | // Load all algorithms 46 | ncclResult_t (*init)(); 47 | // Select an algorithm 48 | ncclResult_t (*selectAlgo)(struct mscclSchedulerParam* param); 49 | // Unload all algorithms 50 | ncclResult_t (*teardown)(); 51 | } mscclSchedulerInterface; 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_setup.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License.
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_SETUP_H_ 7 | #define MSCCL_SETUP_H_ 8 | 9 | #include 10 | 11 | #include "comm.h" 12 | #include "msccl/msccl_struct.h" 13 | /* MSCCL setup entry points; every function below reports its outcome as an ncclResult_t */ 14 | ncclResult_t mscclGetCaptureStatus(const ncclComm_t comm, hipStream_t stream); 15 | 16 | ncclResult_t mscclSetupScratch(struct mscclAlgo* hostAlgo, hipStream_t stream); 17 | 18 | ncclResult_t mscclSetupSyncFlags(const ncclComm_t comm, hipStream_t stream); 19 | 20 | ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo,const ncclComm_t comm); 21 | 22 | ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType); 23 | 24 | ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, hipStream_t stream); 25 | /* Takes both the host-side and the device-side algorithm descriptors */ 26 | ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count, 27 | ncclDataType_t dataType, ncclRedOp_t op, struct mscclAlgo* hostAlgo, struct mscclAlgo* devAlgo, 28 | ncclComm_t comm, hipStream_t stream); 29 | /* Initialization and destruction of a mscclWorkFifoStatus passed by pointer */ 30 | ncclResult_t mscclInitWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus); 31 | 32 | ncclResult_t mscclDestroyWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus); 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/include/msccl/msccl_status.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License.
4 | ************************************************************************/ 5 | 6 | #ifndef MSCCL_STATUS_H_ 7 | #define MSCCL_STATUS_H_ 8 | 9 | #include "msccl/msccl_struct.h" 10 | /* MSCCL state queries keyed by communicator; the Get functions return references, not copies */ 11 | bool mscclInitialized(const ncclComm_t comm); 12 | 13 | void mscclSetInitialized(const ncclComm_t comm, bool initialized = true); 14 | 15 | void mscclRemoveRank(const ncclComm_t comm); 16 | 17 | mscclStatus& mscclGetStatus(const ncclComm_t comm); 18 | 19 | mscclSavedProxyArgs& mscclGetSavedProxyArgs(const ncclComm_t comm); 20 | 21 | mscclThreadLocalStatus& mscclGetThreadLocalStatus(); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/include/mscclpp/mscclpp_nccl.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * See LICENSE.txt and NOTICES.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef MSCCLPP_NCCL_H_ 8 | #define MSCCLPP_NCCL_H_ 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | 14 | typedef struct mscclppComm* mscclppComm_t; 15 | 16 | typedef ncclUniqueId mscclppUniqueId; 17 | /* NOTE(review): the template arguments of the unordered_map declarations below are missing in this copy - confirm against upstream */ 18 | /* An ncclUniqueId and a mscclppUniqueId will always be created together and used alternatively. These maps translate between them. */ 19 | extern std::unordered_map mscclpp_uniqueIdMap; 20 | extern std::unordered_map> mscclpp_uniqueIdReverseMap; 21 | extern std::unordered_map mscclpp_commToUniqueIdMap; 22 | extern std::unordered_map ncclCommToUniqueIdMap; 23 | 24 | extern "C" { 25 | /* See ncclGetUniqueId. */ 26 | ncclResult_t mscclpp_ncclGetUniqueId(mscclppUniqueId* uniqueId); 27 | 28 | /* See ncclCommInitRank. */ 29 | ncclResult_t mscclpp_ncclCommInitRank(mscclppComm_t* comm, int nranks, mscclppUniqueId commId, int rank); 30 | 31 | /* See ncclCommDestroy. 
*/ 32 | ncclResult_t mscclpp_ncclCommDestroy(mscclppComm_t comm); 33 | 34 | /* See ncclAllReduce. */ 35 | ncclResult_t mscclpp_ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, 36 | ncclDataType_t datatype, ncclRedOp_t op, mscclppComm_t comm, hipStream_t stream); 37 | 38 | /* See ncclAllGather. */ 39 | ncclResult_t mscclpp_ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, 40 | ncclDataType_t datatype, mscclppComm_t comm, hipStream_t stream); 41 | 42 | ncclResult_t mscclpp_ncclCommRegister(mscclppComm_t comm, void* buff, size_t size, void** handle); 43 | 44 | ncclResult_t mscclpp_ncclCommDeregister(mscclppComm_t comm, void* handle); 45 | 46 | bool mscclpp_BuffIsRegistered(mscclppComm_t comm, const void* buff); 47 | 48 | size_t mscclpp_BufferSize(mscclppComm_t comm, void* handle); 49 | 50 | ncclResult_t mscclpp_ncclMemAlloc(void** ptr, size_t size); 51 | 52 | ncclResult_t mscclpp_ncclMemFree(void* ptr); 53 | } 54 | /* Hash specialization and equality operator for ncclUniqueId, the pair an unordered container needs from its key type */ 55 | namespace std { 56 | template <> 57 | struct hash<ncclUniqueId> { 58 | size_t operator ()(const ncclUniqueId& uniqueId) const noexcept; 59 | }; 60 | } 61 | 62 | bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b); 63 | 64 | bool mscclppCommCompatible(ncclComm_t comm); 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /src/include/nccl_common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | /* Log severity levels, numbered 0 (none) to 5 (trace) */ 10 | typedef enum { 11 | NCCL_LOG_NONE = 0, 12 | NCCL_LOG_VERSION = 1, 13 | NCCL_LOG_WARN = 2, 14 | NCCL_LOG_INFO = 3, 15 | NCCL_LOG_ABORT = 4, 16 | NCCL_LOG_TRACE = 5 17 | } ncclDebugLogLevel; 18 | /* Subsystem bit flags, one bit each; NCCL_ALL sets every bit */ 19 | typedef enum { 20 | NCCL_INIT = 0x1, 21 | NCCL_COLL = 0x2, 22 | NCCL_P2P = 0x4, 23 | NCCL_SHM = 0x8, 24 | NCCL_NET = 0x10, 25 | NCCL_GRAPH = 0x20, 26 | NCCL_TUNING = 0x40, 27 | NCCL_ENV = 0x80, 28 | NCCL_ALLOC = 0x100, 29 | NCCL_CALL = 0x200, 30 | NCCL_PROXY = 0x400, 31 | NCCL_NVLS = 0x800, 32 | NCCL_BOOTSTRAP = 0x1000, 33 | NCCL_REG = 0x2000, 34 | NCCL_PROFILE = 0x4000, 35 | NCCL_RAS = 0x8000, 36 | NCCL_VERBS = 0x10000, 37 | NCCL_ALL = ~0 38 | } ncclDebugLogSubSys; 39 | 40 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 41 | /* FUNC_INDEX_TOTAL is parenthesized so the sum stays intact when the macro appears inside a larger expression */ 42 | #define NCCL_NUM_ONERANK 12 43 | #define FUNC_INDEX_TOTAL (656 + NCCL_NUM_ONERANK) 44 | 45 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 46 | typedef enum { 47 | ncclFuncBroadcast = 0, 48 | ncclFuncReduce = 1, 49 | ncclFuncAllGather = 2, 50 | ncclFuncReduceScatter = 3, 51 | ncclFuncAllReduce = 4, 52 | ncclFuncSendRecv = 5, 53 | ncclFuncSend = 6, 54 | ncclFuncRecv = 7, 55 | ncclFuncAllToAllPivot = 8, 56 | ncclNumFuncs = 9 57 | } ncclFunc_t; 58 | 59 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet (direct, chain)/NVLS (plain, tree)/PAT 60 | #define NCCL_ALGO_UNDEF -1 61 | #define NCCL_ALGO_TREE 0 62 | #define NCCL_ALGO_RING 1 63 | #define NCCL_ALGO_COLLNET_DIRECT 2 64 | #define NCCL_ALGO_COLLNET_CHAIN 3 65 | #define NCCL_ALGO_NVLS 4 66 | #define NCCL_ALGO_NVLS_TREE 5 67 | #define NCCL_ALGO_PAT 6 68 | 69 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 70 | #define NCCL_PROTO_UNDEF -1 71 | #define NCCL_PROTO_LL 0 72 | #define NCCL_PROTO_LL128 1 73 | #define NCCL_PROTO_SIMPLE 2 74 | 75 | 
#define NCCL_ALGO_PROTO_IGNORE -1.0 76 | 77 | #define NCCL_NUM_UNROLLS 3 // unroll factors 1/2/4 78 | #define NCCL_UNROLL_1 0 79 | #define NCCL_UNROLL_2 1 80 | #define NCCL_UNROLL_4 2 81 | 82 | #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8 83 | #endif 84 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "comm.h" 13 | #include "checks.h" 14 | 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); 18 | ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); 19 | ncclResult_t ncclNetInit(struct ncclComm* comm); 20 | ncclResult_t ncclNetFinalize(struct ncclComm* comm); 21 | int ncclNetVersion(struct ncclComm* comm); 22 | 23 | // Test whether the current GPU supports GPU Direct RDMA; the answer is written to *gdrSupport. 24 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); 25 | 26 | extern ncclNet_t ncclNetIb; 27 | extern ncclNet_t ncclNetSocket; 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NET_DEVICE_H_ 8 | #define NCCL_NET_DEVICE_H_ 9 | 10 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 11 | #define NCCL_NET_MTU_SIZE 4096 12 | 13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 16 | 17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 18 | 19 | typedef struct { 20 | ncclNetDeviceType netDeviceType; // Network offload type 21 | int netDeviceVersion; // Version number for network offload 22 | void* handle; 23 | size_t size; 24 | int needsProxyProgress; 25 | } ncclNetDeviceHandle_v7_t; 26 | // The v8, v9 and unversioned handle types below are all aliases of the v7 struct 27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; 29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/include/npkit/npkit.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License.
4 | ************************************************************************/ 5 | 6 | #ifndef NPKIT_H_ 7 | #define NPKIT_H_ 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include "npkit/npkit_event.h" 15 | #include "npkit/npkit_struct.h" 16 | #include "common.h" 17 | 18 | #define NPKIT_GET_GPU_TIMESTAMP wall_clock64 19 | #define NPKIT_GET_CPU_TIMESTAMP_FROM_BLOCK \ 20 | __atomic_load_n(reinterpret_cast((uint8_t *)ncclShmem.comm.cpuTimestamp + 128*blockIdx.x), __ATOMIC_RELAXED) 21 | 22 | /* All members are static: the class groups the event buffer limits, the Init/Dump/Shutdown calls and the event collection hooks */ 23 | class NpKit { 24 | public: 25 | static const uint64_t kNumGpuEventBuffers = 1024; 26 | 27 | static const uint64_t kNumCpuEventBuffers = 64; 28 | 29 | static ncclResult_t Init(int rank); 30 | 31 | static ncclResult_t Dump(const std::string& dump_dir); 32 | 33 | static ncclResult_t Shutdown(); 34 | 35 | static NpKitEventCollectContext* GetGpuEventCollectContexts(); 36 | /* Writes one event at ctx->event_buffer_head and advances the head; once the head reaches kMaxNumGpuEventsPerBuffer the event is silently dropped. A negative size is stored as 0. */ 37 | static inline __device__ void CollectGpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, 38 | NpKitEventCollectContext* ctx) { 39 | uint64_t event_buffer_head = ctx->event_buffer_head; 40 | if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { 41 | NpKitEvent& event = ctx->event_buffer[event_buffer_head]; 42 | event.fields.type = type; 43 | event.fields.size = size < 0 ? 
0 : size; 44 | event.fields.rsvd = rsvd; 45 | event.fields.timestamp = timestamp; 46 | ctx->event_buffer_head++; 47 | } 48 | } 49 | 50 | static void CollectCpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); 51 | 52 | static uint64_t *GetCpuTimestamp(); 53 | 54 | private: 55 | static void CpuTimestampUpdateThread(); 56 | 57 | // 64K * 512 * 16B = 512MB per GPU (NOTE(review): kNumGpuEventBuffers is 1024 in this class, so the 512 in this estimate may be stale - confirm) 58 | static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16; 59 | 60 | // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU 61 | static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21; 62 | 63 | static NpKitEvent** gpu_event_buffers_; 64 | static NpKitEvent** cpu_event_buffers_; 65 | 66 | static NpKitEventCollectContext* gpu_collect_contexts_; 67 | static NpKitEventCollectContext* cpu_collect_contexts_; 68 | static uint64_t* cpu_timestamp_; 69 | 70 | static uint64_t rank_; 71 | 72 | static std::thread* cpu_timestamp_update_thread_; 73 | static volatile bool cpu_timestamp_update_thread_should_stop_; 74 | }; 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /src/include/npkit/npkit_struct.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Microsoft Corporation. 3 | * Licensed under the MIT License.
4 | ************************************************************************/ 5 | 6 | #ifndef NPKIT_STRUCT_H_ 7 | #define NPKIT_STRUCT_H_ 8 | 9 | #include 10 | 11 | #pragma pack(push, 1) 12 | /* One event: two 64-bit words viewed either as raw bits or as fields (8-bit type, 32-bit size, 24-bit rsvd, then a 64-bit timestamp) */ 13 | union NpKitEvent { 14 | uint64_t bits[2]; 15 | struct { 16 | uint64_t type : 8; 17 | uint32_t size : 32; 18 | uint64_t rsvd : 24; 19 | uint64_t timestamp; 20 | } fields; 21 | }; 22 | /* An event buffer pointer together with the index used as its write head */ 23 | struct NpKitEventCollectContext { 24 | NpKitEvent* event_buffer; 25 | uint64_t event_buffer_head; 26 | }; 27 | 28 | #pragma pack(pop) 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtSemanticsScope.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /** 10 | * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. 11 | */ 12 | 13 | #ifndef NVTX_SEMANTIC_ID_SCOPE_V1 14 | #define NVTX_SEMANTIC_ID_SCOPE_V1 1 15 | 16 | /** 17 | * \brief Specify the NVTX scope for a payload entry. 18 | * 19 | * This allows the scope to be set for a specific value or counter in a payload. 20 | * The scope must be known at schema registration time. 21 | */ 22 | typedef struct nvtxSemanticsScope_v1 23 | { 24 | struct nvtxSemanticsHeader_v1 header; 25 | 26 | /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */ 27 | uint64_t scopeId; 28 | } nvtxSemanticsScope_t; 29 | 30 | #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 NVIDIA Corporation. 
All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_HELPER_MACROS_H 10 | #define NVTX_EXT_HELPER_MACROS_H 11 | 12 | /* Combine tokens. The two-level form lets macro arguments expand before they are pasted. */ 13 | #define _NVTX_EXT_CONCAT(a, b) a##b 14 | #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b) 15 | 16 | /* Resolves to the number of arguments passed (1 to 15): the arguments shift the descending number list so that the count lands in the 16th position. */ 17 | #define NVTX_EXT_NUM_ARGS(...) \ 18 | NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway) 19 | #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16 20 | 21 | /* Cast argument(s) to void to prevent unused variable warnings. */ 22 | #define _NVTX_EXT_VOIDIFY1(a1) (void)a1; 23 | #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2; 24 | #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3; 25 | #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4; 26 | 27 | /* Mark function arguments as unused. Only _NVTX_EXT_VOIDIFY1 to _NVTX_EXT_VOIDIFY4 are defined, so at most four arguments are supported. */ 28 | #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \ 29 | NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) 30 | 31 | #endif /* NVTX_EXT_HELPER_MACROS_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtTypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /* This header defines types which are used by the internal implementation 10 | * of NVTX and callback subscribers. 
API clients do not use these types, 11 | * so they are defined here instead of in nvToolsExt.h to clarify they are 12 | * not part of the NVTX client API. */ 13 | 14 | #ifndef NVTXEXTTYPES_H 15 | #define NVTXEXTTYPES_H 16 | 17 | #ifndef NVTX_EXT_TYPES_GUARD 18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. 19 | #endif 20 | /* Function pointer type: takes an export function id and returns an intptr_t */ 21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); 22 | /* One segment: an id, a slot count and a pointer to the function slots */ 23 | typedef struct nvtxExtModuleSegment_t 24 | { 25 | size_t segmentId; 26 | size_t slotCount; 27 | intptr_t* functionSlots; 28 | } nvtxExtModuleSegment_t; 29 | /* Module descriptor; a pointer to it is the sole argument of NvtxExtInitializeInjectionFunc_t */ 30 | typedef struct nvtxExtModuleInfo_t 31 | { 32 | uint16_t nvtxVer; 33 | uint16_t structSize; 34 | uint16_t moduleId; 35 | uint16_t compatId; 36 | size_t segmentsCount; 37 | nvtxExtModuleSegment_t* segments; 38 | NvtxExtGetExportFunction_t getExportFunction; 39 | const void* extInfo; 40 | } nvtxExtModuleInfo_t; 41 | 42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); 43 | 44 | #endif /* NVTXEXTTYPES_H */ -------------------------------------------------------------------------------- /src/include/nvtx_stub.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NVTX_STUB_H_ 8 | #define NCCL_NVTX_STUB_H_ 9 | 10 | #include 11 | 12 | struct nccl_domain{static constexpr char const* name{"NCCL"};}; 13 | /* Stub macros: each one below expands to nothing, except NVTX3_PAYLOAD which expands to its own arguments */ 14 | #define NVTX3_FUNC_RANGE_IN(domain) 15 | #define nvtxNameOsThreadA(syscall, thread) 16 | #define NVTX3_FUNC_WITH_PARAMS(N, T, P) 17 | #define NVTX3_PAYLOAD(...) 
__VA_ARGS__ 18 | #define NVTX3_RANGE(T) 19 | #define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) 20 | 21 | #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include "core.h" 16 | 17 | #if CUDART_VERSION < 12030 18 | // MNNVL: FABRIC handle support lifted from CUDA 12.3 19 | #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128) 20 | #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL) 21 | #define CU_IPC_HANDLE_SIZE 64 22 | typedef struct CUmemFabricHandle_st { 23 | unsigned char data[CU_IPC_HANDLE_SIZE]; 24 | } CUmemFabricHandle_v1; 25 | typedef CUmemFabricHandle_v1 CUmemFabricHandle; 26 | #endif 27 | 28 | typedef union { 29 | uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support 30 | CUmemFabricHandle handle; 31 | } ncclCuDesc; 32 | 33 | typedef union { 34 | // Legacy CUDA IPC 35 | cudaIpcMemHandle_t devIpc; 36 | // cuMem API support 37 | struct { 38 | ncclCuDesc cuDesc; 39 | CUmemGenericAllocationHandle memHandle; 40 | }; 41 | } ncclIpcDesc; 42 | // Registration kind passed to the IPC register-buffer calls 43 | enum ncclIpcRegType { 44 | NCCL_IPC_SENDRECV = 0, 45 | NCCL_IPC_COLLECTIVE = 1 46 | }; 47 | 48 | struct ncclIpcImpInfo { 49 | void* rmtRegAddr; 50 | bool legacyIpcCap; 51 | uintptr_t offset; 52 | }; 53 | // Registration record; this is the type ncclIpcDeregBuffer takes 54 | struct ncclIpcRegInfo { 55 | int peerRank; 56 | void* baseAddr; 57 | struct ncclProxyConnector* ipcProxyconn; 58 | struct ncclIpcImpInfo impInfo; 59 | }; 60 | 61 | 
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr); 62 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); 63 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); 64 | ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut); 65 | ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts); 66 | /* The graph variant above takes the local variant's parameters plus a cleanup queue pointer and element counter. Deregistration takes the ncclIpcRegInfo record rather than the user buffer. */ 67 | ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo); 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_PARAM_H_ 9 | #define NCCL_PARAM_H_ 10 | 11 | #include 12 | 13 | const char* userHomeDir(); 14 | void setEnvFile(const char* fileName); 15 | void initEnv(); 16 | const char *ncclGetEnv(const char *name); 17 | 18 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); 19 | /* Defines the accessor ncclParam##name(). It returns a function-local cached value and calls ncclLoadParam with the NCCL_-prefixed variable name while the cache still holds the INT64_MIN sentinel, which is why deftVal may not be INT64_MIN. */ 20 | #define NCCL_PARAM(name, env, deftVal) \ 21 | int64_t ncclParam##name() { \ 22 | constexpr int64_t uninitialized = INT64_MIN; \ 23 | static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ 24 | static int64_t cache = uninitialized; \ 25 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ 26 | ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ 27 | } \ 28 | return cache; \ 29 | } 30 | /* Declares the accessor that RCCL_PARAM defines (no trailing semicolon is supplied) */ 31 | #define RCCL_PARAM_DECLARE(name) \ 32 | int64_t rcclParam##name() 33 | /* Same accessor pattern as NCCL_PARAM but with the RCCL_ prefix. Also defines a rcclParamMutex##name mutex that the accessor body itself does not use. */ 34 | #define RCCL_PARAM(name, env, deftVal) \ 35 | pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \ 36 | int64_t rcclParam##name() { \ 37 | constexpr int64_t uninitialized = INT64_MIN; \ 38 | static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ 39 | static int64_t cache = uninitialized; \ 40 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ 41 | ncclLoadParam("RCCL_" env, deftVal, uninitialized, &cache); \ 42 | } \ 43 | return cache; \ 44 | } 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_H_ 8 | #define PROFILER_H_ 9 | 10 | #include 11 | #include "nccl_profiler.h" 12 | /* Forward declarations: the prototypes below only use pointers to these structs */ 13 | struct ncclProxyArgs; 14 | struct ncclKernelPlan; 15 | struct ncclTaskColl; 16 | struct ncclTaskP2p; 17 | struct ncclInfo; 18 | struct ncclComm; 19 | struct ncclProxyOp; 20 | 21 | // Plugin Init/Finalize Wrappers 22 | ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); 23 | ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm); 24 | 25 | // Profiler Start/Stop Group Wrappers 26 | ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan); 27 | ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan); 28 | 29 | // Profiler Start/Stop Task Events Wrappers 30 | ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan); 31 | ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan); 32 | 33 | // Proxy Op Start/Stop Event Wrappers 34 | ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args); 35 | ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args); 36 | ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); 37 | 38 | // Proxy Step Start/Stop Event Wrappers 39 | ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); 40 | ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); 41 | ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); 42 | 43 | // Proxy Control Start/Stop Events Wrappers 44 | ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); 45 | ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); 46 | 47 | // Record Event State Wrappers (one each for proxy op, proxy step and proxy control events) 48 | ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, 
int steps, size_t transSize, ncclProfilerEventState_t eState); 49 | ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); 50 | ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); 51 | 52 | // Profiler utility functions 53 | ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/include/ras.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_RAS_H_ 8 | #define NCCL_RAS_H_ 9 | 10 | #include "socket.h" 11 | 12 | // Structure used to communicate data about NCCL ranks from NCCL threads to RAS. 13 | struct rasRankInit { 14 | union ncclSocketAddress addr; 15 | pid_t pid; 16 | int cudaDev; 17 | int nvmlDev; 18 | }; 19 | // ncclRasAddRanks takes an array of rasRankInit entries together with its length 20 | ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); 21 | ncclResult_t ncclRasCommFini(const struct ncclComm* comm); 22 | ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks); 23 | 24 | #endif // !NCCL_RAS_H_ 25 | -------------------------------------------------------------------------------- /src/include/rccl_vars.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef RCCL_VARS_H_ 24 | #define RCCL_VARS_H_ 25 | 26 | #include "param.h" 27 | 28 | RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph 29 | 30 | #ifdef RCCL_EXPOSE_STATIC 31 | #define rccl_static 32 | #define rccl_static_inline 33 | #else 34 | #define rccl_static static 35 | #define rccl_static_inline static inline 36 | #endif 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/include/register.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_REGISTER_H_ 2 | #define NCCL_REGISTER_H_ 3 | 4 | #include "device.h" 5 | 6 | #include 7 | #include 8 | 9 | int64_t ncclParamLocalRegister(); 10 | int64_t ncclParamGraphRegister(); 11 | 12 | enum { 13 | NET_REG_COMPLETE = 0x01, 14 | NVLS_REG_COMPLETE = 0x02, 15 | NVLS_REG_POSSIBLE = 0x04, 16 | NVLS_REG_NO_SUPPORT = 0x08, 17 | COLLNET_REG_COMPLETE = 0x10, 18 | IPC_REG_COMPLETE = 0x20 19 | }; 20 | 21 | struct ncclPeerRegIpcAddr { 22 | uintptr_t* devPeerRmtAddrs; 23 | uintptr_t* hostPeerRmtAddrs; 24 | }; 25 | 26 | struct ncclRegNetHandles { 27 | void* handle; 28 | struct ncclProxyConnector* proxyConn; 29 | struct ncclRegNetHandles* next; 30 | }; 31 | 32 | struct ncclReg { 33 | // common attributes 34 | size_t pages; 35 | int localRefs; 36 | int graphRefs; 37 | uintptr_t addr; 38 | uint32_t state; 39 | // net reg 40 | struct ncclRegNetHandles* netHandleHead; 41 | // nvls reg 42 | uintptr_t baseAddr; 43 | size_t baseSize; 44 | CUdeviceptr regAddr; 45 | size_t regSize; 46 | int dev; 47 | CUmemGenericAllocationHandle mcHandle; 48 | uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ 49 | // collnet reg 50 | void* collnetHandle; 51 | struct ncclProxyConnector* collnetProxyconn; 52 | // general ipc reg 53 | struct ncclPeerRegIpcAddr regIpcAddrs; 54 | struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; 55 | }; 56 | 57 | 
struct ncclRegCache { 58 | struct ncclReg **slots; 59 | int capacity, population; 60 | uintptr_t pageSize; 61 | }; 62 | 63 | ncclResult_t ncclRegCleanup(struct ncclComm* comm); 64 | ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); 65 | ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); 66 | ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); 67 | ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /src/include/rocm_smi_wrap.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef ROCM_SMI_WRAP_H_ 24 | #define ROCM_SMI_WRAP_H_ 25 | 26 | #include "rocm_smi/rocm_smi.h" 27 | #ifdef USE_ROCM_SMI64CONFIG 28 | #include "rocm_smi/rocm_smi64Config.h" 29 | #endif 30 | #include "nccl.h" 31 | 32 | ncclResult_t rocm_smi_init(); 33 | ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs); 34 | ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len); 35 | ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex); 36 | ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count); 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_SHM_H_ 2 | #define NCCL_SHM_H_ 3 | 4 | #include "comm.h" 5 | 6 | struct shmLegacyIpc { 7 | char shmSuffix[7]; 8 | ncclShmHandle_t handle; 9 | size_t shmSize; 10 | }; 11 | 12 | struct shmCuIpc { 13 | union { 14 | CUmemFabricHandle handle; 15 | CUmemGenericAllocationHandle data; 16 | }; 17 | int tpProxyRank; 18 | void *ptr; 19 | size_t size; 20 | }; 21 | 22 | struct shmIpcDesc { 23 | union 24 | { 25 | struct shmLegacyIpc shmli; 26 | struct shmCuIpc shmci; 27 | }; 28 | bool legacy; 29 | }; 30 | 31 | typedef struct shmIpcDesc ncclShmIpcDesc_t; 32 | 33 | ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); 34 | ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); 35 | ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/include/shmutils.h: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHMUTILS_H_ 8 | #define NCCL_SHMUTILS_H_ 9 | 10 | #include "nccl.h" 11 | 12 | typedef void* ncclShmHandle_t; 13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); 14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle); 15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); 16 | 17 | struct ncclShmemCollBuff { 18 | volatile size_t *cnt[2]; 19 | volatile void *ptr[2]; 20 | int round; 21 | size_t maxTypeSize; 22 | }; 23 | 24 | ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/include/signals.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef RCCL_SIGNALS_H_ 8 | #define RCCL_SIGNALS_H_ 9 | 10 | void RegisterSignalHandlers(); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/include/timer.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TIMER_H_ 8 | #define NCCL_TIMER_H_ 9 | #if ENABLE_TIMER 10 | #include 11 | #include 12 | #include 13 | static double freq = -1; 14 | static void calibrate() { 15 | struct timeval tv; 16 | gettimeofday(&tv, NULL); 17 | uint64_t timeCycles = __rdtsc(); 18 | double time = - tv.tv_sec*1E6 - tv.tv_usec; 19 | uint64_t total = 0ULL; 20 | for (int i=0; i<10000; i++) total += __rdtsc(); 21 | gettimeofday(&tv, NULL); 22 | timeCycles = __rdtsc() - timeCycles; 23 | time += tv.tv_sec*1E6 + tv.tv_usec; 24 | freq = timeCycles/time; 25 | } 26 | static inline double gettime() { 27 | if (freq == -1) calibrate(); 28 | return __rdtsc()/freq; 29 | } 30 | static uint64_t counts[8]; 31 | static double times[8]; 32 | static double startTimes[8]; 33 | #define TIME_START(index) do { \ 34 | counts[index]++; \ 35 | startTimes[index] = gettime(); \ 36 | } while (0) 37 | 38 | #define TIME_STOP(index) do { \ 39 | times[index] += gettime() - startTimes[index]; \ 40 | } while (0) 41 | 42 | #define TIME_CANCEL(index) do { \ 43 | counts[index]--; \ 44 | } while (0) 45 | 46 | #define TIME_PRINT(name) do { \ 47 | printf("%s stats", name); \ 48 | for (int i=0; i<8; i++) { \ 49 | if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ 50 | counts[i] = 0; \ 51 | } \ 52 | printf("\n"); \ 53 | } while (0) 54 | #else 55 | #define TIME_START(index) do {} while(0) 56 | #define TIME_STOP(index) do {} while(0) 57 | #define TIME_CANCEL(index) do {} while(0) 58 | #define TIME_PRINT(name) 59 | #endif 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/trees.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_TREES_H_ 8 | #define NCCL_TREES_H_ 9 | 10 | ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); 11 | ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/include/tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_INT_TUNER_H_ 9 | #define NCCL_INT_TUNER_H_ 10 | 11 | #include "nccl_tuner.h" 12 | #include "comm.h" 13 | 14 | // Tuning plugin to override NCCL's default algorithm/protocol tuning. 15 | 16 | // Attempts to load NCCL tuner from environment variable. 17 | // Returns ncclSuccess if the correct tuner symbol has been found and 18 | // successfully loaded. Otherwise returns an error and also logs the error. 19 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm); 20 | 21 | // Cleans up NCCL tuner plugin. 
22 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm); 23 | #endif 24 | -------------------------------------------------------------------------------- /src/init_nvtx.cc: -------------------------------------------------------------------------------- 1 | #include "nccl.h" 2 | #include "nvtx.h" 3 | 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { 5 | {"Sum", ncclSum, 0}, 6 | {"Product", ncclProd, 0}, 7 | {"Max", ncclMax, 0}, 8 | {"Min", ncclMin, 0}, 9 | {"Avg", ncclAvg, 0} 10 | }; 11 | 12 | // Must be called before the first call to any reduction operation. 13 | void initNvtxRegisteredEnums() { 14 | #ifndef NVTX_NO_IMPL 15 | // Register schemas and strings 16 | constexpr const nvtxPayloadEnumAttr_t eAttr { 17 | .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | 18 | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, 19 | .name = NULL, 20 | .entries = NvtxEnumRedSchema, 21 | .numEntries = std::extent::value, 22 | .sizeOfEnum = sizeof(ncclRedOp_t), 23 | .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP, 24 | .extension = nullptr 25 | }; 26 | 27 | nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); 28 | #endif 29 | } 30 | -------------------------------------------------------------------------------- /src/misc/api_trace.c: -------------------------------------------------------------------------------- 1 | // 2 | // This file just ensures that api_trace.h is C-compatible 3 | // 4 | 5 | #if defined(__cplusplus) 6 | # error "C source file compiling as C++" 7 | #endif 8 | 9 | #include "api_trace.h" 10 | -------------------------------------------------------------------------------- /src/misc/mscclpp/mscclpp_nccl.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
3 | * 4 | * See LICENSE.txt and NOTICES.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "mscclpp/mscclpp_nccl.h" 8 | 9 | std::unordered_map mscclpp_uniqueIdMap; 10 | std::unordered_map> mscclpp_uniqueIdReverseMap; 11 | std::unordered_map mscclpp_commToUniqueIdMap; 12 | std::unordered_map ncclCommToUniqueIdMap; 13 | -------------------------------------------------------------------------------- /src/misc/mscclpp/mscclpp_nccl_syms.txt: -------------------------------------------------------------------------------- 1 | # > ${PROJECT_BINARY_DIR}/mscclpp_nccl_syms.txt; 2 | # for sym in $(nm -fjust-symbols ${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a | grep "^nccl"); do 3 | # echo $sym mscclpp_$sym>> ${PROJECT_BINARY_DIR}/mscclpp_nccl_syms.txt; 4 | # done 5 | ncclAllGather mscclpp_ncclAllGather 6 | ncclAllReduce mscclpp_ncclAllReduce 7 | ncclAllToAll mscclpp_ncclAllToAll 8 | ncclBcast mscclpp_ncclBcast 9 | ncclBroadcast mscclpp_ncclBroadcast 10 | ncclCommAbort mscclpp_ncclCommAbort 11 | ncclCommCount mscclpp_ncclCommCount 12 | ncclCommCuDevice mscclpp_ncclCommCuDevice 13 | ncclCommDestroy mscclpp_ncclCommDestroy 14 | ncclCommFinalize mscclpp_ncclCommFinalize 15 | ncclCommGetAsyncError mscclpp_ncclCommGetAsyncError 16 | ncclCommInitAll mscclpp_ncclCommInitAll 17 | ncclCommInitRank mscclpp_ncclCommInitRank 18 | ncclCommInitRankConfig mscclpp_ncclCommInitRankConfig 19 | ncclCommSplit mscclpp_ncclCommSplit 20 | ncclCommUserRank mscclpp_ncclCommUserRank 21 | ncclGetErrorString mscclpp_ncclGetErrorString 22 | ncclGetLastError mscclpp_ncclGetLastError 23 | ncclGetUniqueId mscclpp_ncclGetUniqueId 24 | ncclGetVersion mscclpp_ncclGetVersion 25 | ncclGroupEnd mscclpp_ncclGroupEnd 26 | ncclGroupStart mscclpp_ncclGroupStart 27 | ncclRecv mscclpp_ncclRecv 28 | ncclRedOpCreatePreMulSum mscclpp_ncclRedOpCreatePreMulSum 29 | ncclRedOpDestroy mscclpp_ncclRedOpDestroy 30 | ncclReduce mscclpp_ncclReduce 31 | 
ncclReduceScatter mscclpp_ncclReduceScatter 32 | ncclSend mscclpp_ncclSend 33 | ncclCommRegister mscclpp_ncclCommRegister 34 | ncclCommDeregister mscclpp_ncclCommDeregister 35 | ncclMemAlloc mscclpp_ncclMemAlloc 36 | ncclMemFree mscclpp_ncclMemFree 37 | -------------------------------------------------------------------------------- /src/misc/nvmlwrap_stub.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include "nvmlwrap.h" 9 | 10 | ncclResult_t ncclNvmlSymbols(void) { 11 | return ncclSuccess; 12 | } 13 | 14 | ncclResult_t ncclNvmlInit(void) { 15 | return ncclSuccess; 16 | } 17 | 18 | ncclResult_t ncclNvmlShutdown(void) { 19 | return ncclSuccess; 20 | } 21 | 22 | ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { 23 | return ncclSystemError; 24 | } 25 | 26 | ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { 27 | *index = 0; 28 | return ncclSuccess; 29 | } 30 | 31 | ncclResult_t ncclNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { 32 | return ncclSystemError; 33 | } 34 | 35 | ncclResult_t ncclNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { 36 | *minorNumber = 0; 37 | return ncclSuccess; 38 | } 39 | 40 | ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { 41 | return ncclSystemError; 42 | } 43 | 44 | ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { 45 | return ncclSystemError; 46 | } 47 | 48 | ncclResult_t 
ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, 49 | nvmlNvLinkCapability_t capability, unsigned int *capResult) { 50 | return ncclSystemError; 51 | } 52 | 53 | ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { 54 | *major = *minor = 1; 55 | return ncclSuccess; 56 | } 57 | -------------------------------------------------------------------------------- /src/misc/signals.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifdef HAVE_BFD 8 | #include "BfdBacktrace.hpp" 9 | #endif 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "param.h" 16 | #include "debug.h" 17 | #include 18 | 19 | void sig_handler(int signum) 20 | { 21 | printf("\n[Process: %d] Inside handler function signal: %s (%d)\n", getpid(), strsignal(signum), signum); 22 | 23 | #ifdef HAVE_BFD 24 | void *addresses[BACKTRACE_MAX]; 25 | int num_addresses = backtrace(addresses, BACKTRACE_MAX); 26 | struct backtrace_file file; 27 | backtrace_line line; 28 | backtrace_h bckt; 29 | bckt.size = 0; 30 | 31 | for (int i = 0; i < num_addresses; ++i) 32 | { 33 | file.dl.address = (unsigned long)addresses[i]; 34 | if (dl_lookup_address(&file.dl) && load_file(&file)) 35 | { 36 | bckt.size += get_line_info(&file, 1, 37 | bckt.lines + bckt.size, 38 | BACKTRACE_MAX - bckt.size); 39 | unload_file(&file); 40 | } 41 | } 42 | 43 | for (int i=0; i signalsToCatch = {SIGILL, SIGBUS, SIGFPE, SIGSEGV, SIGUSR2}; 77 | 78 | for (auto signum : signalsToCatch) 79 | { 80 | if (signal(signum, sig_handler) == SIG_ERR) 81 | { 82 | INFO(NCCL_INIT, "Unable to register signal handler for %s\n", strsignal(signum)); 83 | } 84 
| } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /src/register/sendrecv_reg.cc: -------------------------------------------------------------------------------- 1 | #include "register.h" 2 | #include "transport.h" 3 | 4 | ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue) { 5 | ncclResult_t ret = ncclSuccess; 6 | 7 | *regFlag = 0; 8 | if (comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) { 9 | if (comm->planner.persistent && ncclParamGraphRegister()) { 10 | ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL); 11 | } 12 | if (*regFlag == 0 && ncclParamLocalRegister()) { 13 | ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle); 14 | } 15 | } 16 | return ret; 17 | } 18 | 19 | ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { 20 | ncclResult_t ret = ncclSuccess; 21 | uintptr_t offset = 0; 22 | uintptr_t* peerRmtAddrs = NULL; 23 | 24 | *regFlag = 0; 25 | if (comm->planner.persistent && ncclParamGraphRegister()) { 26 | ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast(cleanupQueue), NULL); 27 | } 28 | if (*regFlag == 0 && 
ncclParamLocalRegister()) { 29 | ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); 30 | } 31 | 32 | if (*regFlag) 33 | *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); 34 | return ret; 35 | } 36 | -------------------------------------------------------------------------------- /test/common/CallCollectiveForked.hpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef CALLCOLLECTIVEFORKED_H 8 | #define CALLCOLLECTIVEFORKED_H 9 | 10 | #include 11 | 12 | namespace RcclUnitTesting 13 | { 14 | void callCollectiveForked(int nranks, int collID, const std::vector& sendBuff, std::vector& recvBuff, const std::vector& expected, bool use_managed_mem = false); 15 | } 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /test/common/PrepDataFuncs.hpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | #pragma once 7 | #include "ErrCode.hpp" 8 | 9 | namespace RcclUnitTesting 10 | { 11 | class CollectiveArgs; 12 | 13 | // Checks that enough memory has been allocated 14 | ErrCode CheckAllocation(CollectiveArgs const& collArgs); 15 | 16 | // Default PrepareData functions 17 | // PrepareData functions are responsible for setting up input / expected for the given collArgs 18 | ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs); 19 | ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs); 20 | ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce); 21 | ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather); 22 | ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs); 23 | ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs); 24 | ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs); 25 | ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs); 26 | ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs); 27 | ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs); 28 | } 29 | -------------------------------------------------------------------------------- /test/common/RcclMockFuncs.hpp: -------------------------------------------------------------------------------- 1 | #include "info.h" 2 | #include "comm.h" 3 | 4 | void ncclDebugLog(ncclDebugLogLevel, unsigned long, char const*, int, char const*, ...) {}; 5 | ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { 6 | return ncclSuccess; 7 | } 8 | -------------------------------------------------------------------------------- /test/common/StandaloneUtils.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022 Advanced Micro Devices, Inc. 
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | #include "CollectiveArgs.hpp" 7 | #include "StandaloneUtils.hpp" 8 | #include 9 | #include 10 | 11 | 12 | namespace RcclUnitTesting 13 | { 14 | 15 | std::string executeCommand(const char* cmd) { 16 | std::string result; 17 | FILE* pipe = popen(cmd, "r"); 18 | 19 | if (!pipe) { 20 | std::cerr << "Error executing command: " << cmd << std::endl; 21 | return result; 22 | } 23 | 24 | char buffer[128]; 25 | while (!feof(pipe)) { 26 | if (fgets(buffer, 128, pipe) != NULL) { 27 | result += buffer; 28 | } 29 | } 30 | 31 | pclose(pipe); 32 | return result; 33 | } 34 | 35 | std::vector splitString(const std::string& str, char delimiter) { 36 | std::vector result; 37 | std::istringstream iss(str); 38 | 39 | std::string line; 40 | while(std::getline(iss, line, delimiter)) { 41 | result.push_back(line); 42 | } 43 | 44 | return result; 45 | } 46 | 47 | 48 | ArchInfo parseMetadata(const std::vector& list) { 49 | ArchInfo archInfo; 50 | KernelInfo currKernelInfo; 51 | 52 | std::regex amdhsaTargetRegex("amdhsa.target:\\s+(?:'?)amdgcn-amd-amdhsa--(\\w+)(?:'?)"); 53 | std::regex kernelNameRegex("\\.name:\\s+(\\w+)"); 54 | std::regex privateSegmentSizeRegex("\\.private_segment_fixed_size:\\s+(\\d+)"); 55 | 56 | for (const auto& line : list) { 57 | std::smatch match; 58 | 59 | if (std::regex_search(line, match, amdhsaTargetRegex)) { 60 | archInfo.archName = match[1]; 61 | } else if (std::regex_search(line, match, kernelNameRegex)) { 62 | currKernelInfo.name = match[1]; 63 | } else if (std::regex_search(line, match, privateSegmentSizeRegex)) { 64 | currKernelInfo.privateSegmentFixedSize = std::stoi(match[1]); 65 | } 66 | 67 | if (!currKernelInfo.name.empty() && currKernelInfo.privateSegmentFixedSize != 0) { 68 | archInfo.kernels.push_back(currKernelInfo); 69 | currKernelInfo = {}; // Empty kernelInfo 70 | } 71 | } 72 | 73 | return 
archInfo; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /test/common/StandaloneUtils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STANDALONE_UTILS_H 2 | #define STANDALONE_UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define HIPCALL(cmd) \ 10 | do { \ 11 | hipError_t error = (cmd); \ 12 | if (error != hipSuccess) \ 13 | { \ 14 | printf("Encountered HIP error (%s) at line %d in file %s\n", \ 15 | hipGetErrorString(error), __LINE__, __FILE__); \ 16 | exit(-1); \ 17 | } \ 18 | } while (0) 19 | 20 | #define NCCLCHECK(cmd) do { \ 21 | ncclResult_t res = cmd; \ 22 | if (res != ncclSuccess) { \ 23 | printf("NCCL failure %s:%d '%s'\n", \ 24 | __FILE__,__LINE__,ncclGetErrorString(res)); \ 25 | } \ 26 | } while(0) 27 | 28 | #define MAX_STACK_SIZE 570 29 | 30 | #ifdef ENABLE_LL128 31 | #define MAX_STACK_SIZE_gfx90a 360 32 | #else 33 | #define MAX_STACK_SIZE_gfx90a MAX_STACK_SIZE 34 | #endif 35 | 36 | namespace RcclUnitTesting 37 | { 38 | struct KernelInfo { 39 | std::string name; 40 | int privateSegmentFixedSize = 0; 41 | }; 42 | 43 | struct ArchInfo { 44 | std::string archName; 45 | std::vector kernels; 46 | }; 47 | 48 | std::string executeCommand(const char* cmd); 49 | 50 | std::vector splitString(const std::string& str, char delimiter); 51 | 52 | ArchInfo parseMetadata(const std::vector& list); 53 | } 54 | #endif 55 | -------------------------------------------------------------------------------- /test/common/main.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | #include "EnvVars.hpp" 9 | #include "TestBed.hpp" 10 | int main(int argc, char **argv) 11 | { 12 | ::testing::InitGoogleTest(&argc, argv); 13 | RcclUnitTesting::EnvVars ev; 14 | ev.ShowConfig(); 15 | int retCode = RUN_ALL_TESTS(); 16 | printf("[ INFO ] Total executed cases: %d\n", RcclUnitTesting::TestBed::NumTestsRun()); 17 | 18 | // Show timing information 19 | 20 | if (ev.showTiming) 21 | { 22 | size_t totalTimeMsec = 0; 23 | fflush(stdout); 24 | printf("[ TIMING ] %-20s: %-20s: %10s ms (%s)\n", "TEST SUITE", "TEST NAME", "TIME", "STATUS"); 25 | auto unitTest = ::testing::UnitTest::GetInstance(); 26 | for (int i = 0; i < unitTest->total_test_suite_count(); i++) 27 | { 28 | auto suiteInfo = unitTest->GetTestSuite(i); 29 | if (!suiteInfo->should_run()) continue; 30 | 31 | for (int j = 0; j < suiteInfo->total_test_count(); j++) 32 | { 33 | auto testInfo = suiteInfo->GetTestInfo(j); 34 | if (!testInfo->should_run()) continue; 35 | auto testResult = testInfo->result(); 36 | if (testResult->Skipped()) continue; 37 | printf("[ TIMING ] %-20s: %-20s: %10.2f sec (%4s)\n", testInfo->test_suite_name(), testInfo->name(), testResult->elapsed_time() / 1000.0, testResult->Passed() ? "PASS" : "FAIL"); 38 | } 39 | printf("[ TIMING ] %-20s: %-20s: %10.2f sec (%4s)\n", suiteInfo->name(), "TOTAL", suiteInfo->elapsed_time() / 1000.0, suiteInfo->Passed() ? 
"PASS" : "FAIL"); 40 | totalTimeMsec += suiteInfo->elapsed_time(); 41 | } 42 | printf("[ TIMING ] Total time: %10.2f minutes\n", totalTimeMsec / (60 * 1000.0)); 43 | } 44 | return retCode; 45 | } 46 | -------------------------------------------------------------------------------- /toolchain-linux.cmake: -------------------------------------------------------------------------------- 1 | 2 | if (DEFINED ENV{ROCM_PATH}) 3 | set(rocm_bin "$ENV{ROCM_PATH}/bin") 4 | else() 5 | set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.") 6 | set(rocm_bin "/opt/rocm/bin") 7 | endif() 8 | 9 | if (NOT DEFINED ENV{CXX}) 10 | set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler") 11 | else() 12 | set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler") 13 | endif() 14 | 15 | if (NOT DEFINED ENV{CXXFLAGS}) 16 | set(CMAKE_CXX_FLAGS_DEBUG "-g -O1") 17 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 18 | endif() 19 | 20 | if (NOT DEFINED ENV{CC}) 21 | set(CMAKE_C_COMPILER "${rocm_bin}/amdclang" CACHE PATH "Path to the C compiler") 22 | else() 23 | set(CMAKE_C_COMPILER "$ENV{CC}" CACHE PATH "Path to the C compiler") 24 | endif() 25 | 26 | if (NOT DEFINED ENV{CFLAGS}) 27 | set(CMAKE_C_FLAGS_DEBUG "-g -O1") 28 | set(CMAKE_C_FLAGS_RELEASE "-O3") 29 | endif() 30 | -------------------------------------------------------------------------------- /tools/EmptyKernelTest/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROCM_PATH ?= /opt/rocm 3 | CUDA_PATH ?= /usr/local/cuda 4 | 5 | HIPCC=$(ROCM_PATH)/bin/hipcc 6 | NVCC=$(CUDA_PATH)/bin/nvcc 7 | 8 | # Compile EmptyKernelTestCuda if nvcc detected 9 | ifeq ("$(shell test -e $(NVCC) && echo found)", "found") 10 | EXE=./EmptyKernelTestCuda 11 | else 12 | EXE=./EmptyKernelTest 13 | endif 14 | 15 | all: $(EXE) 16 | 17 | ./EmptyKernelTest: EmptyKernelTest.cpp 18 | $(HIPCC) EmptyKernelTest.cpp -o EmptyKernelTest 19 | 20 | ./EmptyKernelTestCuda: 
EmptyKernelTest.cpp 21 | $(NVCC) EmptyKernelTest.cpp -x cu -o EmptyKernelTestCuda 22 | 23 | 24 | clean: 25 | rm -f ./EmptyKernelTest ./EmptyKernelTestCuda 26 | 27 | -------------------------------------------------------------------------------- /tools/GraphBench/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 2 | 3 | # Set to where RCCL is installed 4 | RCCL_INSTALL=../../build/release 5 | 6 | HIP_PATH?= $(wildcard /opt/rocm) 7 | ifeq (,$(HIP_PATH)) 8 | HIP_PATH=../../.. 9 | endif 10 | HIPCC=$(HIP_PATH)/bin/hipcc 11 | 12 | EXE=GraphBench 13 | CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL)/include -L$(RCCL_INSTALL) -lrccl 14 | 15 | all: $(EXE) 16 | 17 | $(EXE): $(EXE).cpp $(shell find -regex ".*\.\hpp") 18 | $(HIPCC) $(CXXFLAGS) $< -o $@ 19 | 20 | test: $(EXE) 21 | LD_LIBRARY_PATH=$(RCCL_INSTALL) RCCL_ENABLE_HIPGRAPH=1 ./$(EXE) 22 | 23 | testInfo: $(EXE) 24 | NCCL_DEBUG=INFO LD_LIBRARY_PATH=$(RCCL_INSTALL) RCCL_ENABLE_HIPGRAPH=1 ./$(EXE) 25 | clean: 26 | rm -f *.o $(EXE) 27 | -------------------------------------------------------------------------------- /tools/HelloRccl/HelloRccl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef HELLORCCL_HPP 24 | #define HELLORCCL_HPP 25 | #include 26 | 27 | #define HIP_CALL(cmd) \ 28 | do { \ 29 | hipError_t error = (cmd); \ 30 | if (error != hipSuccess) \ 31 | { \ 32 | std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \ 33 | << __LINE__ << " in file " << __FILE__ << "\n"; \ 34 | exit(-1); \ 35 | } \ 36 | } while (0) 37 | 38 | #define NCCL_CALL(cmd) \ 39 | do { \ 40 | ncclResult_t error = (cmd); \ 41 | if (error != ncclSuccess) \ 42 | { \ 43 | std::cerr << "Encountered NCCL error (" << ncclGetErrorString(error) << ") at line " \ 44 | << __LINE__ << " in file " << __FILE__ << "\n"; \ 45 | exit(-1); \ 46 | } \ 47 | } while (0) 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /tools/HelloRccl/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 2 | 3 | # Set to where RCCL is installed 4 | RCCL_INSTALL=../../build/release 5 | 6 | HIP_PATH?= $(wildcard /opt/rocm) 7 | ifeq (,$(HIP_PATH)) 8 | HIP_PATH=../../.. 
9 | endif 10 | HIPCC=$(HIP_PATH)/bin/hipcc 11 | 12 | EXE=HelloRccl 13 | CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL) -L$(RCCL_INSTALL) -lrccl 14 | 15 | all: $(EXE) 16 | 17 | $(EXE): $(EXE).cpp $(shell find -regex ".*\.\hpp") 18 | $(HIPCC) $(CXXFLAGS) $< -o $@ 19 | 20 | clean: 21 | rm -f *.o $(EXE) 22 | -------------------------------------------------------------------------------- /tools/HelloRccl/runTest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RCCL_INSTALL=../../build/release 3 | EXE=$PWD/HelloRccl 4 | LDPATH=$LD_LIBRARY_PATH:$RCCL_INSTALL 5 | 6 | echo "Single process:" 7 | NCCL_DEBUG=INFO LD_LIBRARY_PATH=$LDPATH $EXE 4 8 | 9 | echo "Multi-process:" 10 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 0 & 11 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 1 & 12 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 2 & 13 | NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 3 14 | -------------------------------------------------------------------------------- /tools/JitterBench/Common.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | 25 | #define HIP_CALL(cmd) \ 26 | do { \ 27 | hipError_t error = (cmd); \ 28 | if (error != hipSuccess) \ 29 | { \ 30 | std::cout << "Encountered HIP error (" << hipGetErrorString(error) \ 31 | << ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \ 32 | exit(-1); \ 33 | } \ 34 | } while (0) 35 | 36 | // Macro for collecting HW_REG_XCC_ID 37 | #if defined(__gfx942__) || defined(__gfx950__) 38 | #define GetXccId(val) \ 39 | asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); 40 | #else 41 | #define GetXccId(val) \ 42 | val = 0 43 | #endif 44 | 45 | // Macro for collecting HW_REG_HW_ID 46 | #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__) 47 | #define GetHwId(val) \ 48 | val = 0 49 | #else 50 | #define GetHwId(val) \ 51 | asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (val)); 52 | #endif 53 | -------------------------------------------------------------------------------- /tools/JitterBench/Makefile: -------------------------------------------------------------------------------- 1 | ROCM_PATH ?= /opt/rocm 2 | CUDA_PATH ?= /usr/local/cuda 3 | HIPCC = $(ROCM_PATH)/bin/hipcc 4 | NVCC = $(CUDA_PATH)/bin/nvcc 5 | 6 | CCFLAGS = -O3 -lhsa-runtime64 -fopenmp -lnuma 7 | NVFLAGS = -O3 -x cu -lnuma -Xcompiler -fopenmp -gencode=arch=compute_90,code=sm_90 8 | 9 | ifneq ("$(MPI_DIR)", "") 10 | MPIFLAGS = -DMPI_SUPPORT -I$(MPI_DIR)/include -L$(MPI_DIR)/lib -lmpi 11 | else 12 
| MPIFLAGS = 13 | endif 14 | 15 | all: JitterBench 16 | 17 | JitterBench: JitterBench.cpp Common.hpp Timeline.hpp 18 | ifeq ("$(shell test -e $(NVCC) && echo found)", "found") 19 | $(NVCC) $(NVFLAGS) $(MPIFLAGS) $< -o $@ 20 | else 21 | $(HIPCC) $(CCFLAGS) $(MPIFLAGS) $< -o $@ 22 | endif 23 | 24 | clean: 25 | rm -f ./JitterBench 26 | -------------------------------------------------------------------------------- /tools/JitterBench/Timeline.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | struct TimelineData 7 | { 8 | std::string rowLabel; 9 | std::string barLabel; 10 | std::string toolTip; 11 | uint64_t startTime; 12 | uint64_t stopTime; 13 | }; 14 | 15 | void ExportToTimeLine(std::string outputFilename, 16 | std::string rowLabelName, 17 | std::string barLabelName, 18 | std::vector const& timelineData) 19 | { 20 | FILE *fp = fopen(outputFilename.c_str(), "w"); 21 | 22 | fprintf(fp, "\n"); 23 | fprintf(fp, "\n"); 49 | fprintf(fp, "
\n"); 50 | fclose(fp); 51 | } 52 | -------------------------------------------------------------------------------- /tools/JitterBench/runSweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for numBlocks in 1 2 4 8 16 32; do 4 | for blockSize in 64 128 256; do 5 | for numTimers in 0 1; do 6 | for useNuma in 0 1; do 7 | echo "numBlocks=$numBlocks blockSize=$blockSize numTimers=$numTimers useNuma=$useNuma"; 8 | ./LaunchBench $numBlocks $blockSize $numTimers $useNuma &> output.$numBlocks.$blockSize.$numTimers.$useNuma.txt 9 | done; 10 | done; 11 | done; 12 | done; 13 | -------------------------------------------------------------------------------- /tools/RcclReplayer/Makefile: -------------------------------------------------------------------------------- 1 | ROCM_DIR ?= /opt/rocm 2 | RCCL_DIR ?= ../../build/release 3 | MPI_DIR ?= /opt/ompi 4 | 5 | INCLUDES = -I$(MPI_DIR)/include -I$(RCCL_DIR)/include -I$(RCCL_DIR)/hipify/src/include 6 | LDFLAGS = -L$(MPI_DIR)/lib -L$(RCCL_DIR) -lmpi -lrccl 7 | 8 | main: rcclReplayer.cpp 9 | $(ROCM_DIR)/bin/hipcc rcclReplayer.cpp -O1 -g -o rcclReplayer $(INCLUDES) $(LDFLAGS) 10 | 11 | clean: 12 | rm -f ./rcclReplayer 13 | -------------------------------------------------------------------------------- /tools/TopoVisual/4_nodes.log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/rccl/e94b36024615b65ddcd372ac39d9e9f5f43f685f/tools/TopoVisual/4_nodes.log.png -------------------------------------------------------------------------------- /tools/TopoVisual/README.md: -------------------------------------------------------------------------------- 1 | # Topology Visualizer 2 | Topology Visualizer extracts topology information from RCCL log file and presents graphically. Less than optimal connections between GPUs and nodes are highlighted in red for easy identification. 
3 | 4 | ## Requirements 5 | The following packages are required to run Topology Visualizer: 6 | 1. gawk 7 | 2. graphviz 8 | 9 | ## Usage 10 | Topology Visualizer accepts either RCCL log files or simulator output from [Topology Explorer](https://github.com/ROCm/rccl/tree/master/tools/topo_expl "Topology Explorer"). 11 | 12 | RCCL logs need to be collected with the NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=INIT,GRAPH environment variables set. Example command line: 13 | ```shell 14 | mpirun -np 4 -host rocm-framework-1,rocm-framework-3,rocm-framework-5,rocm-framework-6 \ 15 | -env HSA_FORCE_FINE_GRAIN_PCIE 1 -env NCCL_DEBUG INFO -env NCCL_DEBUG_SUBSYS INIT,GRAPH \ 16 | ~/rccl-tests/build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 | tee ~/4_nodes.log 17 | 18 | ./topo_visual.sh -i 4_nodes.log 19 | ``` 20 | 21 | ## Legend 22 | 23 | Solid lines: connections over P2P or shared memory 24 | 25 | Dashed lines: connections over network 26 | 27 | Green: P2P connections, network connections with GPU RDMA 28 | 29 | Red: Connections over shared memory or without GPU RDMA 30 | 31 | ## Example Output 32 | ![image info](./4_nodes.log.png) 33 | 34 | ## Copyright 35 | All source code and accompanying documentation are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 36 | -------------------------------------------------------------------------------- /tools/TopoVisual/topo_visual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 23 | 24 | exit_error() { 25 | echo "Usage: $0 [ -i input_filename ]" 26 | exit 1 27 | } 28 | 29 | while getopts ":i:o:" options; do 30 | case "${options}" in 31 | i) 32 | INPUT_NAME=${OPTARG} 33 | ;; 34 | :) 35 | echo "Error: -${OPTARG} requires an argument." 36 | exit_error 37 | ;; 38 | ?) 
39 | exit_error 40 | ;; 41 | esac 42 | done 43 | 44 | if [ -z "$INPUT_NAME" ] 45 | then 46 | exit_error 47 | else 48 | "$DIR/extract_topo.awk" "$INPUT_NAME" | dot -Tpng -o "$INPUT_NAME.png" 49 | echo "Extracted topology from $INPUT_NAME to $INPUT_NAME.png" 50 | fi 51 | 52 | exit 0 53 | -------------------------------------------------------------------------------- /tools/TransferBench/README.md: -------------------------------------------------------------------------------- 1 | # TransferBench 2 | 3 | TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs). 4 | TransferBench can now be found at: https://github.com/ROCm/TransferBench 5 | 6 | ## Copyright 7 | 8 | All source code and accompanying documentation is copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 9 | -------------------------------------------------------------------------------- /tools/ib-test/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 2 | HIP_PATH ?= $(wildcard /opt/rocm) 3 | ifeq (,$(HIP_PATH)) 4 | HIP_PATH = ../../.. 5 | endif 6 | HIPCC = $(HIP_PATH)/bin/hipcc 7 | 8 | EXE = ib_test 9 | CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/clique -DENABLE_TRACE -DRCCL_IB_TEST -ldl -lnuma 10 | 11 | files = $(EXE).cpp utils.cpp ../../src/transport/net_ib.cc ../../src/misc/ibvwrap.cc ../../src/debug.cc 12 | 13 | all: $(EXE) 14 | 15 | $(EXE): $(files) 16 | $(HIPCC) $(CXXFLAGS) $^ -o $@ 17 | #scp $(EXE) rocm-framework-3:$(shell pwd) 18 | 19 | clean: 20 | rm -f *.o $(EXE) 21 | -------------------------------------------------------------------------------- /tools/p2p-latency-test/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | HIP_PATH ?= $(wildcard /opt/rocm) 4 | ifeq (,$(HIP_PATH)) 5 | HIP_PATH = ../../.. 6 | endif 7 | HIPCC = $(HIP_PATH)/bin/hipcc 8 | 9 | all: p2p_latency_test ll_latency_test 10 | 11 | CXXFLAGS = -g -O3 12 | p2p_latency_test: p2p_latency_test.cpp 13 | $(HIPCC) $(CXXFLAGS) $^ -o $@ 14 | ll_latency_test: ll_latency_test.cpp 15 | $(HIPCC) $(CXXFLAGS) $^ -o $@ 16 | 17 | clean: 18 | rm -f *.o p2p_latency_test ll_latency_test 19 | -------------------------------------------------------------------------------- /tools/p2p-latency-test/build_and_run.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | make 5 | 6 | # Example run: test one-way latency between GPU 0 and GPU 1 in both directions. 7 | export HSA_FORCE_FINE_GRAIN_PCIE=1 8 | 9 | echo Running p2p_latency_test using GPU pair 0 1 10 | ./p2p_latency_test 0 1 11 | 12 | sleep 1 13 | 14 | echo Running p2p_latency_test using GPU pair 1 0 15 | ./p2p_latency_test 1 0 16 | 17 | sleep 1 18 | 19 | echo Running ll_latency_test using GPU pair 0 1 20 | ./ll_latency_test 0 1 21 | 22 | sleep 1 23 | 24 | echo Running ll_latency_test using GPU pair 1 0 25 | ./ll_latency_test 1 0 26 | -------------------------------------------------------------------------------- /tools/rccl-prim-test/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 2 | HIP_PATH?= $(wildcard /opt/rocm) 3 | ifeq (,$(HIP_PATH)) 4 | HIP_PATH=../../.. 
5 | endif 6 | HIPCC=$(HIP_PATH)/bin/hipcc 7 | 8 | EXE=rccl_prim_test 9 | CXXFLAGS = -O3 -g -I/opt/rocm/rocrand/include 10 | 11 | all: $(EXE) 12 | 13 | $(EXE): rccl_prim_test.cpp 14 | $(HIPCC) $(CXXFLAGS) $^ -o $@ 15 | 16 | clean: 17 | rm -f *.o $(EXE) 18 | -------------------------------------------------------------------------------- /tools/scripts/pytorch-all-reduce/README.md: -------------------------------------------------------------------------------- 1 | Small benchmark utility for gpt-fast's all-reduce. 2 | 3 | ### How to run 4 | Out-of-the-box run (this tries various sequence lengths and prints perf results to the terminal): 5 | ``` 6 | torchrun --nproc_per_node=8 all_reduce.py 7 | ``` 8 | 9 | To enable intra-node all-reduce algorithms, use: 10 | ``` 11 | ENABLE_INTRA_NODE_COMM=1 torchrun --nproc_per_node=8 all_reduce.py 12 | ``` 13 | 14 | ### Rocprof trace script 15 | To create Perfetto traces for each rank of each all-reduce, a bash script is provided. 16 | ``` 17 | ENABLE_INTRA_NODE_COMM=1 bash trace_runs.sh 18 | ``` 19 | 20 | ### Additional options: 21 | The tensor size depends on the sequence_length and dim supplied in gpt-fast. 
There are 4 different all-reduce calls in gpt-fast at runtime: 22 | - 1: [seq_len, dim] 23 | - 2: [seq_len, 2, dim] 24 | - 3: [1, dim] 25 | - 4: [1, 2, dim] 26 | ``` 27 | --sequence_lengths (defaults to [50, 64, 128, 256, 512, 1024, 2048, 4096]) 28 | --dim (defaults to 6144) 29 | --all_reduce (defaults to [0,1,2,3]) - Can be modified to only run a single all-reduce, mapping to the 4 all reduces listed above 30 | --tracing - Enables tracing mode to skip CPU timers in recording 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /tools/scripts/pytorch-all-reduce/trace_runs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEQUENCE_LENGTHS=(50 128 256 512 1024 2048 4096) 4 | ALL_REDUCE_ALGOS=(1 2 3 4) 5 | 6 | export HIP_DEV_FORCE_KERNARG=1 7 | 8 | for SEQ_LEN in "${SEQUENCE_LENGTHS[@]}"; do 9 | for ALGO in "${ALL_REDUCE_ALGOS[@]}"; do 10 | echo "Running sequence length $SEQ_LEN with intra-node all_reduce $ALGO" 11 | ENABLE_INTRA_NODE_COMM=1 rocprofv3 --hip-trace --kernel-trace --stats --output-format PFTRACE -d rocprof_trace/intranode_input"$SEQ_LEN"_allreduce"$ALGO"/ -- torchrun --nproc_per_node=8 all_reduce.py --sequence_lengths $SEQ_LEN --all_reduce $ALGO --tracing 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /tools/scripts/topo_val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 
21 | 22 | DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 23 | 24 | for i in {0..88} 25 | do 26 | if [[ $i -eq 50 ]] || [[ $i -eq 51 ]] 27 | then 28 | NCCL_COLLNET_ENABLE=1 $DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log" 29 | elif [[ $i -eq 54 ]] 30 | then 31 | RCCL_ENABLE_MULTIPLE_SAT=1 NCCL_COLLNET_ENABLE=1 $DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log" 32 | else 33 | $DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log" 34 | fi 35 | $DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log" 36 | done 37 | -------------------------------------------------------------------------------- /tools/time-trace/rccl-TimeTrace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Directory path to search for JSON files 4 | directory="../../build/release" 5 | 6 | if command -v pip &>/dev/null; then 7 | echo "pip is already installed." 8 | else 9 | echo "pip is not installed. Installing..." 10 | sudo apt-get update 11 | sudo apt install python3-pip 12 | fi 13 | 14 | required_library='pandas' 15 | 16 | # Check if pandas is installed 17 | if python3 -c "import $required_library" &> /dev/null; then 18 | echo "$required_library is already installed." 19 | else 20 | echo "$required_library is not installed. Installing..." 21 | pip3 install $required_library 22 | fi 23 | 24 | required_library='plotly' 25 | 26 | # Check if the library is installed 27 | if python3 -c "import $required_library" &> /dev/null; then 28 | echo "$required_library is already installed." 29 | else 30 | echo "$required_library is not installed. Installing..." 31 | pip3 install $required_library 32 | fi 33 | 34 | # Check if the file exists 35 | if [ ! -f "$directory/.ninja_log" ]; then 36 | echo "File '$directory/.ninja_log' does not exist." 
37 | exit 1 38 | fi 39 | 40 | declare -A unique_values 41 | 42 | # Use awk to compare and delete duplicates 43 | awk '!unique_values[$5]++' "$directory/.ninja_log" > temp_file.txt 44 | mv temp_file.txt "$directory/.ninja_log" 45 | 46 | # Rename the file with .csv extension 47 | mv "$directory/.ninja_log" "$directory/time_trace.log" 48 | 49 | # Run the python program 50 | python3 time_trace_generator.py --min_val 5 --include_linking -------------------------------------------------------------------------------- /tools/topo_expl/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. 2 | HIP_PATH ?= $(wildcard /opt/rocm) 3 | ifeq (,$(HIP_PATH)) 4 | HIP_PATH = ../../.. 5 | endif 6 | HIPCC = $(HIP_PATH)/bin/hipcc 7 | 8 | EXE = topo_expl 9 | CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread 10 | 11 | files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \ 12 | hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \ 13 | hipify_rccl/collectives.cc hipify_rccl/register.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc 14 | 15 | all: hipify $(EXE) 16 | 17 | $(EXE): $(files) 18 | $(HIPCC) $(CXXFLAGS) $^ -o $@ 19 | 20 | hipify: 21 | rm -rf hipify_rccl 22 | mkdir -p hipify_rccl/device/include hipify_rccl/include/network/unpack 23 | cp -a ../../src/include/ hipify_rccl/ 24 | cp -a ../../src/graph/ hipify_rccl/ 25 | cp -a ../../src/device/*.h hipify_rccl/device/include 26 | cp -a 
../../src/device/network/unpack/*.h hipify_rccl/include/network/unpack 27 | cp -a ../../src/enqueue.cc hipify_rccl/ 28 | cp -a ../../src/register/register.cc hipify_rccl/ 29 | cp -a ../../src/collectives.cc hipify_rccl/ 30 | cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/ 31 | hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h 32 | hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h 33 | sed -i "s/template/template/g" "hipify_rccl/device/include/common.h" 34 | sed -i "s/\\(struct RunWorkBatch]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h" 35 | hipify-perl -inplace -quiet-warnings hipify_rccl/graph/* 36 | hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/* 37 | hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc 38 | 39 | clean: 40 | rm -rf hipify_rccl 41 | rm -f *.o $(EXE) -------------------------------------------------------------------------------- /tools/topo_expl/include/device_table.h: -------------------------------------------------------------------------------- 1 | #ifndef DEVICE_TABLE_COMPATIBILITY 2 | #define DEVICE_TABLE_COMPATIBILITY 3 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {} 4 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {} 5 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {} 6 | __forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short funcIndex) noexcept {} 7 | #endif 8 | -------------------------------------------------------------------------------- /tools/topo_expl/include/utils.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. 
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef UTILS_H_ 9 | #define UTILS_H_ 10 | 11 | struct graphInfo { 12 | int pattern; 13 | int nChannels; 14 | int sameChannels; 15 | float bwIntra; 16 | float bwInter; 17 | int typeIntra; 18 | int typeInter; 19 | }; 20 | 21 | struct allGatherInfo { 22 | struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; 23 | struct ncclTopoRanks topoRanks; 24 | int nc; 25 | bool pivotA2AEnabled; 26 | bool ll128Enabled; 27 | bool mscclEnabled; 28 | }; 29 | 30 | void initCollNet(); 31 | 32 | ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system); 33 | 34 | ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); 35 | 36 | ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash); 37 | 38 | ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *allGather3Data, 39 | struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph, struct ncclComm* parent = NULL); 40 | 41 | ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *allGather3Data, 42 | struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph); 43 | 44 | #define TIME_START(index) 45 | 46 | #define TIME_STOP(index) 47 | 48 | #define TIME_CANCEL(index) 49 | 50 | #define TIME_PRINT(name) 51 | 52 | #endif -------------------------------------------------------------------------------- /tools/topo_expl/models/topo_3p_pcie.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 
-------------------------------------------------------------------------------- /tools/topo_expl/models/topo_3p_pcie_1.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /tools/topo_expl/models/topo_4p1h.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /tools/topo_expl/models/topo_4p1h_1.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | --------------------------------------------------------------------------------