├── .azure-pipelines ├── integration-test-rocm.yml ├── integration-test.yml ├── multi-nodes-test.yml ├── nccl-api-test.yaml ├── templates │ ├── integration-test.yaml │ ├── nccl-test.yaml │ ├── ut-npkit.yaml │ └── ut.yaml └── ut.yml ├── .clang-format ├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-improvement.md │ ├── feature_request.md │ └── perf_improvement.md └── workflows │ ├── codeql-analysis.yml │ ├── doc-build.yaml │ ├── gh-pages.yml │ ├── integration-test-backup.yml │ ├── lint.yml │ ├── mscclpp-lang.yml │ ├── update-version.yml │ └── ut-backup.yml ├── .gitignore ├── .readthedocs.yaml ├── CITATION.cff ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── VERSION ├── apps └── nccl │ ├── CMakeLists.txt │ ├── include │ └── nccl.h │ ├── src │ ├── allgather.hpp │ ├── allreduce.hpp │ ├── broadcast.hpp │ ├── common.hpp │ └── nccl.cu │ └── test │ ├── CMakeLists.txt │ └── nccl_api_test.cc ├── cmake ├── AddFormatTargets.cmake ├── CheckAmdGpu.cmake ├── CheckNvidiaGpu.cmake ├── FindGDRCopy.cmake ├── FindIBVerbs.cmake ├── FindNUMA.cmake ├── check_amd_gpu.hip └── check_nvidia_gpu.cu ├── docker ├── base-dev-x.dockerfile ├── base-x-rocm.dockerfile ├── base-x.dockerfile └── build.sh ├── docs ├── .gitignore ├── Doxyfile ├── Makefile ├── README.md ├── api │ └── index.rst ├── conf.py ├── design │ ├── design.md │ ├── mscclpp-dsl.md │ └── nccl-over-mscclpp.md ├── figs │ ├── abstractions.png │ ├── mscclpp_vs_nccl_comparison_num_nodes_1.jpeg │ ├── mscclpp_vs_nccl_comparison_num_nodes_2.jpeg │ └── size_boundary_diagram.png ├── getting-started │ ├── quickstart.md │ └── tutorials │ │ ├── customized-proxy-service.md │ │ ├── index.rst │ │ ├── initialization.md │ │ ├── memory-channel.md │ │ ├── packet-api.md │ │ ├── port-channel.md │ │ └── python-api.md ├── index.rst ├── make.bat ├── performance │ └── performance-ndmv4.md └── requirements.txt ├── include ├── CMakeLists.txt └── mscclpp │ ├── assert_device.hpp │ ├── atomic_device.hpp │ ├── concurrency_device.hpp │ ├── copy_device.hpp │ ├── core.hpp │ ├── device.hpp │ ├── env.hpp │ ├── errors.hpp │ ├── executor.hpp │ ├── fifo.hpp │ ├── fifo_device.hpp │ ├── gpu.hpp │ ├── gpu_data_types.hpp │ ├── gpu_utils.hpp │ ├── memory_channel.hpp │ ├── memory_channel_device.hpp │ ├── npkit │ ├── npkit.hpp │ ├── npkit_event.hpp │ └── npkit_struct.hpp │ ├── numa.hpp │ ├── nvls.hpp │ ├── nvls_device.hpp │ ├── packet_device.hpp │ ├── poll_device.hpp │ ├── port_channel.hpp │ ├── port_channel_device.hpp │ ├── proxy.hpp │ ├── semaphore.hpp │ ├── semaphore_device.hpp │ └── utils.hpp ├── pyproject.toml ├── python ├── CMakeLists.txt ├── examples │ ├── allgather_allpairs_multinodes_packets.py │ ├── allgather_barrier.py │ ├── allreduce_allpairs.py │ ├── allreduce_allpairs_get.py │ ├── allreduce_allpairs_packet.py │ ├── allreduce_nvls.py │ ├── allreduce_ring.py │ ├── send_recv_packet.py │ └── send_recv_proxy.py ├── mscclpp │ ├── CMakeLists.txt │ ├── __init__.py │ ├── comm.py │ ├── core_py.cpp │ ├── env_py.cpp │ ├── error_py.cpp │ ├── executor_py.cpp │ ├── fifo_py.cpp │ ├── gpu_utils_py.cpp │ ├── language │ │ ├── __init__.py │ │ ├── buffer.py │ │ ├── chunk.py │ │ ├── collective_checker.py │ │ ├── collectives.py │ │ ├── dag │ │ │ ├── __init__.py │ │ │ ├── instruction_dag.py │ │ │ ├── lower.py │ │ │ └── optimizer.py │ │ ├── ir.py │ │ ├── program.py │ │ ├── rank.py │ │ ├── topo_sort.py │ │ ├── types.py │ │ └── utils.py │ ├── memory_channel_py.cpp │ ├── npkit_py.cpp │ ├── 
numa_py.cpp │ ├── nvls_py.cpp │ ├── port_channel_py.cpp │ ├── semaphore_py.cpp │ ├── utils.py │ └── utils_py.cpp ├── mscclpp_benchmark │ ├── __init__.py │ ├── allreduce.cu │ ├── allreduce_bench.py │ ├── mscclpp_op.py │ └── nccl_op.py ├── requirements_cuda11.txt ├── requirements_cuda12.txt ├── requirements_rocm6.txt └── test │ ├── CMakeLists.txt │ ├── __init__.py │ ├── _cpp │ ├── __init__.py │ └── proxy_test.cpp │ ├── configs │ └── mscclpp_lang_test_config.json │ ├── d2d_semaphore_test.cu │ ├── executor_test.py │ ├── executor_test_verifier.cu │ ├── fifo_test.cu │ ├── h2d_semaphore_test.cu │ ├── memory_channel_test.cu │ ├── mscclpp_mpi.py │ ├── nvls_test.cu │ ├── port_channel_test.cu │ ├── proxy_test.cu │ ├── test_generate_mscclpp_lang_result.py │ └── test_mscclpp.py ├── src ├── .gitignore ├── CMakeLists.txt ├── bootstrap │ ├── bootstrap.cc │ └── socket.cc ├── c_style_remnants.cc ├── communicator.cc ├── connection.cc ├── context.cc ├── core.cc ├── debug.cc ├── endpoint.cc ├── env.cpp ├── errors.cc ├── executor │ ├── execution_kernel.cu │ ├── execution_plan.cc │ └── executor.cc ├── fifo.cc ├── gpu_utils.cc ├── ib.cc ├── include │ ├── api.h │ ├── atomic.hpp │ ├── communicator.hpp │ ├── connection.hpp │ ├── context.hpp │ ├── debug.h │ ├── endpoint.hpp │ ├── execution_common.hpp │ ├── execution_kernel.hpp │ ├── execution_plan.hpp │ ├── ib.hpp │ ├── ibverbs_wrapper.hpp │ ├── registered_memory.hpp │ ├── socket.h │ └── utils_internal.hpp ├── memory_channel.cc ├── npkit │ └── npkit.cc ├── numa.cc ├── nvls.cc ├── port_channel.cc ├── proxy.cc ├── registered_memory.cc ├── semaphore.cc ├── utils.cc └── utils_internal.cc ├── test ├── CMakeLists.txt ├── allgather_test_cpp.cu ├── allgather_test_host_offloading.cu ├── deploy │ ├── config │ ├── deploy.sh │ ├── hostfile │ ├── hostfile_ci │ ├── hostfile_mpi │ ├── perf_ndmv4.jsonl │ ├── perf_ndmv5.jsonl │ ├── pytest.sh │ ├── run_tests.sh │ └── setup.sh ├── execution-files │ ├── allreduce.json │ ├── allreduce_nvls.json │ ├── allreduce_packet.json │ ├── sendrecv.json │ └── sendrecv_packet.json ├── executor_test.cc ├── mp_unit │ ├── CMakeLists.txt │ ├── bootstrap_tests.cc │ ├── communicator_tests.cu │ ├── executor_tests.cc │ ├── ib_tests.cu │ ├── memory_channel_tests.cu │ ├── mp_unit_tests.cc │ ├── mp_unit_tests.hpp │ └── port_channel_tests.cu ├── mscclpp-test │ ├── CMakeLists.txt │ ├── allgather_test.cu │ ├── allreduce_test.cu │ ├── alltoall_test.cu │ ├── check_perf_result.py │ ├── common.cc │ ├── common.hpp │ └── sendrecv_test.cu ├── nvls_test.cu ├── run_mpi_test.sh.in └── unit │ ├── CMakeLists.txt │ ├── compile_tests.cu │ ├── core_tests.cc │ ├── cuda_utils_tests.cc │ ├── errors_tests.cc │ ├── fifo_tests.cu │ ├── numa_tests.cc │ ├── socket_tests.cc │ ├── utils_internal_tests.cc │ └── utils_tests.cc └── tools └── npkit ├── build_and_run_npkit.sh └── npkit_trace_generator.py /.azure-pipelines/integration-test.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | - release/* 4 | 5 | pr: 6 | branches: 7 | include: 8 | - main 9 | - release/* 10 | drafts: false 11 | 12 | jobs: 13 | - job: IntegrationTestA100 14 | displayName: Integration test A100 15 | strategy: 16 | matrix: 17 | cuda11: 18 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 19 | cuda12: 20 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 21 | 22 | pool: 23 | name: msccl-ci 24 | container: 25 | image: $(containerImage) 26 | 27 | steps: 28 | - template: templates/integration-test.yaml 29 | parameters: 30 
| subscription: mscclpp-ci 31 | vmssName: mscclpp-ci 32 | sshKeySecureFile: mscclpp.pem 33 | 34 | - job: IntegrationTestH100 35 | displayName: Integration test H100 36 | strategy: 37 | matrix: 38 | cuda12: 39 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 40 | 41 | pool: 42 | name: msccl-ci-h100 43 | container: 44 | image: $(containerImage) 45 | 46 | steps: 47 | - template: templates/integration-test.yaml 48 | parameters: 49 | subscription: mscclpp-ci-h100 50 | vmssName: mscclpp-h100-ci 51 | sshKeySecureFile: mscclpp.pem 52 | perfBaselineFile: test/deploy/perf_ndmv5.jsonl 53 | -------------------------------------------------------------------------------- /.azure-pipelines/nccl-api-test.yaml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | - release/* 4 | 5 | pr: 6 | branches: 7 | include: 8 | - main 9 | - release/* 10 | drafts: false 11 | 12 | jobs: 13 | - job: NcclTestA100 14 | displayName: Run MSCCLPP over NCCL Test (A100) 15 | pool: 16 | name: msccl-ci 17 | 18 | strategy: 19 | matrix: 20 | cuda11: 21 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 22 | cuda12: 23 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 24 | 25 | container: 26 | image: $(containerImage) 27 | 28 | steps: 29 | - template: templates/nccl-test.yaml 30 | parameters: 31 | subscription: mscclpp-ci 32 | vmssName: mscclpp-ci 33 | sshKeySecureFile: mscclpp.pem 34 | nvccGencode: "-gencode=arch=compute_80,code=sm_80" 35 | 36 | - job: NcclTestH100 37 | displayName: Run MSCCLPP over NCCL Test (H100) 38 | pool: 39 | name: msccl-ci-h100 40 | 41 | strategy: 42 | matrix: 43 | cuda12: 44 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 45 | 46 | container: 47 | image: $(containerImage) 48 | 49 | steps: 50 | - template: templates/nccl-test.yaml 51 | parameters: 52 | subscription: mscclpp-ci-h100 53 | vmssName: mscclpp-h100-ci 54 | sshKeySecureFile: mscclpp.pem 55 | nvccGencode: "-gencode=arch=compute_90,code=sm_90" -------------------------------------------------------------------------------- /.azure-pipelines/ut.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | - release/* 4 | 5 | pr: 6 | branches: 7 | include: 8 | - main 9 | - release/* 10 | drafts: false 11 | 12 | jobs: 13 | - job: UnitTestA100 14 | timeoutInMinutes: 40 15 | pool: 16 | name: msccl-ci 17 | strategy: 18 | matrix: 19 | cuda11: 20 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 21 | cuda12: 22 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 23 | 24 | container: 25 | image: $(containerImage) 26 | 27 | steps: 28 | - template: templates/ut.yaml 29 | parameters: 30 | subscription: mscclpp-ci 31 | vmssName: mscclpp-ci 32 | sshKeySecureFile: mscclpp.pem 33 | 34 | - job: UnitTestWithNpKitA100 35 | timeoutInMinutes: 30 36 | pool: 37 | name: msccl-ci 38 | strategy: 39 | matrix: 40 | cuda11: 41 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 42 | cuda12: 43 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 44 | 45 | container: 46 | image: $(containerImage) 47 | 48 | steps: 49 | - template: templates/ut-npkit.yaml 50 | parameters: 51 | subscription: mscclpp-ci 52 | vmssName: mscclpp-ci 53 | sshKeySecureFile: mscclpp.pem 54 | 55 | - job: UnitTestH100 56 | timeoutInMinutes: 40 57 | pool: 58 | name: msccl-ci-h100 59 | strategy: 60 | matrix: 61 | cuda12: 62 | containerImage: 
ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 63 | 64 | container: 65 | image: $(containerImage) 66 | 67 | steps: 68 | - template: templates/ut.yaml 69 | parameters: 70 | subscription: mscclpp-ci-h100 71 | vmssName: mscclpp-h100-ci 72 | sshKeySecureFile: mscclpp.pem 73 | 74 | - job: UnitTestWithNpKitH100 75 | timeoutInMinutes: 30 76 | pool: 77 | name: msccl-ci-h100 78 | strategy: 79 | matrix: 80 | cuda12: 81 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 82 | 83 | container: 84 | image: $(containerImage) 85 | 86 | steps: 87 | - template: templates/ut-npkit.yaml 88 | parameters: 89 | subscription: mscclpp-ci-h100 90 | vmssName: mscclpp-h100-ci 91 | sshKeySecureFile: mscclpp.pem 92 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 120 3 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | ARG USERNAME=mscclpp 4 | ARG USER_UID=1000 5 | ARG USER_GID=$USER_UID 6 | 7 | # Create the user 8 | RUN groupadd --gid $USER_GID $USERNAME && \ 9 | useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \ 10 | echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME && \ 11 | chmod 0440 /etc/sudoers.d/$USERNAME 12 | 13 | USER $USERNAME 14 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MSCCL++ Dev Container", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | "args": { 6 | "BASE_IMAGE": "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8" 7 | } 8 | }, 9 | "remoteUser": "mscclpp", 10 | "customizations": { 11 | "vscode": { 12 | "extensions": [ 13 | // Python 14 | "ms-python.python", 15 | "ms-python.vscode-pylance", 16 | // C++ 17 | "ms-vscode.cpptools", 18 | "ms-vscode.cpptools-extension-pack", 19 | "ms-vscode.cmake-tools" 20 | ] 21 | } 22 | }, 23 | "privileged": true, 24 | "runArgs": [ 25 | "--net=host", 26 | "--ipc=host", 27 | "--gpus=all", 28 | "--ulimit=memlock=-1:-1" 29 | ], 30 | "workspaceFolder": "/home/mscclpp/mscclpp", 31 | "workspaceMount": "source=${localWorkspaceFolder},target=/home/mscclpp/mscclpp,type=bind,consistency=cached" 32 | } 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us fix 4 | title: "[Bug]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation improvement 3 | about: Enhance or fix documentation 4 | title: "[Doc]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 
title: "[Feature]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/perf_improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Performance improvement 3 | about: Discuss on performance issues 4 | title: "[Perf]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - release/* 8 | pull_request: 9 | branches: 10 | - main 11 | - release/* 12 | schedule: 13 | - cron: "30 1 * * 1" 14 | 15 | jobs: 16 | analyze-cuda: 17 | name: Analyze (CUDA) 18 | runs-on: 'ubuntu-latest' 19 | container: 20 | image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} 21 | 22 | permissions: 23 | actions: read 24 | contents: read 25 | security-events: write 26 | 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | language: [ 'cpp', 'python' ] 31 | version: [ 'cuda11.8', 'cuda12.8' ] 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v4 36 | 37 | - name: Check disk space 38 | run: | 39 | df -h 40 | 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v2 43 | with: 44 | languages: ${{ matrix.language }} 45 | 46 | - name: Dubious ownership exception 47 | run: | 48 | git config --global --add safe.directory /__w/mscclpp/mscclpp 49 | 50 | - name: Build 51 | run: | 52 | rm -rf build && mkdir build && cd build 53 | cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. 54 | make -j 55 | 56 | - name: Perform CodeQL Analysis 57 | uses: github/codeql-action/analyze@v2 58 | with: 59 | category: "/language:${{matrix.language}}/version:${{matrix.version}}" 60 | 61 | analyze-rocm: 62 | name: Analyze (ROCm) 63 | runs-on: 'ubuntu-latest' 64 | container: 65 | image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} 66 | 67 | permissions: 68 | actions: read 69 | contents: read 70 | security-events: write 71 | 72 | strategy: 73 | fail-fast: false 74 | matrix: 75 | language: [ 'cpp', 'python' ] 76 | version: [ 'rocm6.2' ] 77 | 78 | steps: 79 | - name: Checkout repository 80 | uses: actions/checkout@v4 81 | 82 | - name: Check disk space 83 | run: | 84 | df -h 85 | 86 | - name: Initialize CodeQL 87 | uses: github/codeql-action/init@v2 88 | with: 89 | languages: ${{ matrix.language }} 90 | 91 | - name: Dubious ownership exception 92 | run: | 93 | git config --global --add safe.directory /__w/mscclpp/mscclpp 94 | 95 | - name: Build 96 | run: | 97 | rm -rf build && mkdir build && cd build 98 | CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. 
99 | make -j 100 | 101 | - name: Perform CodeQL Analysis 102 | uses: github/codeql-action/analyze@v2 103 | with: 104 | category: "/language:${{matrix.language}}/version:${{matrix.version}}" 105 | -------------------------------------------------------------------------------- /.github/workflows/doc-build.yaml: -------------------------------------------------------------------------------- 1 | name: Docs Build 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | 18 | - name: Setup Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Install dependencies 24 | run: | 25 | sudo apt-get update 26 | sudo apt-get install -y doxygen graphviz 27 | pip install -r docs/requirements.txt 28 | 29 | - name: Build docs 30 | run: | 31 | cd docs 32 | doxygen 33 | make html 34 | touch _build/html/.nojekyll 35 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 12 | permissions: 13 | contents: read 14 | pages: write 15 | id-token: write 16 | 17 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 18 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: false 22 | 23 | jobs: 24 | build: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | - name: Setup python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: '3.10' 33 | - name: Install dependencies 34 | run: | 35 | sudo apt-get update 36 | sudo apt-get install -y doxygen graphviz 37 | pip install -r docs/requirements.txt 38 | - name: Build docs 39 | run: | 40 | cd docs 41 | doxygen 42 | make html 43 | touch _build/html/.nojekyll 44 | - name: Upload artifacts 45 | uses: actions/upload-pages-artifact@v3 46 | with: 47 | path: docs/_build/html 48 | 49 | deploy: 50 | environment: 51 | name: github-pages 52 | url: ${{ steps.deployment.outputs.page_url }} 53 | runs-on: ubuntu-latest 54 | needs: build 55 | steps: 56 | - name: Deploy to GitHub Pages 57 | id: deployment 58 | uses: actions/deploy-pages@v4 59 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | cpplint: 10 | runs-on: ubuntu-22.04 11 | 12 | steps: 13 | - name: Check out Git repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Install ClangFormat 17 | run: | 18 | sudo apt-get update 19 | sudo apt-get install -y clang-format 20 | 21 | - name: Run cpplint 22 | run: | 23 | CPPSOURCES=$(find ./src ./include ./python ./test ./apps -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') 24 | clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES} 25 | 26 | pylint: 27 | runs-on: ubuntu-22.04 28 | 29 | steps: 30 | - name: Check out Git 
repository 31 | uses: actions/checkout@v4 32 | 33 | - name: Set up Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: 3 37 | 38 | - name: Install Python dependencies 39 | run: python3 -m pip install black 40 | 41 | - name: Run black 42 | run: python3 -m black --check --config pyproject.toml . 43 | 44 | spelling: 45 | runs-on: ubuntu-22.04 46 | 47 | steps: 48 | - name: Check out Git repository 49 | uses: actions/checkout@v4 50 | 51 | - name: Download misspell 52 | run: | 53 | curl -L https://github.com/client9/misspell/releases/download/v0.3.4/misspell_0.3.4_linux_64bit.tar.gz -o /tmp/misspell_0.3.4_linux_64bit.tar.gz 54 | tar -xzf /tmp/misspell_0.3.4_linux_64bit.tar.gz -C . 55 | 56 | - name: Check spelling 57 | run: | 58 | ./misspell -error . 59 | -------------------------------------------------------------------------------- /.github/workflows/mscclpp-lang.yml: -------------------------------------------------------------------------------- 1 | name: MSCCLPPLang 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - release/* 8 | 9 | jobs: 10 | compare-diffs: 11 | runs-on: 'ubuntu-latest' 12 | container: 13 | image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} 14 | 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | version: [ 'cuda11.8', 'cuda12.8' ] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Set environment variable 24 | run: echo "LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64" >> $GITHUB_ENV 25 | 26 | - name: Install mscclpp 27 | run: | 28 | CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install . 29 | 30 | - name: Copy test script/config to temp directory 31 | run: | 32 | cp python/test/test_generate_mscclpp_lang_result.py $RUNNER_TEMP/ 33 | cp python/test/configs/mscclpp_lang_test_config.json $RUNNER_TEMP/ 34 | - name: generate outputs 35 | run: | 36 | python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/pr-outputs/ 37 | - name: Checkout main branch 38 | uses: actions/checkout@v4 39 | if: github.event_name == 'pull_request' || github.event_name == 'push' 40 | with: 41 | ref: main 42 | - name: Install msccl and dependencies 43 | run: | 44 | CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install . 
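      # The main branch was checked out and reinstalled above; the step below
      # regenerates the baseline outputs, and the final step diffs them against
      # the PR outputs generated earlier.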
45 | - name: generate outputs 46 | run: | 47 | python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/main-outputs/ 48 | - name: Compare outputs 49 | run: | 50 | diff -rw $RUNNER_TEMP/tests/main-outputs/ $RUNNER_TEMP/tests/pr-outputs/ -------------------------------------------------------------------------------- /.github/workflows/update-version.yml: -------------------------------------------------------------------------------- 1 | name: Update Version 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | - release/** 7 | paths: 8 | - 'VERSION' 9 | 10 | permissions: 11 | contents: write 12 | 13 | jobs: 14 | update-version: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | with: 20 | ref: ${{ github.head_ref }} 21 | fetch-depth: 0 22 | 23 | - name: Read version 24 | id: read_version 25 | run: echo "VERSION=$(cat VERSION)" >> $GITHUB_ENV 26 | 27 | - name: Update Version in Files 28 | run: | 29 | VERSION=${{ env.VERSION }} 30 | sed -i "s/^version: .*/version: ${VERSION}/" CITATION.cff 31 | sed -i "s/^release = \".*\"/release = \"v${VERSION}\"/" docs/conf.py 32 | sed -i "s/^version = \".*\"/version = \"${VERSION}\"/" pyproject.toml 33 | 34 | IFS='.' read -ra VER <<< "$VERSION" 35 | MAJOR=${VER[0]} 36 | MINOR=${VER[1]} 37 | PATCH=${VER[2]} 38 | 39 | # Update CMakeLists.txt 40 | sed -i "s/set(MSCCLPP_MAJOR \".*\")/set(MSCCLPP_MAJOR \"${MAJOR}\")/" CMakeLists.txt 41 | sed -i "s/set(MSCCLPP_MINOR \".*\")/set(MSCCLPP_MINOR \"${MINOR}\")/" CMakeLists.txt 42 | sed -i "s/set(MSCCLPP_PATCH \".*\")/set(MSCCLPP_PATCH \"${PATCH}\")/" CMakeLists.txt 43 | 44 | # Update header files 45 | sed -i "s/#define MSCCLPP_MAJOR .*/#define MSCCLPP_MAJOR ${MAJOR}/" include/mscclpp/core.hpp 46 | sed -i "s/#define MSCCLPP_MINOR .*/#define MSCCLPP_MINOR ${MINOR}/" include/mscclpp/core.hpp 47 | sed -i "s/#define MSCCLPP_PATCH .*/#define MSCCLPP_PATCH ${PATCH}/" include/mscclpp/core.hpp 48 | 49 | - name: Commit and Push Changes 50 | run: | 51 | git config user.name "github-actions" 52 | git config user.email "github-actions@github.com" 53 | git add CITATION.cff docs/conf.py include/mscclpp/core.hpp pyproject.toml || true 54 | if git diff --cached --exit-code; then 55 | echo "No changes to commit." 56 | else 57 | git commit -m "Update version to ${{ env.VERSION }}" 58 | git push 59 | fi 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 62 | -------------------------------------------------------------------------------- /.github/workflows/ut-backup.yml: -------------------------------------------------------------------------------- 1 | name: UnitTest 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | UnitTest: 7 | runs-on: [ self-hosted, A100 ] 8 | defaults: 9 | run: 10 | shell: bash 11 | timeout-minutes: 30 12 | strategy: 13 | matrix: 14 | cuda: [ cuda11.8, cuda12.2 ] 15 | 16 | container: 17 | image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" 18 | options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 19 | 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Build 25 | run: | 26 | mkdir build && cd build 27 | cmake -DCMAKE_BUILD_TYPE=Release .. 
28 | make -j 29 | working-directory: ${{ github.workspace }} 30 | 31 | - name: LockGPUClock 32 | run: | 33 | sudo nvidia-smi -pm 1 34 | for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do 35 | sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i 36 | done 37 | 38 | - name: UnitTests 39 | run: | 40 | ./build/test/unit_tests 41 | 42 | - name: MpUnitTests 43 | run: | 44 | set -e 45 | mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests 46 | mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests 47 | mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests 48 | 49 | - name: PyTests 50 | run: | 51 | set -e 52 | mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .hypothesis/ 3 | build/ 4 | dist/ 5 | __pycache__ 6 | .*.swp 7 | .idea/ 8 | *.so 9 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | apt_packages: 11 | - doxygen 12 | tools: 13 | python: "3.12" 14 | jobs: 15 | pre_build: 16 | - cd docs && doxygen 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 22 | # builder: "dirhtml" 23 | # Fail on all warnings to avoid broken references 24 | # fail_on_warning: true 25 | 26 | # Optionally build your docs in additional formats such as PDF and ePub 27 | # formats: 28 | # - pdf 29 | # - epub 30 | 31 | # Optional but recommended, declare the Python requirements required 32 | # to build your documentation 33 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 34 | python: 35 | install: 36 | - requirements: docs/requirements.txt 37 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: >- 3 | MSCCL++: Rethinking GPU Communication Abstractions for 4 | Cutting-edge AI Applications 5 | message: >- 6 | If you use this software, please cite it using the 7 | metadata from this file. 
8 | type: software 9 | authors: 10 | - given-names: Aashaka 11 | family-names: Shah 12 | affiliation: Microsoft Research 13 | - given-names: Abhinav 14 | family-names: Jangda 15 | affiliation: Microsoft Research 16 | - given-names: Binyang 17 | family-names: Li 18 | affiliation: Microsoft Azure 19 | - given-names: Caio 20 | family-names: Rocha 21 | affiliation: Microsoft Azure 22 | - given-names: Changho 23 | family-names: Hwang 24 | affiliation: Microsoft Research 25 | - given-names: Jithin 26 | family-names: Jose 27 | affiliation: Microsoft Azure 28 | - given-names: Madan 29 | family-names: Musuvathi 30 | affiliation: Microsoft Research 31 | - given-names: Olli 32 | family-names: Saarikivi 33 | affiliation: Microsoft Research 34 | - given-names: Peng 35 | family-names: Cheng 36 | affiliation: Microsoft Research 37 | - given-names: Qinghua 38 | family-names: Zhou 39 | affiliation: Microsoft Azure 40 | - given-names: Roshan 41 | family-names: Dathathri 42 | affiliation: Microsoft Research 43 | - given-names: Saeed 44 | family-names: Maleki 45 | affiliation: Microsoft Research 46 | - given-names: Ziyue 47 | family-names: Yang 48 | affiliation: Microsoft Research 49 | identifiers: 50 | - type: other 51 | value: 'arxiv:2504.09014' 52 | repository-code: 'https://github.com/microsoft/mscclpp' 53 | url: 'https://microsoft.github.io/mscclpp/index.html' 54 | abstract: >- 55 | MSCCL++ redefines the interface for inter-GPU communication, thereby 56 | delivering a highly efficient and customizable communication stack 57 | tailored for distributed GPU applications. 58 | license: MIT 59 | license-url: https://github.com/microsoft/mscclpp/blob/main/LICENSE 60 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please file them as new Issues. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.6.0 2 | -------------------------------------------------------------------------------- /apps/nccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*) 5 | file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/nccl.h) 6 | 7 | if(MSCCLPP_USE_ROCM) 8 | set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CXX) 9 | endif() 10 | 11 | add_library(mscclpp_nccl_obj OBJECT) 12 | target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES}) 13 | target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) 14 | target_include_directories(mscclpp_nccl_obj PRIVATE include ${PROJECT_SOURCE_DIR}/src/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) 15 | target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} PUBLIC mscclpp_obj) 16 | set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) 17 | if(MSCCLPP_USE_CUDA) 18 | target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_CUDA) 19 | elseif(MSCCLPP_USE_ROCM) 20 | target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM) 21 | endif() 22 | if(MSCCLPP_NPKIT_FLAGS) 23 | target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS}) 24 | endif() 25 | add_library(mscclpp_nccl SHARED) 26 | target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj) 27 | set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) 28 | add_library(mscclpp_nccl_static STATIC) 29 | target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_obj mscclpp_nccl_obj) 30 | set_target_properties(mscclpp_nccl_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) 31 | 32 | install(TARGETS mscclpp_nccl_obj 33 | FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include) 34 | install(TARGETS mscclpp_nccl 35 | LIBRARY DESTINATION ${INSTALL_PREFIX}/lib) 36 | install(TARGETS mscclpp_nccl_static 37 | ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib) 38 | 39 | if(MSCCLPP_BUILD_TESTS) 40 | add_subdirectory(test) 41 | endif() 42 | -------------------------------------------------------------------------------- /apps/nccl/src/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
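// Shared tuning constants and device-side globals for the NCCL-over-MSCCL++
// shim; the constants below assume an 8-GPU node (NRANKS_PER_NODE == 8).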

#ifndef NCCL_COMMON_HPP_
#define NCCL_COMMON_HPP_

#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/semaphore_device.hpp>

#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64
#define __syncwarp() __builtin_amdgcn_wave_barrier()
#else
#define WARP_SIZE 32
#endif

constexpr int NUM_NVLS_CONNECTION = 8;
constexpr int NUM_SEMAPHORES = 64;

constexpr int NRANKS_PER_NODE = 8;
constexpr int NPEERS = 7;

constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
static bool mscclppDisableChannelCache = mscclpp::env()->disableChannelCache;

__device__ mscclpp::DeviceSyncer deviceSyncer;
__constant__ mscclpp::DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES];

#endif  // NCCL_COMMON_HPP_
-------------------------------------------------------------------------------- /apps/nccl/test/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

find_package(MPI)

add_executable(nccl_api_test nccl_api_test.cc)
target_link_libraries(nccl_api_test mscclpp mscclpp_nccl ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX)
if(IBVERBS_FOUND)
  target_link_libraries(nccl_api_test ${IBVERBS_LIBRARIES})
  target_compile_definitions(nccl_api_test PRIVATE USE_IBVERBS)
endif()
target_include_directories(nccl_api_test PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include)
-------------------------------------------------------------------------------- /cmake/AddFormatTargets.cmake: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
3 | 4 | # Add targets to run clang-format and black 5 | 6 | add_custom_target(check-format) 7 | add_custom_target(format) 8 | 9 | find_program(CLANG_FORMAT clang-format) 10 | if(CLANG_FORMAT) 11 | message(STATUS "Found clang-format: ${CLANG_FORMAT}") 12 | set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps/nccl/src) 13 | add_custom_target(check-format-cpp ALL 14 | COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` 15 | ) 16 | add_dependencies(check-format check-format-cpp) 17 | add_custom_target(format-cpp 18 | COMMAND ${CLANG_FORMAT} -style=file -i `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` 19 | ) 20 | add_dependencies(format format-cpp) 21 | else() 22 | message(STATUS "clang-format not found.") 23 | endif() 24 | 25 | find_program(BLACK black) 26 | if (BLACK) 27 | message(STATUS "Found black: ${BLACK}") 28 | add_custom_target(check-format-py 29 | COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR} 30 | ) 31 | add_dependencies(check-format check-format-py) 32 | add_custom_target(format-py 33 | COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} 34 | ) 35 | add_dependencies(format format-py) 36 | else() 37 | message(STATUS "black not found.") 38 | endif() 39 | -------------------------------------------------------------------------------- /cmake/CheckAmdGpu.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | set(AMD_FOUND "FALSE") 5 | 6 | set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}") 7 | # Temporal fix for rocm5.6 8 | set(ENV{amd_comgr_DIR} "/opt/rocm/lib/cmake/amd_comgr") 9 | set(ENV{AMDDeviceLibs_DIR} "/opt/rocm/lib/cmake/AMDDeviceLibs") 10 | 11 | find_package(hip QUIET) 12 | 13 | if(NOT hip_FOUND) 14 | return() 15 | endif() 16 | 17 | enable_language(HIP) 18 | 19 | set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_amd_gpu.hip") 20 | 21 | try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC}) 22 | 23 | if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0) 24 | set(AMD_FOUND "TRUE") 25 | endif() 26 | -------------------------------------------------------------------------------- /cmake/CheckNvidiaGpu.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
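# Probes for a usable NVIDIA GPU: compiles and runs cmake/check_nvidia_gpu.cu via
# try_run below, and reports NVIDIA_FOUND as "TRUE" only if the probe both builds
# and exits successfully.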
3 | 4 | set(NVIDIA_FOUND "FALSE") 5 | 6 | find_package(CUDAToolkit) 7 | 8 | if(NOT CUDAToolkit_FOUND) 9 | return() 10 | endif() 11 | 12 | set(CMAKE_CUDA_ARCHITECTURES "60") 13 | if(NOT CMAKE_CUDA_COMPILER) 14 | # In case the CUDA Toolkit directory is not in the PATH 15 | find_program(CUDA_COMPILER 16 | NAMES nvcc 17 | PATHS ${CUDAToolkit_BIN_DIR}) 18 | if(NOT CUDA_COMPILER) 19 | message(WARNING "Could not find nvcc in ${CUDAToolkit_BIN_DIR}") 20 | unset(CMAKE_CUDA_ARCHITECTURES) 21 | return() 22 | endif() 23 | set(CMAKE_CUDA_COMPILER "${CUDA_COMPILER}") 24 | endif() 25 | enable_language(CUDA) 26 | 27 | set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_nvidia_gpu.cu") 28 | 29 | try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC}) 30 | 31 | if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0) 32 | set(NVIDIA_FOUND "TRUE") 33 | else() 34 | unset(CMAKE_CUDA_ARCHITECTURES) 35 | unset(CMAKE_CUDA_COMPILER) 36 | endif() 37 | -------------------------------------------------------------------------------- /cmake/FindGDRCopy.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | # Find the GDRCopy libraries 5 | # 6 | # The following variables are optionally searched for defaults 7 | # GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found 8 | # GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found 9 | # GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found 10 | 11 | # The following are set after configuration is done: 12 | # GDRCOPY_FOUND 13 | # GDRCOPY_INCLUDE_DIRS 14 | # GDRCOPY_LIBRARIES 15 | 16 | # An imported target MSCCLPP::gdrcopy is created if the library is found. 17 | 18 | find_path(GDRCOPY_INCLUDE_DIRS 19 | NAMES gdrapi.h 20 | HINTS 21 | ${GDRCOPY_INCLUDE_DIR} 22 | ${GDRCOPY_ROOT_DIR} 23 | ${GDRCOPY_ROOT_DIR}/include) 24 | 25 | find_library(GDRCOPY_LIBRARIES 26 | NAMES gdrapi 27 | HINTS 28 | ${GDRCOPY_LIB_DIR} 29 | ${GDRCOPY_ROOT_DIR} 30 | ${GDRCOPY_ROOT_DIR}/lib) 31 | 32 | include(FindPackageHandleStandardArgs) 33 | find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) 34 | mark_as_advanced(GDRCOPY_INCLUDE_DIR GDRCOPY_LIBRARIES) 35 | 36 | if(GDRCOPY_FOUND) 37 | if(NOT TARGET MSCCLPP::gdrcopy) 38 | add_library(MSCCLPP::gdrcopy UNKNOWN IMPORTED) 39 | endif() 40 | set_target_properties(MSCCLPP::gdrcopy PROPERTIES 41 | INTERFACE_INCLUDE_DIRECTORIES "${GDRCOPY_INCLUDE_DIR}" 42 | IMPORTED_LINK_INTERFACE_LANGUAGES "C" 43 | IMPORTED_LOCATION "${GDRCOPY_LIBRARIES}") 44 | endif() -------------------------------------------------------------------------------- /cmake/FindIBVerbs.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | # Find the IB Verbs libraries 5 | # 6 | # The following variables are optionally searched for defaults 7 | # IBVERBS_ROOT_DIR: Base directory where all ibverbs components are found 8 | # IBVERBS_INCLUDE_DIR: Directory where ibverbs headers are found 9 | # IBVERBS_LIB_DIR: Directory where ibverbs libraries are found 10 | 11 | # The following are set after configuration is done: 12 | # IBVERBS_FOUND 13 | # IBVERBS_INCLUDE_DIRS 14 | # IBVERBS_LIBRARIES 15 | 16 | # An imported target MSCCLPP::ibverbs is created if the library is found. 
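#
# A typical usage sketch (the path and target name here are placeholders):
#   cmake -DIBVERBS_ROOT_DIR=/path/to/rdma-core ..
# and, in CMake code:
#   find_package(IBVerbs)
#   if(IBVERBS_FOUND)
#     target_link_libraries(my_target PRIVATE MSCCLPP::ibverbs)
#   endif()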

find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
  ${IBVERBS_INCLUDE_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/include)

find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
  ${IBVERBS_LIB_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
mark_as_advanced(IBVERBS_INCLUDE_DIR IBVERBS_LIBRARIES)

if(IBVERBS_FOUND)
  if(NOT TARGET MSCCLPP::ibverbs)
    add_library(MSCCLPP::ibverbs UNKNOWN IMPORTED)
  endif()
  set_target_properties(MSCCLPP::ibverbs PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${IBVERBS_LIBRARIES}")
endif()
-------------------------------------------------------------------------------- /cmake/FindNUMA.cmake: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# Find the numa libraries
#
# The following variables are optionally searched for defaults
#  NUMA_ROOT_DIR: Base directory where all numa components are found
#  NUMA_INCLUDE_DIR: Directory where numa headers are found
#  NUMA_LIB_DIR: Directory where numa libraries are found

# The following are set after configuration is done:
#  NUMA_FOUND
#  NUMA_INCLUDE_DIRS
#  NUMA_LIBRARIES

# An imported target MSCCLPP::numa is created if the library is found.

find_path(NUMA_INCLUDE_DIRS
  NAMES numa.h
  HINTS
  ${NUMA_INCLUDE_DIR}
  ${NUMA_ROOT_DIR}
  ${NUMA_ROOT_DIR}/include)

find_library(NUMA_LIBRARIES
  NAMES numa
  HINTS
  ${NUMA_LIB_DIR}
  ${NUMA_ROOT_DIR}
  ${NUMA_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_INCLUDE_DIRS NUMA_LIBRARIES)
mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARIES)

if(NUMA_FOUND)
  if(NOT TARGET MSCCLPP::numa)
    add_library(MSCCLPP::numa UNKNOWN IMPORTED)
  endif()
  set_target_properties(MSCCLPP::numa PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${NUMA_LIBRARIES}")
endif()
-------------------------------------------------------------------------------- /cmake/check_amd_gpu.hip: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <hip/hip_runtime.h>

__global__ void kernel() {}

int main() {
  int cnt;
  hipError_t err = hipGetDeviceCount(&cnt);
  if (err != hipSuccess || cnt == 0) {
    return 1;
  }
  return 0;
}
-------------------------------------------------------------------------------- /cmake/check_nvidia_gpu.cu: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
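// Configure-time probe used by cmake/CheckNvidiaGpu.cmake: exits 0 (success)
// only when at least one CUDA device is visible.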

#include <cuda_runtime.h>

__global__ void kernel() {}

int main() {
  int cnt;
  cudaError_t err = cudaGetDeviceCount(&cnt);
  if (err != cudaSuccess || cnt == 0) {
    return 1;
  }
  return 0;
}
-------------------------------------------------------------------------------- /docker/base-dev-x.dockerfile: --------------------------------------------------------------------------------
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        htop \
        lcov \
        vim \
        && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.26.4
RUN ARCH=$(uname -m) && \
    CMAKE_VERSION="3.26.4" && \
    CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-${ARCH}" && \
    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${ARCH}.tar.gz" && \
    curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
    rm -rf ${CMAKE_HOME}.tar.gz && \
    ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${ARCH}/bin/* /usr/bin/

# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

# Cleanup
RUN rm -rf /tmp/mscclpp
WORKDIR /
-------------------------------------------------------------------------------- /docker/base-x-rocm.dockerfile: --------------------------------------------------------------------------------
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp

ENV DEBIAN_FRONTEND=noninteractive

ENV RCCL_VERSION=rocm-6.2.0
ARG ARCH=gfx942
ENV ARCH_TARGET=${ARCH}
RUN cd /tmp && \
    git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
    cd rccl && \
    ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
    cd ..
&& \ 17 | rm -rf /tmp/rccl 18 | 19 | WORKDIR / 20 | -------------------------------------------------------------------------------- /docker/base-x.dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | 4 | LABEL maintainer="MSCCL++" 5 | LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp 6 | 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | USER root 9 | 10 | RUN rm -rf /opt/nvidia 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | ca-certificates \ 16 | curl \ 17 | git \ 18 | libcap2 \ 19 | libnuma-dev \ 20 | lsb-release \ 21 | openssh-client \ 22 | openssh-server \ 23 | python3-dev \ 24 | python3-pip \ 25 | python3-setuptools \ 26 | python3-wheel \ 27 | sudo \ 28 | wget 29 | 30 | # Install OFED 31 | ARG OFED_VERSION=5.2-2.2.3.0 32 | RUN cd /tmp && \ 33 | ARCH=$(uname -m) && \ 34 | OS_VERSION=$(lsb_release -rs) && \ 35 | OS_VERSION=ubuntu${OS_VERSION} && \ 36 | wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \ 37 | tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \ 38 | MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ 39 | rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* 40 | 41 | # Install OpenMPI (should be done after the OFED installation) & clean apt cache 42 | RUN apt-get update && \ 43 | apt-get install -y --no-install-recommends \ 44 | libopenmpi-dev \ 45 | && \ 46 | apt-get autoremove -y && \ 47 | apt-get clean && \ 48 | rm -rf /var/lib/apt/lists/* /tmp/* 49 | 50 | # OpenMPI short link (for compatibility with old images) 51 | RUN ln -s /usr/lib/x86_64-linux-gnu/openmpi /usr/local/mpi 52 | 53 | ARG EXTRA_LD_PATH= 54 | ENV LD_LIBRARY_PATH="${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" 55 | RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment 56 | 57 | ENTRYPOINT [] 58 | WORKDIR / 59 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | declare -A baseImageTable 6 | baseImageTable=( 7 | ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" 8 | ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" 9 | ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" 10 | ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" 11 | ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" 12 | ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" 13 | ["rocm6.2"]="rocm/rocm-terminal:6.2.1" 14 | ) 15 | 16 | declare -A extraLdPathTable 17 | extraLdPathTable=( 18 | ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" 19 | ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" 20 | ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" 21 | ["rocm6.2"]="/opt/rocm/lib" 22 | ) 23 | 24 | declare -A ofedVersionTable 25 | ofedVersionTable=( 26 | ["cuda12.4"]="23.07-0.5.1.2" 27 | ["cuda12.8"]="24.10-1.1.4.0" 28 | ) 29 | 30 | GHCR="ghcr.io/microsoft/mscclpp/mscclpp" 31 | TARGET=${1} 32 | 33 | print_usage() { 34 | echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|rocm6.2]" 35 | } 36 | 37 | if [[ ! 
-v "baseImageTable[${TARGET}]" ]]; then
  echo "Invalid target: ${TARGET}"
  print_usage
  exit 1
fi
echo "Target: ${TARGET}"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

cd ${SCRIPT_DIR}/..

DEFAULT_OFED_VERSION="5.2-2.2.3.0"
OFED_VERSION=${ofedVersionTable[${TARGET}]}
if [[ -z ${OFED_VERSION} ]]; then
  OFED_VERSION=${DEFAULT_OFED_VERSION}
fi

docker build -t ${GHCR}-common:base-${TARGET} \
  -f docker/base-x.dockerfile \
  --build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \
  --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
  --build-arg TARGET=${TARGET} \
  --build-arg OFED_VERSION=${OFED_VERSION} .

if [[ ${TARGET} == rocm* ]]; then
  echo "Building ROCm base image..."
  docker build -t ${GHCR}:base-${TARGET} \
    -f docker/base-x-rocm.dockerfile \
    --build-arg BASE_IMAGE=${GHCR}-common:base-${TARGET} \
    --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
    --build-arg TARGET=${TARGET} \
    --build-arg ARCH="gfx942" .
  docker rmi ${GHCR}-common:base-${TARGET}
else
  echo "Building CUDA base image..."
  docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET}
  docker rmi --no-prune ${GHCR}-common:base-${TARGET}
fi

docker build -t ${GHCR}:base-dev-${TARGET} \
  -f docker/base-dev-x.dockerfile \
  --build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \
  --build-arg TARGET=${TARGET} .
-------------------------------------------------------------------------------- /docs/.gitignore: --------------------------------------------------------------------------------
doxygen/
_build/
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS   ?=
SPHINXBUILD  ?= sphinx-build
SOURCEDIR    = .
BUILDDIR     = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-------------------------------------------------------------------------------- /docs/README.md: --------------------------------------------------------------------------------
## How to build docs

1. Install `doxygen` and `graphviz`.

   ```bash
   $ sudo apt-get install doxygen graphviz
   ```

2. Install the Python packages below. If you install them into the user-local prefix, add `~/.local/bin` to `$PATH` (so that `sphinx-build` can be found).

   ```bash
   $ sudo python3 -m pip install -r ./requirements.txt
   ```

3. Generate the Doxygen documentation.

   ```bash
   $ doxygen
   ```

4. Build the Sphinx documentation.

   ```bash
   $ make html
   ```

5. Done. The HTML files will be in the `_build/html/` directory.
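
6. (Optional) Preview the result with any static file server, for example Python's built-in one:

   ```bash
   $ python3 -m http.server 8000 --directory _build/html
   ```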
28 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. doxygennamespace:: mscclpp 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "mscclpp" 10 | copyright = "2024, MSCCL++ Team" 11 | author = "MSCCL++ Team" 12 | release = "v0.6.0" 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = ["breathe", "myst_parser"] 18 | 19 | templates_path = ["_templates"] 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | # Breathe configuration 23 | breathe_projects = {"mscclpp": "./doxygen/xml"} 24 | breathe_default_project = "mscclpp" 25 | 26 | # -- Options for HTML output ------------------------------------------------- 27 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 28 | 29 | html_theme = "sphinx_rtd_theme" 30 | html_static_path = ["_static"] 31 | -------------------------------------------------------------------------------- /docs/design/nccl-over-mscclpp.md: -------------------------------------------------------------------------------- 1 | # NCCL Over MSCCL++ 2 | 3 | (limitations)= 4 | ## Limitations 5 | 6 | The current NCCL over MSCCL++ implementation has a few limitations. 7 | 8 | * We do not cover all APIs yet. See the [API Support Table](#api-support-table) for details. 9 | * Multi-node communication is not supported yet. 10 | * Currently, collective communication functions may not work correctly if the buffer address differs from that of previous function calls while sharing the same base address (returned by [cuMemGetAddressRange](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g64fee5711274a2a0573a789c94d8299b)). This is because the current implementation performs zero-copy communication over user buffers, and it is difficult to efficiently inform all ranks if the buffer address changes dynamically. 11 | 12 | (api-support-table)= 13 | ## API Support Table 14 | 15 | The table below lists all NCCL APIs (v2.21). We may cover more APIs in the future. 
16 | 17 | | API Name | Supported | 18 | | :----------------------- | :-------: | 19 | | ncclGetLastError | X | 20 | | ncclGetErrorString | O | 21 | | ncclGetVersion | O | 22 | | ncclGetUniqueId | O | 23 | | ncclCommInitRank | O | 24 | | ncclCommInitAll | X | 25 | | ncclCommInitRankConfig | X | 26 | | ncclCommSplit | X | 27 | | ncclCommFinalize | O | 28 | | ncclCommDestroy | O | 29 | | ncclCommAbort | X | 30 | | ncclCommGetAsyncError | O | 31 | | ncclCommCount | O | 32 | | ncclCommCuDevice | O | 33 | | ncclCommUserRank | O | 34 | | ncclCommRegister | X | 35 | | ncclCommDeregister | X | 36 | | ncclMemAlloc | X | 37 | | ncclMemFree | X | 38 | | ncclAllReduce | O | 39 | | ncclBroadcast | X | 40 | | ncclReduce | X | 41 | | ncclAllGather | O | 42 | | ncclReduceScatter | X | 43 | | ncclGroupStart | O | 44 | | ncclGroupEnd | O | 45 | | ncclSend | X | 46 | | ncclRecv | X | 47 | | ncclRedOpCreatePreMulSum | X | 48 | | ncclRedOpDestroy | X | 49 | 50 | ## Executor Support 51 | 52 | The executor is a versatile tool for specifying how mscclpp executes algorithms. Currently, only the allReduce operation allows for algorithm customization. The following environment variable can be set: 53 | 54 | - MSCCLPP_EXECUTION_PLAN_DIR: Specifies the directory where the executor will look for execution-plan JSON files. 55 | 56 | ```{figure} ../figs/size_boundary_diagram.png 57 | :name: size-boundary-diagram 58 | :alt: Size boundary diagram 59 | :align: center 60 | 61 | Decision Flowchart for Message Size-Based Algorithm Execution 62 | ``` 63 | 64 | The following is an example of running the NCCL API test with the executor: 65 | ``` bash 66 | mpirun -np 8 -x MSCCLPP_EXECUTION_PLAN_DIR=/root/azure-mscclpp/nccl/test/execution-files ./apps/nccl/test/nccl_api_test 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/figs/abstractions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/abstractions.png -------------------------------------------------------------------------------- /docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg -------------------------------------------------------------------------------- /docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg -------------------------------------------------------------------------------- /docs/figs/size_boundary_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/size_boundary_diagram.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/customized-proxy-service.md: -------------------------------------------------------------------------------- 1 | # Customize the Proxy Service 2 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/index.rst: 
-------------------------------------------------------------------------------- 1 | Tutorials 2 | ---------- 3 | 4 | This tutorial section provides a step-by-step guide to help you get started with the C++/Python API. 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Tutorials 9 | :hidden: 10 | 11 | initialization 12 | port-channel 13 | memory-channel 14 | packet-api 15 | customized-proxy-service 16 | python-api 17 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/initialization.md: -------------------------------------------------------------------------------- 1 | # Communication initialization with the MSCCL++ API 2 | 3 | In this tutorial, you will write a simple program to initialize communication between eight GPUs using the MSCCL++ C++ API. You will also learn how to use the Python API to initialize communication. 4 | 5 | ## Prerequisites 6 | A system with eight GPUs is required to run this tutorial. 7 | 8 | Also make sure that you have installed MSCCL++ on your system. If not, please follow the [quick start](../quickstart.md). 9 | 10 | ## Initialize Communication with C++ API 11 | We will set up a mesh topology with eight GPUs. Each GPU will be connected to its neighbors. The following code shows how to initialize communication with the MSCCL++ C++ API. 12 | 13 | ```cpp 14 | #include <memory> 15 | #include <stdexcept> 16 | #include <vector> 17 | 18 | #include <mscclpp/core.hpp> 19 | #include <mscclpp/gpu_utils.hpp> 20 | #include <mscclpp/port_channel.hpp> 21 | 22 | template <class T> 23 | using DeviceHandle = mscclpp::DeviceHandle<T>; 24 | __constant__ DeviceHandle<mscclpp::PortChannel> constPortChans[8]; 25 | 26 | void setupMeshTopology(int rank, int world_size, void* data, size_t dataSize) { 27 | std::string ip_port = "10.0.0.4:50000"; 28 | auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, world_size); 29 | bootstrap->initialize(ip_port); 30 | mscclpp::Communicator comm(bootstrap); 31 | mscclpp::ProxyService proxyService; 32 | 33 | std::vector<mscclpp::SemaphoreId> semaphoreIds; 34 | std::vector<mscclpp::RegisteredMemory> localMemories; 35 | std::vector<std::shared_future<std::shared_ptr<mscclpp::Connection>>> connections(world_size); 36 | std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteMemories; 37 | 38 | for (int r = 0; r < world_size; ++r) { 39 | if (r == rank) continue; 40 | mscclpp::Transport transport = mscclpp::Transport::CudaIpc; 41 | // Connect with all other ranks 42 | connections[r] = comm.connect(r, 0, transport); 43 | auto memory = comm.registerMemory(data, dataSize, transport); 44 | localMemories.push_back(memory); 45 | comm.sendMemory(memory, r, 0); 46 | remoteMemories.push_back(comm.recvMemory(r, 0)); 47 | } 48 | 49 | for (int r = 0; r < world_size; ++r) { 50 | if (r == rank) continue; 51 | semaphoreIds.push_back(proxyService.buildAndAddSemaphore(comm, connections[r].get())); 52 | } 53 | 54 | std::vector<DeviceHandle<mscclpp::PortChannel>> portChannels; 55 | for (size_t i = 0; i < semaphoreIds.size(); ++i) { 56 | portChannels.push_back(mscclpp::deviceHandle(mscclpp::PortChannel( 57 | proxyService.portChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()), 58 | proxyService.addMemory(localMemories[i])))); 59 | } 60 | 61 | if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle<mscclpp::PortChannel>)) { 62 | throw std::runtime_error("unexpected error"); 63 | } 64 | CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(), 65 | sizeof(DeviceHandle<mscclpp::PortChannel>) * portChannels.size())); 66 | } 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/memory-channel.md: -------------------------------------------------------------------------------- 1 | # Using MemoryChannel for Intra-Node Communication 2 | 3 | TBU 4 | 
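Until this page is written, the sketch below illustrates the general shape of device-side `MemoryChannel` usage. It is illustrative only: it assumes a `mscclpp::MemoryChannel` has already been constructed on the host (as in the initialization tutorial) and its device handle copied into a `__constant__` symbol; the kernel name, offsets, and exact method signatures are assumptions rather than verbatim API excerpts.

```cpp
#include <mscclpp/memory_channel.hpp>

// Illustrative device handle; in a real program the host fills this in via
// cudaMemcpyToSymbol after setting up the channel.
__constant__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> constMemChan;

__global__ void putSignalWaitKernel(size_t nBytes) {
  // All threads in the block cooperate on copying nBytes from the local
  // source buffer into the peer's registered destination buffer.
  constMemChan.put(0 /*dstOffset*/, 0 /*srcOffset*/, nBytes, threadIdx.x, blockDim.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    constMemChan.signal();  // tell the peer the data has landed
    constMemChan.wait();    // block until the peer signals back
  }
}
```

Unlike `PortChannel`, which delegates transfers to a host-side proxy, this path performs the copy directly from GPU threads.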
-------------------------------------------------------------------------------- /docs/getting-started/tutorials/packet-api.md: -------------------------------------------------------------------------------- 1 | # Packet API for latency-sensitive applications 2 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/port-channel.md: -------------------------------------------------------------------------------- 1 | # Offload communication to CPU with PortChannel 2 | 3 | TBU 4 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. MSCCL++ documentation master file, created by 2 | sphinx-quickstart on Tue Sep 5 13:03:46 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to MSCCL++'s documentation! 7 | =================================== 8 | 9 | MSCCL++ is a GPU-driven communication stack for scalable AI applications. It is designed to provide a high-performance, scalable, and customizable communication stack for distributed GPU applications. 10 | 11 | Getting Started 12 | --------------- 13 | - Follow the :doc:`quick start <getting-started/quickstart>` for your platform of choice. 14 | - Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first mscclpp program. 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Getting Started 19 | :hidden: 20 | 21 | getting-started/quickstart 22 | getting-started/tutorials/index 23 | 24 | Design 25 | ------- 26 | - :doc:`Design <design/design>` doc for those who want to understand the internals of MSCCL++. 27 | - :doc:`NCCL over MSCCL++ <design/nccl-over-mscclpp>` doc for those who want to understand how to use NCCL over MSCCL++. 28 | - :doc:`MSCCL++ DSL <design/mscclpp-dsl>` doc for those who want to understand the MSCCL++ DSL. 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: Design 33 | :hidden: 34 | 35 | design/design 36 | design/nccl-over-mscclpp 37 | design/mscclpp-dsl 38 | 39 | Performance 40 | --------------- 41 | - We evaluate the performance of MSCCL++ on A100 and H100 GPUs. Here are some :doc:`performance results <performance/performance-ndmv4>` for all-reduce operations. 42 | 43 | .. toctree:: 44 | :maxdepth: 1 45 | :caption: Performance 46 | :hidden: 47 | 48 | performance/performance-ndmv4 49 | 50 | C++ API 51 | --------------- 52 | - :doc:`mscclpp <api/index>` 53 | 54 | 55 | .. toctree:: 56 | :maxdepth: 1 57 | :caption: C++ API 58 | :hidden: 59 | 60 | api/index 61 | 62 | Indices and tables 63 | ================== 64 | 65 | * :ref:`genindex` 66 | * :ref:`modindex` 67 | * :ref:`search` 68 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/performance/performance-ndmv4.md: -------------------------------------------------------------------------------- 1 | # NDmv4 Performance 2 | 3 | TBU 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | breathe 2 | sphinx_rtd_theme 3 | myst_parser 4 | -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS *.hpp) 5 | target_sources(mscclpp_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) 6 | -------------------------------------------------------------------------------- /include/mscclpp/assert_device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_ASSERT_DEVICE_HPP_ 5 | #define MSCCLPP_ASSERT_DEVICE_HPP_ 6 | 7 | #include "device.hpp" 8 | 9 | #if defined(MSCCLPP_DEVICE_COMPILE) 10 | 11 | #include 12 | 13 | #if !defined(DEBUG_BUILD) 14 | 15 | #define MSCCLPP_ASSERT_DEVICE(__cond, __msg) 16 | 17 | #else // defined(DEBUG_BUILD) 18 | 19 | #if defined(MSCCLPP_DEVICE_HIP) 20 | extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 21 | const char *__function); 22 | #else // !defined(MSCCLPP_DEVICE_HIP) 23 | extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 24 | const char *__function) __THROW; 25 | #endif // !defined(MSCCLPP_DEVICE_HIP) 26 | 27 | #define MSCCLPP_ASSERT_DEVICE(__cond, __msg) \ 28 | do { \ 29 | if (!(__cond)) { \ 30 | __assert_fail(__msg, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ 31 | } \ 32 | } while (0) 33 | 34 | #endif // !defined(DEBUG_BUILD) 35 | 36 | #endif // defined(MSCCLPP_DEVICE_COMPILE) 37 | 38 | #endif // MSCCLPP_ASSERT_DEVICE_HPP_ 39 | -------------------------------------------------------------------------------- /include/mscclpp/atomic_device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
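// Usage sketch (hypothetical pointer and values; illustrative only):
//   uint64_t v = mscclpp::atomicLoad(flagPtr, mscclpp::memoryOrderAcquire);
//   mscclpp::atomicStore(flagPtr, v + 1, mscclpp::memoryOrderRelease);
// On CUDA these helpers wrap cuda::atomic_ref; on HIP they wrap the __atomic builtins.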
3 | 4 | #ifndef MSCCLPP_ATOMIC_DEVICE_HPP_ 5 | #define MSCCLPP_ATOMIC_DEVICE_HPP_ 6 | 7 | #include "device.hpp" 8 | 9 | #if defined(MSCCLPP_DEVICE_CUDA) 10 | #include <cuda/atomic> 11 | #endif // defined(MSCCLPP_DEVICE_CUDA) 12 | 13 | namespace mscclpp { 14 | 15 | #if defined(MSCCLPP_DEVICE_CUDA) 16 | 17 | constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed; 18 | constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire; 19 | constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release; 20 | constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel; 21 | constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst; 22 | 23 | constexpr cuda::thread_scope scopeSystem = cuda::thread_scope_system; 24 | constexpr cuda::thread_scope scopeDevice = cuda::thread_scope_device; 25 | 26 | template <typename T, cuda::thread_scope Scope = scopeSystem> 27 | MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) { 28 | return cuda::atomic_ref<T, Scope>{*ptr}.load(memoryOrder); 29 | } 30 | 31 | template <typename T, cuda::thread_scope Scope = scopeSystem> 32 | MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) { 33 | cuda::atomic_ref<T, Scope>{*ptr}.store(val, memoryOrder); 34 | } 35 | 36 | template <typename T, cuda::thread_scope Scope = scopeSystem> 37 | MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) { 38 | return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder); 39 | } 40 | 41 | #elif defined(MSCCLPP_DEVICE_HIP) 42 | 43 | constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED; 44 | constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE; 45 | constexpr auto memoryOrderRelease = __ATOMIC_RELEASE; 46 | constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL; 47 | constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST; 48 | 49 | // HIP does not have thread scope enums like CUDA 50 | constexpr auto scopeSystem = 0; 51 | constexpr auto scopeDevice = 0; 52 | 53 | template <typename T, int Scope = scopeSystem> 54 | MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) { 55 | return __atomic_load_n(ptr, memoryOrder); 56 | } 57 | 58 | template <typename T, int Scope = scopeSystem> 59 | MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) { 60 | __atomic_store_n(ptr, val, memoryOrder); 61 | } 62 | 63 | template <typename T, int Scope = scopeSystem> 64 | MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) { 65 | return __atomic_fetch_add(ptr, val, memoryOrder); 66 | } 67 | 68 | #endif // defined(MSCCLPP_DEVICE_HIP) 69 | 70 | } // namespace mscclpp 71 | 72 | #endif // MSCCLPP_ATOMIC_DEVICE_HPP_ 73 | -------------------------------------------------------------------------------- /include/mscclpp/device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
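// Usage sketch (hypothetical helper; illustrative only): the macros below let a
// function compile for both host and device builds:
//   MSCCLPP_HOST_DEVICE_INLINE int roundUp(int x, int align) { return ((x + align - 1) / align) * align; }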
3 | 4 | #ifndef MSCCLPP_DEVICE_HPP_ 5 | #define MSCCLPP_DEVICE_HPP_ 6 | 7 | #if defined(__HIP_PLATFORM_AMD__) 8 | #include 9 | #endif // defined(__HIP_PLATFORM_AMD__) 10 | 11 | #if (defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)) 12 | 13 | #define MSCCLPP_DEVICE_COMPILE 14 | #define MSCCLPP_INLINE __forceinline__ 15 | #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__ 16 | #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 17 | #if defined(__HIP_PLATFORM_AMD__) 18 | #define MSCCLPP_DEVICE_HIP 19 | #else // !(defined(__HIP_PLATFORM_AMD__) 20 | #define MSCCLPP_DEVICE_CUDA 21 | #endif // !(defined(__HIP_PLATFORM_AMD__)) 22 | 23 | #else // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)) 24 | 25 | #define MSCCLPP_HOST_COMPILE 26 | #define MSCCLPP_INLINE inline 27 | #define MSCCLPP_HOST_DEVICE_INLINE inline 28 | 29 | #endif // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)) 30 | 31 | #endif // MSCCLPP_DEVICE_HPP_ 32 | -------------------------------------------------------------------------------- /include/mscclpp/env.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_ENV_HPP_ 5 | #define MSCCLPP_ENV_HPP_ 6 | 7 | #include 8 | #include 9 | 10 | namespace mscclpp { 11 | 12 | class Env; 13 | 14 | /// Get the MSCCL++ environment. 15 | /// @return A reference to the global environment object. 16 | std::shared_ptr env(); 17 | 18 | /// The MSCCL++ environment. The constructor reads environment variables and sets the corresponding fields. 19 | /// Use the @ref env() function to get the environment object. 20 | class Env { 21 | public: 22 | const std::string debug; 23 | const std::string debugSubsys; 24 | const std::string debugFile; 25 | const std::string hcaDevices; 26 | const std::string hostid; 27 | const std::string socketFamily; 28 | const std::string socketIfname; 29 | const std::string commId; 30 | const std::string executionPlanDir; 31 | const std::string npkitDumpDir; 32 | const bool cudaIpcUseDefaultStream; 33 | const std::string ncclSharedLibPath; 34 | const std::string forceNcclFallbackOperation; 35 | const bool enableNcclFallback; 36 | const bool disableChannelCache; 37 | const bool forceDisableNvls; 38 | 39 | private: 40 | Env(); 41 | 42 | friend std::shared_ptr env(); 43 | }; 44 | 45 | } // namespace mscclpp 46 | 47 | #endif // MSCCLPP_ENV_HPP_ 48 | -------------------------------------------------------------------------------- /include/mscclpp/errors.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_ERRORS_HPP_ 5 | #define MSCCLPP_ERRORS_HPP_ 6 | 7 | #include 8 | 9 | namespace mscclpp { 10 | 11 | /// Enumeration of error codes used by MSCCL++. 12 | enum class ErrorCode { 13 | SystemError, // A system error occurred. 14 | InternalError, // An MSCCL++ internal error occurred. 15 | RemoteError, // An error occurred on a remote system. 16 | InvalidUsage, // The function was used incorrectly. 17 | Timeout, // The operation timed out. 18 | Aborted, // The operation was aborted. 19 | ExecutorError, // An error occurred in the MSCCL++ executor. 20 | }; 21 | 22 | /// Convert an error code to a string. 23 | /// 24 | /// @param error The error code to convert. 25 | /// @return The string representation of the error code. 
26 | std::string errorToString(enum ErrorCode error); 27 | 28 | /// Base class for all errors thrown by MSCCL++. 29 | class BaseError : public std::runtime_error { 30 | public: 31 | /// Constructor for @ref BaseError. 32 | /// 33 | /// @param message The error message. 34 | /// @param errorCode The error code. 35 | BaseError(const std::string& message, int errorCode); 36 | 37 | /// Constructor for @ref BaseError. 38 | /// 39 | /// @param errorCode The error code. 40 | explicit BaseError(int errorCode); 41 | 42 | /// Virtual destructor for BaseError. 43 | virtual ~BaseError() = default; 44 | 45 | /// Get the error code. 46 | /// 47 | /// @return The error code. 48 | int getErrorCode() const; 49 | 50 | /// Get the error message. 51 | /// 52 | /// @return The error message. 53 | const char* what() const noexcept override; 54 | 55 | protected: 56 | std::string message_; 57 | int errorCode_; 58 | }; 59 | 60 | /// A generic error. 61 | class Error : public BaseError { 62 | public: 63 | Error(const std::string& message, ErrorCode errorCode); 64 | virtual ~Error() = default; 65 | ErrorCode getErrorCode() const; 66 | }; 67 | 68 | /// An error from a system call that sets `errno`. 69 | class SysError : public BaseError { 70 | public: 71 | SysError(const std::string& message, int errorCode); 72 | virtual ~SysError() = default; 73 | }; 74 | 75 | /// An error from a CUDA runtime library call. 76 | class CudaError : public BaseError { 77 | public: 78 | CudaError(const std::string& message, int errorCode); 79 | virtual ~CudaError() = default; 80 | }; 81 | 82 | /// An error from a CUDA driver library call. 83 | class CuError : public BaseError { 84 | public: 85 | CuError(const std::string& message, int errorCode); 86 | virtual ~CuError() = default; 87 | }; 88 | 89 | /// An error from an ibverbs library call. 90 | class IbError : public BaseError { 91 | public: 92 | IbError(const std::string& message, int errorCode); 93 | virtual ~IbError() = default; 94 | }; 95 | 96 | }; // namespace mscclpp 97 | 98 | #endif // MSCCLPP_ERRORS_HPP_ 99 | -------------------------------------------------------------------------------- /include/mscclpp/executor.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
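// Usage sketch (hypothetical plan path, buffers, and stream; see execute() below
// for the authoritative parameter list):
//   mscclpp::ExecutionPlan plan("/path/to/allreduce_plan.json");
//   mscclpp::Executor executor(comm);  // comm is a std::shared_ptr<mscclpp::Communicator>
//   executor.execute(rank, buf, buf, bytes, bytes, mscclpp::DataType::FLOAT16, plan, stream);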
3 | 4 | #ifndef MSCCLPP_EXECUTOR_HPP_ 5 | #define MSCCLPP_EXECUTOR_HPP_ 6 | 7 | #include <memory> 8 | #include <string> 9 | #include <mscclpp/core.hpp> 10 | #include <mscclpp/gpu.hpp> 11 | 12 | namespace mscclpp { 13 | 14 | enum class DataType { 15 | INT32, 16 | UINT32, 17 | FLOAT16, 18 | FLOAT32, 19 | BFLOAT16, 20 | }; 21 | 22 | enum class PacketType { 23 | LL8, 24 | LL16, 25 | }; 26 | 27 | class ExecutionPlan { 28 | public: 29 | ExecutionPlan(const std::string& planPath); 30 | ~ExecutionPlan() = default; 31 | 32 | std::string name() const; 33 | std::string collective() const; 34 | size_t minMessageSize() const; 35 | size_t maxMessageSize() const; 36 | bool isInPlace() const; 37 | 38 | private: 39 | struct Impl; 40 | std::shared_ptr<Impl> impl_; 41 | 42 | friend class Executor; 43 | }; 44 | 45 | class Executor { 46 | public: 47 | Executor(std::shared_ptr<Communicator> comm); 48 | Executor(const Executor&) = delete; 49 | Executor& operator=(const Executor&) = delete; 50 | ~Executor(); 51 | 52 | void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType, 53 | const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType = PacketType::LL16); 54 | 55 | private: 56 | struct Impl; 57 | std::unique_ptr<Impl> impl_; 58 | }; 59 | } // namespace mscclpp 60 | 61 | #endif // MSCCLPP_EXECUTOR_HPP_ 62 | -------------------------------------------------------------------------------- /include/mscclpp/fifo.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_FIFO_HPP_ 5 | #define MSCCLPP_FIFO_HPP_ 6 | 7 | #include <cstddef> 8 | #include <cstdint> 9 | #include <memory> 10 | 11 | #include "fifo_device.hpp" 12 | 13 | namespace mscclpp { 14 | 15 | constexpr size_t DEFAULT_FIFO_SIZE = 128; 16 | 17 | /// A class representing a host proxy FIFO that can consume work elements pushed by device threads. 18 | class Fifo { 19 | public: 20 | /// Constructs a new @ref Fifo object. 21 | /// @param size The number of entries in the FIFO. 22 | Fifo(int size = DEFAULT_FIFO_SIZE); 23 | 24 | /// Destroys the @ref Fifo object. 25 | ~Fifo(); 26 | 27 | /// Polls the FIFO for a trigger. 28 | /// 29 | /// Returns @ref ProxyTrigger which is the trigger at the head of the FIFO. 30 | ProxyTrigger poll(); 31 | 32 | /// Pops a trigger from the FIFO. 33 | void pop(); 34 | 35 | /// Flushes the tail of the FIFO. 36 | /// 37 | /// @param sync If true, waits for the flush to complete before returning. 38 | void flushTail(bool sync = false); 39 | 40 | /// Return the FIFO size. 41 | /// @return The FIFO size. 42 | int size() const; 43 | 44 | /// Returns a @ref FifoDeviceHandle object representing the device FIFO. 45 | /// 46 | /// @return A @ref FifoDeviceHandle object representing the device FIFO. 47 | FifoDeviceHandle deviceHandle(); 48 | 49 | private: 50 | struct Impl; 51 | std::unique_ptr<Impl> pimpl; 52 | }; 53 | 54 | } // namespace mscclpp 55 | 56 | #endif // MSCCLPP_FIFO_HPP_ 57 | -------------------------------------------------------------------------------- /include/mscclpp/gpu_data_types.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #ifndef MSCCLPP_GPU_DATA_TYPES_HPP_ 5 | #define MSCCLPP_GPU_DATA_TYPES_HPP_ 6 | 7 | #if defined(__HIP_PLATFORM_AMD__) 8 | 9 | #include 10 | #include 11 | 12 | using __bfloat16 = __hip_bfloat16; 13 | using __bfloat162 = __hip_bfloat162; 14 | #define __CUDA_BF16_TYPES_EXIST__ 15 | 16 | #else 17 | 18 | #include 19 | #include 20 | #if (CUDART_VERSION >= 11000) 21 | #include 22 | #endif 23 | #if (CUDART_VERSION >= 11080) 24 | #include 25 | #endif 26 | 27 | using __bfloat16 = __nv_bfloat16; 28 | using __bfloat162 = __nv_bfloat162; 29 | 30 | #endif 31 | 32 | #endif // MSCCLPP_GPU_DATA_TYPES_HPP_ 33 | -------------------------------------------------------------------------------- /include/mscclpp/memory_channel.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_MEMORY_CHANNEL_HPP_ 5 | #define MSCCLPP_MEMORY_CHANNEL_HPP_ 6 | 7 | #include 8 | 9 | #include "core.hpp" 10 | #include "memory_channel_device.hpp" 11 | #include "semaphore.hpp" 12 | 13 | namespace mscclpp { 14 | 15 | /// Memory channel without specifying source/destination memory regions. 16 | struct BaseMemoryChannel { 17 | protected: 18 | std::shared_ptr semaphore_; 19 | 20 | public: 21 | /// Default constructor. 22 | BaseMemoryChannel() = default; 23 | 24 | /// Constructor. 25 | /// @param semaphore The semaphore used to synchronize the communication. 26 | BaseMemoryChannel(std::shared_ptr semaphore); 27 | 28 | BaseMemoryChannel(const BaseMemoryChannel& other) = default; 29 | 30 | BaseMemoryChannel& operator=(BaseMemoryChannel& other) = default; 31 | 32 | /// Device-side handle for @ref BaseMemoryChannel. 33 | using DeviceHandle = BaseMemoryChannelDeviceHandle; 34 | 35 | /// Returns the device-side handle. 36 | /// 37 | /// User should make sure the BaseMemoryChannel is not released when using the returned handle. 38 | /// 39 | DeviceHandle deviceHandle() const; 40 | }; 41 | 42 | /// Channel for accessing peer memory directly from GPU threads. 43 | struct MemoryChannel : public BaseMemoryChannel { 44 | private: 45 | RegisteredMemory dst_; 46 | void* src_; 47 | void* packetBuffer_; 48 | 49 | public: 50 | /// Default constructor. 51 | MemoryChannel() = default; 52 | 53 | /// Constructor. 54 | /// @param semaphore The semaphore used to synchronize the communication. 55 | /// @param dst Registered memory of the destination. 56 | /// @param src The source memory address. 57 | /// @param packetBuffer A buffer used to store packets. @p packetBuffer is optional and if it is nullptr, 58 | /// unpackPacket() and unpackPackets() methods are not available. 59 | MemoryChannel(std::shared_ptr semaphore, RegisteredMemory dst, void* src, 60 | void* packetBuffer = nullptr); 61 | 62 | /// Device-side handle for @ref MemoryChannel. 63 | using DeviceHandle = MemoryChannelDeviceHandle; 64 | 65 | /// Returns the device-side handle. 66 | /// 67 | /// User should make sure the MemoryChannel is not released when using the returned handle. 68 | /// 69 | DeviceHandle deviceHandle() const; 70 | }; 71 | 72 | /// @deprecated Use @ref MemoryChannel instead. 
73 | [[deprecated("Use MemoryChannel instead.")]] typedef MemoryChannel SmChannel; 74 | 75 | } // namespace mscclpp 76 | 77 | #endif // MSCCLPP_MEMORY_CHANNEL_HPP_ 78 | -------------------------------------------------------------------------------- /include/mscclpp/npkit/npkit_event.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef NPKIT_EVENT_H_ 5 | #define NPKIT_EVENT_H_ 6 | 7 | #define NPKIT_EVENT_INVALID 0x0 8 | 9 | #define NPKIT_EVENT_TIME_SYNC_GPU 0x1 10 | #define NPKIT_EVENT_TIME_SYNC_CPU 0x2 11 | 12 | #define NPKIT_EVENT_CONN_CUDA_IPC_WRITE_ENTRY 0x3 13 | #define NPKIT_EVENT_CONN_CUDA_IPC_WRITE_EXIT 0x4 14 | #define NPKIT_EVENT_CONN_CUDA_IPC_UPDATE_AND_SYNC_ENTRY 0x5 15 | #define NPKIT_EVENT_CONN_CUDA_IPC_UPDATE_AND_SYNC_EXIT 0x6 16 | #define NPKIT_EVENT_CONN_CUDA_IPC_FLUSH_ENTRY 0x7 17 | #define NPKIT_EVENT_CONN_CUDA_IPC_FLUSH_EXIT 0x8 18 | 19 | #define NPKIT_EVENT_CONN_IB_WRITE_ENTRY 0x9 20 | #define NPKIT_EVENT_CONN_IB_WRITE_EXIT 0xA 21 | #define NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_ENTRY 0xB 22 | #define NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT 0xC 23 | #define NPKIT_EVENT_CONN_IB_FLUSH_ENTRY 0xD 24 | #define NPKIT_EVENT_CONN_IB_FLUSH_EXIT 0xE 25 | 26 | #define NPKIT_EVENT_CONN_ETH_WRITE_ENTRY 0xF 27 | #define NPKIT_EVENT_CONN_ETH_WRITE_EXIT 0x10 28 | #define NPKIT_EVENT_CONN_ETH_UPDATE_AND_SYNC_ENTRY 0x11 29 | #define NPKIT_EVENT_CONN_ETH_UPDATE_AND_SYNC_EXIT 0x12 30 | #define NPKIT_EVENT_CONN_ETH_FLUSH_ENTRY 0x13 31 | #define NPKIT_EVENT_CONN_ETH_FLUSH_EXIT 0x14 32 | #define NPKIT_EVENT_CONN_ETH_RECV_META_ENTRY 0x15 33 | #define NPKIT_EVENT_CONN_ETH_RECV_META_EXIT 0x16 34 | #define NPKIT_EVENT_CONN_ETH_RECV_DATA_ENTRY 0x17 35 | #define NPKIT_EVENT_CONN_ETH_RECV_DATA_EXIT 0x18 36 | 37 | #define NPKIT_EVENT_EXECUTOR_INIT_ENTRY 0x19 38 | #define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x1A 39 | 40 | #define NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY 0x1B 41 | #define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x1C 42 | 43 | #define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1D 44 | #define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x31 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /include/mscclpp/npkit/npkit_struct.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef NPKIT_STRUCT_H_ 5 | #define NPKIT_STRUCT_H_ 6 | 7 | #include 8 | 9 | #pragma pack(push, 1) 10 | 11 | union NpKitEvent { 12 | uint64_t bits[2]; 13 | struct { 14 | uint64_t type : 8; 15 | uint64_t size : 32; 16 | uint64_t rsvd : 24; 17 | uint64_t timestamp; 18 | } fields; 19 | }; 20 | 21 | struct NpKitEventCollectContext { 22 | NpKitEvent* event_buffer; 23 | uint64_t event_buffer_head; 24 | }; 25 | 26 | #pragma pack(pop) 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /include/mscclpp/numa.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
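// Usage sketch (hypothetical device index; illustrative only): bind the calling
// thread to the NUMA node closest to a GPU before allocating host buffers for it:
//   mscclpp::numaBind(mscclpp::getDeviceNumaNode(cudaDev));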
3 | 4 | #ifndef MSCCLPP_NUMA_HPP_ 5 | #define MSCCLPP_NUMA_HPP_ 6 | 7 | namespace mscclpp { 8 | 9 | int getDeviceNumaNode(int cudaDev); 10 | void numaBind(int node); 11 | 12 | } // namespace mscclpp 13 | 14 | #endif // MSCCLPP_NUMA_HPP_ 15 | -------------------------------------------------------------------------------- /include/mscclpp/nvls.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_NVLS_HPP_ 5 | #define MSCCLPP_NVLS_HPP_ 6 | 7 | #include <mscclpp/gpu_utils.hpp> 8 | #include <mscclpp/nvls_device.hpp> 9 | 10 | namespace mscclpp { 11 | 12 | class NvlsConnection { 13 | public: 14 | NvlsConnection(size_t bufferSize, int numDevices); 15 | NvlsConnection(const std::vector<char>& data); 16 | NvlsConnection() = delete; 17 | std::vector<char> serialize(); 18 | 19 | // Everyone needs to synchronize after creating an NVLS connection before adding devices 20 | void addDevice(); 21 | void addDevice(int cudaDeviceId); 22 | 23 | struct DeviceMulticastPointer { 24 | private: 25 | void* devicePtr_; 26 | std::shared_ptr<char> mcPtr_; 27 | size_t bufferSize_; 28 | 29 | public: 30 | using DeviceHandle = DeviceMulticastPointerDeviceHandle; 31 | DeviceMulticastPointer(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize) 32 | : devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {} 33 | DeviceHandle deviceHandle() const; 34 | void* getDevicePtr(); 35 | 36 | friend class NvlsConnection; 37 | }; 38 | 39 | /// @brief Bind the memory allocated via @ref mscclpp::GpuBuffer to the multicast handle. The behavior 40 | /// is undefined if the devicePtr is not allocated by @ref mscclpp::GpuBuffer. 41 | /// @param devicePtr The device pointer returned by `mscclpp::GpuBuffer::data()`. 42 | /// @param size The bytes of the memory to bind to the multicast handle. 43 | /// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize 44 | DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size); 45 | 46 | size_t getMultiCastMinGranularity(); 47 | 48 | private: 49 | class Impl; 50 | std::shared_ptr<Impl> pimpl_; 51 | }; 52 | 53 | class Communicator; 54 | 55 | /// Connect to NVLS on setup. 56 | /// 57 | /// This function is used to connect to NVLS on setup. NVLS collectives use multicast operations to send/recv data. 58 | /// All involved ranks must be put into the collective group. 59 | /// 60 | /// @param comm The communicator. 61 | /// @param allRanks The ranks of all processes involved in the collective. 62 | /// @param bufferSize The size in bytes of the multicast buffer. 63 | /// @return A shared pointer to the NVLS connection. 64 | std::shared_ptr<NvlsConnection> connectNvlsCollective(std::shared_ptr<Communicator> comm, std::vector<int> allRanks, 65 | size_t bufferSize); 66 | 67 | } // namespace mscclpp 68 | 69 | #endif // MSCCLPP_NVLS_HPP_ 70 | -------------------------------------------------------------------------------- /include/mscclpp/poll_device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_POLL_DEVICE_HPP_ 5 | #define MSCCLPP_POLL_DEVICE_HPP_ 6 | 7 | #include "assert_device.hpp" 8 | 9 | #if defined(MSCCLPP_DEVICE_COMPILE) 10 | 11 | // If a spin exceeds its maximum count, fail a device-side assertion in debug builds (release builds keep spinning). 
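// Example (hypothetical flag pointer; illustrative only): spin until *flag
// becomes nonzero, tripping the device assertion after 1M spins in debug builds:
//   POLL_MAYBE_JAILBREAK((atomicLoad(flag, memoryOrderAcquire) == 0), 1000000);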
12 | #define POLL_MAYBE_JAILBREAK(__cond, __max_spin_cnt) \ 13 | do { \ 14 | [[maybe_unused]] int64_t __spin_cnt = 0; \ 15 | while (__cond) { \ 16 | MSCCLPP_ASSERT_DEVICE((__max_spin_cnt < 0 || __spin_cnt++ != __max_spin_cnt), #__cond); \ 17 | } \ 18 | } while (0); 19 | 20 | // Same as POLL_MAYBE_JAILBREAK, except that __cond1 is checked before __cond2. 21 | // This is especially useful when __cond1 is faster to check. 22 | #define OR_POLL_MAYBE_JAILBREAK(__cond1, __cond2, __max_spin_cnt) \ 23 | do { \ 24 | [[maybe_unused]] int64_t __spin_cnt = 0; \ 25 | while (true) { \ 26 | if (!(__cond1)) { \ 27 | break; \ 28 | } else if (!(__cond2)) { \ 29 | break; \ 30 | } \ 31 | MSCCLPP_ASSERT_DEVICE((__max_spin_cnt < 0 || __spin_cnt++ != __max_spin_cnt), #__cond1 #__cond2); \ 32 | } \ 33 | } while (0); 34 | 35 | #endif // defined(MSCCLPP_DEVICE_COMPILE) 36 | 37 | #endif // MSCCLPP_POLL_DEVICE_HPP_ 38 | -------------------------------------------------------------------------------- /include/mscclpp/proxy.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_PROXY_HPP_ 5 | #define MSCCLPP_PROXY_HPP_ 6 | 7 | #include <functional> 8 | #include <memory> 9 | 10 | #include "fifo.hpp" 11 | 12 | namespace mscclpp { 13 | 14 | enum class ProxyHandlerResult { 15 | Continue, 16 | FlushFifoTailAndContinue, 17 | Stop, 18 | }; 19 | 20 | class Proxy; 21 | using ProxyHandler = std::function<ProxyHandlerResult(ProxyTrigger)>; 22 | 23 | class Proxy { 24 | public: 25 | Proxy(ProxyHandler handler, std::function<void()> threadInit, size_t fifoSize = DEFAULT_FIFO_SIZE); 26 | Proxy(ProxyHandler handler, size_t fifoSize = DEFAULT_FIFO_SIZE); 27 | ~Proxy(); 28 | 29 | void start(); 30 | void stop(); 31 | 32 | /// This is a concurrent FIFO to which multiple device threads can produce 33 | /// work elements and from which the sole proxy thread consumes them. 34 | /// @return the fifo 35 | Fifo& fifo(); 36 | 37 | private: 38 | struct Impl; 39 | std::unique_ptr<Impl> pimpl; 40 | }; 41 | 42 | } // namespace mscclpp 43 | 44 | #endif // MSCCLPP_PROXY_HPP_ 45 | -------------------------------------------------------------------------------- /include/mscclpp/utils.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_UTILS_HPP_ 5 | #define MSCCLPP_UTILS_HPP_ 6 | 7 | #include <chrono> 8 | #include <cstdint> 9 | #include <string> 10 | 11 | namespace mscclpp { 12 | 13 | struct Timer { 14 | std::chrono::steady_clock::time_point start_; 15 | int timeout_; 16 | 17 | Timer(int timeout = -1); 18 | 19 | ~Timer(); 20 | 21 | /// Returns the elapsed time in microseconds. 22 | int64_t elapsed() const; 23 | 24 | void set(int timeout); 25 | 26 | void reset(); 27 | 28 | void print(const std::string& name); 29 | }; 30 | 31 | struct ScopedTimer : public Timer { 32 | const std::string name_; 33 | 34 | ScopedTimer(const std::string& name); 35 | 36 | ~ScopedTimer(); 37 | }; 38 | 39 | std::string getHostName(int maxlen, const char delim); 40 | 41 | /// Get the number of available InfiniBand devices. 42 | /// 43 | /// @return The number of available InfiniBand devices. 44 | int getIBDeviceCount(); 45 | 46 | /// Get the name of the InfiniBand device associated with the specified transport. 47 | /// 48 | /// @param ibTransport The InfiniBand transport to get the device name for. 49 | /// @return The name of the InfiniBand device associated with the specified transport. 
50 | std::string getIBDeviceName(Transport ibTransport); 51 | 52 | /// Get the InfiniBand transport associated with the specified device name. 53 | /// 54 | /// @param ibDeviceName The name of the InfiniBand device to get the transport for. 55 | /// @return The InfiniBand transport associated with the specified device name. 56 | Transport getIBTransportByDeviceName(const std::string& ibDeviceName); 57 | 58 | } // namespace mscclpp 59 | 60 | #endif // MSCCLPP_UTILS_HPP_ 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | [build-system] 5 | requires = ["scikit-build-core"] 6 | build-backend = "scikit_build_core.build" 7 | 8 | [project] 9 | name = "mscclpp" 10 | version = "0.6.0" 11 | 12 | [tool.scikit-build] 13 | cmake.version = ">=3.25.0" 14 | cmake.build-type = "Release" 15 | # for dlpack issue: https://github.com/microsoft/vcpkg/pull/44679 16 | cmake.args = ["-DCMAKE_POLICY_VERSION_MINIMUM=3.5"] 17 | build-dir = "build/{wheel_tag}" 18 | wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"] 19 | wheel.install-dir = "mscclpp" 20 | 21 | [tool.scikit-build.cmake.define] 22 | MSCCLPP_BUILD_PYTHON_BINDINGS = "ON" 23 | MSCCLPP_BUILD_TESTS = "OFF" 24 | 25 | [tool.black] 26 | line-length = 120 27 | target-version = ['py38'] 28 | include = '\.pyi?$' 29 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | add_subdirectory(mscclpp) 5 | add_subdirectory(test) 6 | 7 | add_custom_target(pytest_lib_copy ALL 8 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 9 | ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so 10 | ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp 11 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 12 | ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so 13 | ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp 14 | DEPENDS mscclpp_py mscclpp_py_test 15 | ) 16 | -------------------------------------------------------------------------------- /python/examples/allgather_allpairs_multinodes_packets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllGather 7 | from mscclpp.language.buffer import Buffer 8 | from mscclpp.language.types import ChannelType, ReplicationPolicy 9 | 10 | 11 | def allgather_multinodes_allpair(gpus, gpus_per_node, instances): 12 | """ 13 | Implements a multi-node allgather collective using an allpairs algorithm with MSCCL++ DSL. 14 | @param gpus: Total number of GPUs 15 | @param gpus_per_node: Number of GPUs per node 16 | Steps: 17 | 1. Each rank sends a chunk to all other ranks' scratch buffers using packet format. 18 | 2. Copy the chunk from the scratch buffer to the output buffer using packet format. 
19 | """ 20 | collective = AllGather(gpus, 1, True) 21 | with MSCCLPPProgram( 22 | "allgather_multinodes_allpair", 23 | collective, 24 | gpus, 25 | instances, 26 | protocol="LL", 27 | replication_policy=ReplicationPolicy.interleaved, 28 | num_threads_per_block=1024, 29 | ): 30 | for g in range(gpus): 31 | src_rank = g 32 | c = chunk(src_rank, Buffer.input, 0, 1) 33 | for peer in range(1, gpus): 34 | dst_rank = (src_rank + peer) % gpus 35 | tb = dst_rank if dst_rank < src_rank else dst_rank - 1 36 | if src_rank // gpus_per_node == dst_rank // gpus_per_node: 37 | c.put_packet(dst_rank, Buffer.scratch, index=src_rank, sendtb=tb) 38 | else: 39 | c.put_packet( 40 | dst_rank, 41 | Buffer.scratch, 42 | index=src_rank, 43 | sendtb=tb, 44 | chan_type=ChannelType.port, 45 | temp_buffer=Buffer.scratch, 46 | temp_buffer_index=src_rank, 47 | ) 48 | 49 | # Copying packet from local scratch buffer to local buffer 50 | for g in range(gpus): 51 | src_rank = g 52 | src_offset = src_rank 53 | for peer in range(1, gpus): 54 | dst_rank = (g + peer) % gpus 55 | tb = src_offset if src_offset < dst_rank else src_offset - 1 56 | c = chunk(dst_rank, Buffer.scratch, src_offset, 1) 57 | c.copy_packet(dst_rank, Buffer.output, src_offset, sendtb=tb + gpus - 1) 58 | 59 | Json() 60 | Check() 61 | 62 | 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("num_gpus", type=int, help="number of gpus") 65 | parser.add_argument("gpus_per_node", type=int, help="number of gpus") 66 | parser.add_argument("instances", type=int, help="number of instances") 67 | 68 | args = parser.parse_args() 69 | 70 | allgather_multinodes_allpair( 71 | args.num_gpus, 72 | args.gpus_per_node, 73 | args.instances, 74 | ) 75 | -------------------------------------------------------------------------------- /python/examples/allgather_barrier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mscclpp.language import * 3 | from mscclpp.language.buffer import Buffer 4 | from mscclpp.language.collectives import AllGather 5 | from mscclpp.language.types import ChannelType, ReplicationPolicy 6 | 7 | 8 | def allgather_test(gpus, instances): 9 | """ 10 | Demonstrates how to use barrier in the MSCCL++ DSL with an allgather collective. 11 | This example uses an allpairs algorithm for the allgather operation. 12 | Steps: 13 | 1. Each rank sends a chunk to all other ranks' output buffers and copies the chunk to its own output buffer. 14 | 2. A barrier is called to synchronize the send and copy operations, and signal peers that the data has been sent. 15 | 3. Wait for all the chunks from other ranks to be received. 
16 | """ 17 | size = gpus 18 | collective = AllGather(size, 1, False) 19 | with MSCCLPPProgram( 20 | "allgather_with_barrier", 21 | collective, 22 | size, 23 | instances, 24 | protocol="Simple", 25 | replication_policy=ReplicationPolicy.interleaved, 26 | ): 27 | for n in range(gpus): 28 | c = chunk(n, Buffer.input, 0, 1) 29 | for peer in range(gpus): 30 | if n != peer: 31 | c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) 32 | else: 33 | c.copy(n, Buffer.output, n, sendtb=peer) 34 | # explicit barrier 35 | r = rank(n) 36 | r.barrier(tb_list=list(range(gpus))) 37 | for peer in range(gpus): 38 | if n != peer: 39 | c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) 40 | 41 | for n in range(gpus): 42 | for peer in range(gpus): 43 | c = chunk(n, Buffer.output, peer, 1) 44 | if n != peer: 45 | c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.memory) 46 | 47 | Json() 48 | Check() 49 | 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("num_gpus", type=int, help="number of gpus") 53 | parser.add_argument("instances", type=int, help="number of instances") 54 | args = parser.parse_args() 55 | allgather_test(args.num_gpus, args.instances) 56 | -------------------------------------------------------------------------------- /python/examples/allreduce_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_allpairs(gpus, instances, protocol): 11 | """ 12 | Demonstrate allreduce with all pairs algorithm using put semantics. 13 | Steps: 14 | 1. Sync all ranks to ensure the data is ready. 15 | 2. Each rank reads chunks from all peers and reduces the data. 16 | 3. Put the reduced data to all peers. 17 | 4. Sync all ranks to ensure the data is received. 
18 | """ 19 | size = gpus 20 | chunksperloop = gpus * gpus 21 | collective = AllReduce(size, chunksperloop, True) 22 | with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol=protocol): 23 | for rank in range(size): 24 | for tb in range(size): 25 | index = rank * size 26 | c = chunk(rank, Buffer.input, index + tb) 27 | # step1 make sure the data is ready 28 | for nghr in range(size): 29 | peer_index = nghr * size 30 | if rank != nghr: 31 | # signal peer the buffer is ready 32 | c_peer = chunk(rank, Buffer.input, peer_index + tb) 33 | c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb) 34 | for nghr in range(size): 35 | if rank != nghr: 36 | c.wait(nghr, Buffer.input, index + tb, recvtb=tb) 37 | # step2 reduce the chunks and send to peers 38 | for nghr in range(size): 39 | if rank != nghr: 40 | c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb) 41 | for nghr in range(size): 42 | if rank != nghr: 43 | c.put(nghr, Buffer.input, index + tb, sendtb=tb) 44 | # step3 signal the peers buffer is ready 45 | for nghr in range(size): 46 | if rank != nghr: 47 | c.signal(nghr, Buffer.input, index + tb, sendtb=tb) 48 | for nghr in range(size): 49 | if rank != nghr: 50 | peer_index = nghr * size 51 | c_peer = chunk(rank, Buffer.input, peer_index + tb) 52 | c_peer.wait(nghr, Buffer.input, peer_index + tb, recvtb=tb) 53 | 54 | Json() 55 | Check() 56 | 57 | 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("num_gpus", type=int, help="number of gpus") 60 | parser.add_argument("instances", type=int, help="number of instances") 61 | parser.add_argument("--protocol", type=str, default="Simple", choices=["Simple"], help="Protocol") 62 | 63 | args = parser.parse_args() 64 | 65 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) 66 | -------------------------------------------------------------------------------- /python/examples/allreduce_allpairs_packet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_allpairs(gpus, instances): 11 | """ 12 | AllReduce with all pairs algorithm using packets format. 13 | Steps: 14 | 1. Each rank sends its nth chunk to the nth rank's scratch space. 15 | 2. Each rank performs a local reduction on its nth chunk using data from all other ranks' scratch spaces. 16 | 3. Each rank sends the reduced data to all other ranks' scratch spaces. 17 | 4. Each rank retrieves the final reduced result from the scratch space. 
18 | """ 19 | size = gpus 20 | chunksperloop = gpus * gpus 21 | collective = AllReduce(size, chunksperloop, True) 22 | with MSCCLPPProgram( 23 | "allreduce_packets", 24 | collective, 25 | size, 26 | instances, 27 | protocol="LL", 28 | use_double_scratch_buffer=True, 29 | ): 30 | # Each rank sends the nth chunk to the nth rank into scratch space 31 | for r1 in range(size): 32 | for tb in range(size): 33 | if tb == r1: 34 | continue 35 | remote_rank = tb 36 | index = remote_rank * size 37 | c = chunk(r1, Buffer.input, index, size) 38 | c.put_packet(remote_rank, Buffer.scratch, index=r1 * size, sendtb=tb) 39 | 40 | # Each rank performs a local reduction on the nth chunk 41 | # Utilize 8 threadblocks for this reduction for better parallelism 42 | for r in range(size): 43 | for index in range(size): 44 | c = chunk(r, Buffer.input, r * size + index) 45 | for peer in range(size): 46 | if peer != r: 47 | c.reduce_packet(chunk(r, Buffer.scratch, peer * size + index), recvtb=index) 48 | for peer in range(size): 49 | if peer != r: 50 | c.put_packet(peer, Buffer.scratch, (size * size) + r * size + index, sendtb=index) 51 | 52 | # Each rank get final result from scratch space 53 | for r in range(size): 54 | for peer in range(size): 55 | if peer != r: 56 | c = chunk(r, Buffer.scratch, size * size + peer * size, size) 57 | c.copy_packet(r, Buffer.input, peer * size, sendtb=peer) 58 | 59 | Json() 60 | Check() 61 | 62 | 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("num_gpus", type=int, help="number of gpus") 65 | parser.add_argument("instances", type=int, help="number of instances") 66 | 67 | args = parser.parse_args() 68 | 69 | allreduce_allpairs(args.num_gpus, args.instances) 70 | -------------------------------------------------------------------------------- /python/examples/allreduce_nvls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_nvls(gpus, instances): 11 | """ 12 | Allreduce via NVLS channel 13 | Steps: 14 | 1. Sync all the ranks to make sure the data is ready. 15 | 2. Call group_load_reduce to reduce the data. 16 | 3. Call group_store to propagate the data to all the ranks. 
17 | """ 18 | size = gpus 19 | chunksperloop = gpus 20 | collective = AllReduce(size, chunksperloop, True) 21 | with MSCCLPPProgram( 22 | "allreduce_nvls", 23 | collective, 24 | size, 25 | instances, 26 | ): 27 | # Each rank sends the nth chunk to the nth rank into scratch space 28 | for rank in range(size): 29 | index = rank 30 | c = chunk(rank, Buffer.input, index) 31 | reduce_chunks = [] 32 | # make sure the data is ready 33 | for nghr in range(size): 34 | if rank != nghr: 35 | c_peer = chunk(nghr, Buffer.input, index) 36 | reduce_chunks.append(c_peer) 37 | c.signal(nghr, Buffer.input, index, sendtb=0) 38 | for nghr in range(size): 39 | if rank != nghr: 40 | c.wait(nghr, Buffer.input, index, recvtb=0) 41 | c = c.group_load_reduce(reduce_chunks, recvtb=0) 42 | ngbrs = [nghr for nghr in range(size) if nghr != rank] 43 | c.group_store(ngbrs, sendtb=0) 44 | 45 | Json() 46 | Check() 47 | 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("num_gpus", type=int, help="number of gpus") 51 | parser.add_argument("instances", type=int, help="number of instances") 52 | 53 | args = parser.parse_args() 54 | 55 | allreduce_nvls(args.num_gpus, args.instances) 56 | -------------------------------------------------------------------------------- /python/examples/allreduce_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_ring(size, instances): 11 | """ 12 | Implements a ring based allreduce. 13 | Steps: 14 | 1. Send signal to next rank and wait for signal from previous rank. Make sure the data is ready in previous rank. 15 | 2. Reduce the data and send to next rank. 16 | 3. After all the data is reduced, propagate the data to all the ranks. 
17 | """ 18 | collective = AllReduce(size, size, True) 19 | with MSCCLPPProgram( 20 | f"allreduce_ring", 21 | collective, 22 | size, 23 | instances, 24 | protocol="Simple", 25 | ): 26 | # Reduce ring 27 | for step in range(0, size - 1): 28 | for index in range(0, size): 29 | rank = (index + step) % size 30 | next_rank = (index + step + 1) % size 31 | c = chunk(rank, Buffer.input, index) 32 | c.signal(next_rank, Buffer.input, index, 0) 33 | prev_rank = (index + step - 1) % size 34 | c = chunk(rank, Buffer.input, (index + size - 1) % size) 35 | c.wait(prev_rank, Buffer.input, (index + size - 1) % size, 0) 36 | c.reduce(chunk(prev_rank, Buffer.input, (index + size - 1) % size), recvtb=0) 37 | 38 | # Propagate ring 39 | for step in range(-1, size - 2): 40 | for index in range(0, size): 41 | rank = (index + step) % size 42 | c = chunk(rank, Buffer.input, index) 43 | next_rank = (index + step + 1) % size 44 | c.put(next_rank, Buffer.input, index, sendtb=0) 45 | c.signal(next_rank, Buffer.input, index, 0) 46 | prev_rank = (index + step - 1) % size 47 | c = chunk(rank, Buffer.input, (index + size - 1) % size) 48 | c.wait(prev_rank, Buffer.input, (index + size - 1) % size, 0) 49 | 50 | Json() 51 | Check() 52 | 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("num_gpus", type=int, help="number of gpus") 56 | parser.add_argument("instances", type=int, help="number of instances") 57 | args = parser.parse_args() 58 | 59 | allreduce_ring(args.num_gpus, args.instances) 60 | -------------------------------------------------------------------------------- /python/examples/send_recv_packet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import SendRecv 7 | from mscclpp.language.buffer import Buffer 8 | from mscclpp.language.types import ChannelType 9 | 10 | 11 | def send_recv(instances): 12 | """ 13 | Send and receive data between two ranks using port channels, with LL protocol and double scratch buffer. 14 | Steps: 15 | 1. Each rank sends a chunk to every other rank's scratch buffer with packet format via port channel. 16 | 2. Wait for the data to be received, then copy it to the output buffer. 17 | """ 18 | size = 2 19 | chunksperloop = 1 20 | collective = SendRecv(size, chunksperloop, False) 21 | with MSCCLPPProgram( 22 | "send_recv", 23 | collective, 24 | size, 25 | instances, 26 | protocol="LL", 27 | use_double_scratch_buffer=True, 28 | ): 29 | for r in range(size): 30 | for nghr in range(size): 31 | if nghr == r: 32 | continue 33 | c = chunk(r, Buffer.input, 0) 34 | c.put_packet( 35 | nghr, 36 | Buffer.scratch, 37 | 1, 38 | sendtb=0, 39 | chan_type=ChannelType.port, 40 | temp_buffer=Buffer.scratch, 41 | temp_buffer_index=0, 42 | ) 43 | 44 | for r in range(size): 45 | c = chunk(r, Buffer.scratch, 1) 46 | c.copy_packet(r, Buffer.output, 0, sendtb=0) 47 | 48 | Json() 49 | Check() 50 | 51 | 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("instances", type=int, help="number of instances") 54 | 55 | args = parser.parse_args() 56 | 57 | send_recv(args.instances) 58 | -------------------------------------------------------------------------------- /python/examples/send_recv_proxy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.buffer import Buffer 7 | from mscclpp.language.collectives import SendRecv 8 | from mscclpp.language.types import ChannelType 9 | 10 | 11 | def send_recv(instances): 12 | """ 13 | Send and receive data between two ranks using port channels. 14 | steps: 15 | 1. Each rank sends a chunk to the other rank's scratch buffer and signals the other rank that the data has been sent. 16 | 2. Wait for the data to be received then copy it to the output buffer. 17 | """ 18 | size = 2 19 | chunksperloop = 1 20 | collective = SendRecv(size, chunksperloop, False) 21 | with MSCCLPPProgram( 22 | "send_recv", 23 | collective, 24 | size, 25 | instances, 26 | ): 27 | for r in range(size): 28 | for nghr in range(size): 29 | if nghr == r: 30 | continue 31 | c = chunk(r, Buffer.input, 0) 32 | c.put( 33 | nghr, 34 | Buffer.scratch, 35 | 1, 36 | sendtb=0, 37 | chan_type=ChannelType.port, 38 | ) 39 | c.signal(nghr, Buffer.scratch, 1, sendtb=0, chan_type=ChannelType.port) 40 | c.flush(nghr, Buffer.scratch, 1, sendtb=0, chan_type=ChannelType.port) 41 | 42 | for r in range(size): 43 | c = chunk(r, Buffer.scratch, 1) 44 | c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.port) 45 | c.copy(r, Buffer.output, 0, sendtb=0) 46 | 47 | Json() 48 | Check() 49 | 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("instances", type=int, help="number of instances") 53 | 54 | args = parser.parse_args() 55 | 56 | send_recv(args.instances) 57 | -------------------------------------------------------------------------------- /python/mscclpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) 5 | include(FetchContent) 6 | FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) 7 | FetchContent_MakeAvailable(nanobind) 8 | 9 | FetchContent_Declare(dlpack GIT_REPOSITORY https://github.com/dmlc/dlpack.git GIT_TAG v1.1) 10 | FetchContent_MakeAvailable(dlpack) 11 | 12 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) 13 | nanobind_add_module(mscclpp_py ${SOURCES}) 14 | set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp) 15 | target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp_static ${GPU_LIBRARIES}) 16 | target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) 17 | install(TARGETS mscclpp_py LIBRARY DESTINATION .) 18 | -------------------------------------------------------------------------------- /python/mscclpp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | import os 5 | import warnings 6 | from functools import wraps 7 | 8 | from ._mscclpp import ( 9 | Env, 10 | ErrorCode, 11 | BaseError, 12 | Error, 13 | SysError, 14 | CudaError, 15 | CuError, 16 | IbError, 17 | Communicator, 18 | Connection, 19 | connect_nvls_collective, 20 | EndpointConfig, 21 | Fifo, 22 | Host2DeviceSemaphore, 23 | Host2HostSemaphore, 24 | numa, 25 | ProxyService, 26 | RegisteredMemory, 27 | PortChannel, 28 | MemoryChannel, 29 | MemoryDevice2DeviceSemaphore, 30 | TcpBootstrap, 31 | Transport, 32 | TransportFlags, 33 | DataType, 34 | Executor, 35 | ExecutionPlan, 36 | PacketType, 37 | RawGpuBuffer, 38 | env, 39 | version, 40 | is_nvls_supported, 41 | npkit, 42 | ) 43 | 44 | 45 | __all__ = [ 46 | "Communicator", 47 | "Connection", 48 | "connect_nvls_collective", 49 | "EndpointConfig", 50 | "Fifo", 51 | "Host2DeviceSemaphore", 52 | "Host2HostSemaphore", 53 | "numa", 54 | "ProxyService", 55 | "RegisteredMemory", 56 | "PortChannel", 57 | "MemoryChannel", 58 | "MemoryDevice2DeviceSemaphore", 59 | "TcpBootstrap", 60 | "Transport", 61 | "TransportFlags", 62 | "DataType", 63 | "Executor", 64 | "ExecutionPlan", 65 | "PacketType", 66 | "version", 67 | "is_nvls_supported", 68 | "alloc_shared_physical_cuda", 69 | "npkit", 70 | "__version__", 71 | "get_include", 72 | "get_lib", 73 | ### Deprecated ### 74 | "ProxyChannel", 75 | "SmChannel", 76 | "SmDevice2DeviceSemaphore", 77 | ] 78 | 79 | __version__: str = str(version()) 80 | 81 | if os.environ.get("MSCCLPP_HOME", None) is None: 82 | os.environ["MSCCLPP_HOME"] = os.path.abspath(os.path.dirname(__file__)) 83 | 84 | 85 | def get_include() -> str: 86 | """Return the directory that contains the MSCCL++ headers.""" 87 | return os.path.join(os.path.dirname(__file__), "include") 88 | 89 | 90 | def get_lib() -> str: 91 | """Return the directory that contains the MSCCL++ headers.""" 92 | return os.path.join(os.path.dirname(__file__), "lib") 93 | 94 | 95 | def deprecated(new_cls): 96 | def decorator(old_cls): 97 | @wraps(old_cls) 98 | def wrapper(*args, **kwargs): 99 | warnings.warn( 100 | f"{old_cls.__name__} is deprecated, use {new_cls.__name__} instead.", 101 | DeprecationWarning, 102 | ) 103 | return new_cls(*args, **kwargs) 104 | 105 | return wrapper 106 | 107 | return decorator 108 | 109 | 110 | @deprecated(PortChannel) 111 | class ProxyChannel(PortChannel): 112 | pass 113 | 114 | 115 | @deprecated(MemoryChannel) 116 | class SmChannel(MemoryChannel): 117 | pass 118 | 119 | 120 | @deprecated(MemoryDevice2DeviceSemaphore) 121 | class SmDevice2DeviceSemaphore(MemoryDevice2DeviceSemaphore): 122 | pass 123 | -------------------------------------------------------------------------------- /python/mscclpp/env_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace nb = nanobind; 11 | using namespace mscclpp; 12 | 13 | void register_env(nb::module_& m) { 14 | nb::class_(m, "Env") 15 | .def_ro("debug", &Env::debug) 16 | .def_ro("debug_subsys", &Env::debugSubsys) 17 | .def_ro("debug_file", &Env::debugFile) 18 | .def_ro("hca_devices", &Env::hcaDevices) 19 | .def_ro("hostid", &Env::hostid) 20 | .def_ro("socket_family", &Env::socketFamily) 21 | .def_ro("socket_ifname", &Env::socketIfname) 22 | .def_ro("comm_id", &Env::commId) 23 | .def_ro("execution_plan_dir", &Env::executionPlanDir) 24 | .def_ro("npkit_dump_dir", &Env::npkitDumpDir) 25 | .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); 26 | 27 | m.def("env", &env); 28 | } 29 | -------------------------------------------------------------------------------- /python/mscclpp/error_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | using namespace mscclpp; 11 | 12 | #define REGISTER_EXCEPTION_TRANSLATOR(name_) \ 13 | nb::register_exception_translator( \ 14 | [](const std::exception_ptr &p, void *payload) { \ 15 | try { \ 16 | std::rethrow_exception(p); \ 17 | } catch (const name_ &e) { \ 18 | PyErr_SetObject(reinterpret_cast(payload), \ 19 | PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \ 20 | } \ 21 | }, \ 22 | m.attr(#name_).ptr()); 23 | 24 | void register_error(nb::module_ &m) { 25 | nb::enum_(m, "ErrorCode") 26 | .value("SystemError", ErrorCode::SystemError) 27 | .value("InternalError", ErrorCode::InternalError) 28 | .value("RemoteError", ErrorCode::RemoteError) 29 | .value("InvalidUsage", ErrorCode::InvalidUsage) 30 | .value("Timeout", ErrorCode::Timeout) 31 | .value("Aborted", ErrorCode::Aborted) 32 | .value("ExecutorError", ErrorCode::ExecutorError); 33 | 34 | nb::exception(m, "BaseError"); 35 | REGISTER_EXCEPTION_TRANSLATOR(BaseError); 36 | 37 | nb::exception(m, "Error", m.attr("BaseError").ptr()); 38 | REGISTER_EXCEPTION_TRANSLATOR(Error); 39 | 40 | nb::exception(m, "SysError", m.attr("BaseError").ptr()); 41 | REGISTER_EXCEPTION_TRANSLATOR(SysError); 42 | 43 | nb::exception(m, "CudaError", m.attr("BaseError").ptr()); 44 | REGISTER_EXCEPTION_TRANSLATOR(CudaError); 45 | 46 | nb::exception(m, "CuError", m.attr("BaseError").ptr()); 47 | REGISTER_EXCEPTION_TRANSLATOR(CuError); 48 | 49 | nb::exception(m, "IbError", m.attr("BaseError").ptr()); 50 | REGISTER_EXCEPTION_TRANSLATOR(IbError); 51 | } 52 | -------------------------------------------------------------------------------- /python/mscclpp/executor_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace nb = nanobind; 12 | using namespace mscclpp; 13 | 14 | void register_executor(nb::module_& m) { 15 | nb::enum_(m, "DataType") 16 | .value("int32", DataType::INT32) 17 | .value("uint32", DataType::UINT32) 18 | .value("float16", DataType::FLOAT16) 19 | .value("float32", DataType::FLOAT32) 20 | .value("bfloat16", DataType::BFLOAT16); 21 | 22 | nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); 23 | 24 | nb::class_(m, "ExecutionPlan") 25 | .def(nb::init(), nb::arg("planPath")) 26 | .def("name", &ExecutionPlan::name) 27 | .def("collective", &ExecutionPlan::collective) 28 | .def("min_message_size", &ExecutionPlan::minMessageSize) 29 | .def("max_message_size", &ExecutionPlan::maxMessageSize); 30 | 31 | nb::class_(m, "Executor") 32 | .def(nb::init>(), nb::arg("comm")) 33 | .def( 34 | "execute", 35 | [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize, 36 | DataType dataType, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) { 37 | self->execute(rank, reinterpret_cast(sendbuff), reinterpret_cast(recvBuff), sendBuffSize, 38 | recvBuffSize, dataType, plan, (cudaStream_t)stream, packetType); 39 | }, 40 | nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"), 41 | nb::arg("dataType"), nb::arg("plan"), nb::arg("stream"), nb::arg("packetType") = PacketType::LL16); 42 | } 43 | -------------------------------------------------------------------------------- /python/mscclpp/fifo_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace nb = nanobind; 9 | using namespace mscclpp; 10 | 11 | void register_fifo(nb::module_& m) { 12 | nb::class_(m, "ProxyTrigger").def_rw("fst", &ProxyTrigger::fst).def_rw("snd", &ProxyTrigger::snd); 13 | 14 | nb::class_(m, "FifoDeviceHandle") 15 | .def_rw("triggers", &FifoDeviceHandle::triggers) 16 | .def_rw("tail_replica", &FifoDeviceHandle::tailReplica) 17 | .def_rw("head", &FifoDeviceHandle::head) 18 | .def_rw("size", &FifoDeviceHandle::size) 19 | .def_prop_ro("raw", [](const FifoDeviceHandle& self) -> nb::bytes { 20 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 21 | }); 22 | 23 | nb::class_(m, "Fifo") 24 | .def(nb::init(), nb::arg("size") = DEFAULT_FIFO_SIZE) 25 | .def("poll", &Fifo::poll) 26 | .def("pop", &Fifo::pop) 27 | .def("flush_tail", &Fifo::flushTail, nb::arg("sync") = false) 28 | .def("size", &Fifo::size) 29 | .def("device_handle", &Fifo::deviceHandle); 30 | } 31 | -------------------------------------------------------------------------------- /python/mscclpp/language/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from mscclpp.language.program import MSCCLPPProgram, Json, Check, chunk, rank 5 | -------------------------------------------------------------------------------- /python/mscclpp/language/buffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from enum import Enum 5 | 6 | 7 | # Scratch buffer slice with manual indexing 8 | class BufferSlice: 9 | def __init__(self, buf, name): 10 | self.name = name 11 | self.buf = buf 12 | self.offset = -1 # Offset into the global scratch buffer 13 | self.chunks = [] 14 | 15 | # Returns the global index into the scratch buffer 16 | def get_global_index(self, index): 17 | assert self.offset > -1, "set_offset needs to be called first" 18 | return self.offset + index 19 | 20 | def get_buffer(self): 21 | return self.buf 22 | 23 | def instance_size(self): 24 | return len(self.chunks) 25 | 26 | def set_offset(self, offset): 27 | self.offset = offset 28 | 29 | def __getitem__(self, index): 30 | return self.chunks[index] 31 | 32 | def __setitem__(self, index, value): 33 | current_size = len(self.chunks) 34 | while index > current_size: 35 | self.chunks.append(None) 36 | current_size = len(self.chunks) 37 | if index == current_size: 38 | self.chunks.append(value) 39 | else: 40 | self.chunks[index] = value 41 | 42 | def __len__(self): 43 | return len(self.chunks) 44 | 45 | 46 | class Buffer(Enum): 47 | input = "i" 48 | output = "o" 49 | scratch = "s" 50 | 51 | def __str__(self): 52 | return self.value 53 | 54 | def __lt__(self, other): 55 | return self.value < other.value 56 | 57 | def __gt__(self, other): 58 | return self.value < other.value 59 | -------------------------------------------------------------------------------- /python/mscclpp/language/chunk.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class Chunk: 10 | origin_rank: int # Rank the chunk initially started at 11 | origin_index: int # Index the chunk initially started at 12 | dst_rank: int = -1 13 | dst_index: int = -1 14 | 15 | def reduce(self, dst, chunk): 16 | if type(chunk) is ReduceChunk: 17 | return chunk.reduce(dst, self) 18 | elif type(chunk) is Chunk: 19 | chunks = [self, chunk] 20 | return ReduceChunk(dst, chunks) 21 | else: 22 | raise ValueError("Trying to reduce with chunk of None") 23 | 24 | def __hash__(self): 25 | return hash((self.origin_rank, self.origin_index)) 26 | 27 | def __eq__(self, other): 28 | return ( 29 | type(other) is Chunk and self.origin_rank == other.origin_rank and self.origin_index == other.origin_index 30 | ) 31 | 32 | def __lt__(self, other): 33 | return self.origin_rank < other.origin_rank or ( 34 | self.origin_rank == other.origin_rank and self.origin_index < other.origin_index 35 | ) 36 | 37 | 38 | @dataclass 39 | class ReduceChunk: 40 | creation_rank: int # Rank the Reduce Chunk is created. 
Necessary since the same ReduceChunk can be created on multiple ranks independently 41 | chunks: list # List of chunks reduced 42 | 43 | def reduce(self, dst, chunk): 44 | if type(chunk) is ReduceChunk: 45 | chunks = self.chunks + chunk.chunks 46 | elif type(chunk) is Chunk: 47 | chunks = self.chunks + [chunk] 48 | else: 49 | raise ValueError("Trying to reduce with chunk of None") 50 | return ReduceChunk(self.creation_rank, chunks) 51 | 52 | def sort(self): 53 | self.chunks.sort() 54 | 55 | def __hash__(self): 56 | self.sort() 57 | return hash((self.creation_rank,) + tuple(self.chunks)) 58 | 59 | # Two reduce chunks are equal if they contain the same list of 60 | # chunks being reduced 61 | def __eq__(self, other): 62 | self.sort() 63 | other.sort() 64 | return self.chunks == other.chunks 65 | -------------------------------------------------------------------------------- /python/mscclpp/language/dag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from mscclpp.language.dag.instruction_dag import InstructionDAG 5 | from mscclpp.language.dag.lower import DagLower 6 | from mscclpp.language.dag.optimizer import DagOptimizer 7 | -------------------------------------------------------------------------------- /python/mscclpp/language/rank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from dataclasses import dataclass, field 5 | from typing import Dict 6 | 7 | 8 | class BarrierInfo: 9 | def __init__(self, tb_list): 10 | self.tb_list = tb_list 11 | 12 | def __eq__(self, other): 13 | return self.tb_list == other.tb_list 14 | 15 | def __hash__(self): 16 | return hash(tuple(self.tb_list)) 17 | 18 | 19 | @dataclass 20 | class Rank: 21 | rank_id: int 22 | current_max_barrier_id: int = 0 23 | current_barriers: Dict[BarrierInfo, int] = field(default_factory=dict) 24 | 25 | def get_barrier_id(self, tb_list): 26 | barrier_info = BarrierInfo(tb_list) 27 | if barrier_info in self.current_barriers: 28 | return self.current_barriers[barrier_info] 29 | else: 30 | self.current_barriers[barrier_info] = self.current_max_barrier_id 31 | barrier_id = self.current_max_barrier_id 32 | self.current_max_barrier_id += 1 33 | return barrier_id 34 | -------------------------------------------------------------------------------- /python/mscclpp/language/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from mscclpp.language.types import Op 5 | 6 | 7 | def remove_op(op: Op): 8 | for p in op.prev: 9 | p.next.remove(op) 10 | p.next += op.next 11 | p.next = list(set(p.next)) 12 | 13 | for n in op.next: 14 | n.prev.remove(op) 15 | n.prev = op.prev.union(n.prev) 16 | 17 | op.next = [] 18 | op.prev = [] 19 | 20 | 21 | def merge_op(op: Op, other_op: Op): 22 | if other_op in op.next: 23 | op.next.remove(other_op) 24 | other_op.prev.remove(op) 25 | for p in other_op.prev: 26 | p.next.remove(other_op) 27 | p.next.append(op) 28 | 29 | for n in other_op.next: 30 | n.prev.remove(other_op) 31 | n.prev.add(op) 32 | 33 | op.prev = op.prev.union(other_op.prev) 34 | op.next = list(set(op.next + other_op.next)) 35 | 36 | 37 | def circular_dep_after_merge(op: Op, other_op: Op): 38 | root = set([op, other_op]) 39 | frontier = set(op.next) 40 | visited = set() 41 | if other_op in frontier: 42 | frontier.remove(other_op) 43 | frontier = list(frontier.union(other_op.next)) 44 | while len(frontier) > 0: 45 | current = frontier[0] 46 | for n in current.next: 47 | # The root node will be visited again if there is a circular dependency 48 | if n in root: 49 | return True 50 | if n not in visited: 51 | frontier.append(n) 52 | visited.add(n) 53 | frontier = frontier[1:] 54 | 55 | 56 | def all_prevs_visited_after_merge(op: Op, other_op: Op): 57 | """ 58 | For case: op2.prev = [op1, op3]. op1.next = [op2]. op3.next = [op2]. And op1 and op2 are satisfied to merge. 59 | We only apply the merge if all previous ops of op2 are visited. (op1 is the last previous op of op2). 60 | """ 61 | step = op.step 62 | for prev in other_op.prev: 63 | if prev.step > step: 64 | return False 65 | return True 66 | 67 | 68 | def same_tb(op1: Op, op2: Op): 69 | return op1.tb == op2.tb and op1.channel == op2.channel 70 | 71 | 72 | def same_count(op1: Op, op2: Op): 73 | return op1.cnt() == op2.cnt() 74 | 75 | 76 | def same_buf_dst(op1: Op, op2: Op): 77 | return op1.dst.buffer == op2.dst.buffer and op1.dst.index == op2.dst.index 78 | 79 | 80 | def same_src_dst_buffer_type(op1: Op, op2: Op): 81 | return op1.src.buffer == op2.src.buffer and op1.dst.buffer == op2.dst.buffer 82 | 83 | 84 | def buf_dst_src_match(op1: Op, op2: Op): 85 | return op1.dst.buffer == op2.src.buffer and op1.dst.index == op2.src.index 86 | 87 | 88 | def same_buf_src(op1: Op, op2: Op): 89 | return op1.src.buffer == op2.src.buffer and op1.src.index == op2.src.index 90 | 91 | 92 | def same_chan_type(op1: Op, op2: Op): 93 | return op1.channel_type == op2.channel_type 94 | 95 | 96 | def same_tb(op1: Op, op2: Op): 97 | return op1.tb == op2.tb 98 | -------------------------------------------------------------------------------- /python/mscclpp/memory_channel_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace nb = nanobind; 11 | using namespace mscclpp; 12 | 13 | void register_memory_channel(nb::module_& m) { 14 | nb::class_ baseMemoryChannel(m, "BaseMemoryChannel"); 15 | baseMemoryChannel 16 | .def("__init__", 17 | [](BaseMemoryChannel* baseMemoryChannel, std::shared_ptr semaphore) { 18 | new (baseMemoryChannel) BaseMemoryChannel(semaphore); 19 | }) 20 | .def("device_handle", &BaseMemoryChannel::deviceHandle); 21 | 22 | nb::class_(m, "BaseMemoryChannelDeviceHandle") 23 | .def(nb::init<>()) 24 | .def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_) 25 | .def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes { 26 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 27 | }); 28 | 29 | nb::class_ memoryChannel(m, "MemoryChannel"); 30 | memoryChannel 31 | .def("__init__", 32 | [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, 33 | RegisteredMemory dst, 34 | uintptr_t src) { new (memoryChannel) MemoryChannel(semaphore, dst, reinterpret_cast(src)); }) 35 | .def("__init__", 36 | [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, 37 | RegisteredMemory dst, uintptr_t src, uintptr_t packet_buffer) { 38 | new (memoryChannel) 39 | MemoryChannel(semaphore, dst, reinterpret_cast(src), reinterpret_cast(packet_buffer)); 40 | }) 41 | .def("device_handle", &MemoryChannel::deviceHandle); 42 | 43 | nb::class_(m, "MemoryChannelDeviceHandle") 44 | .def(nb::init<>()) 45 | .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) 46 | .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) 47 | .def_rw("src_", &MemoryChannel::DeviceHandle::src_) 48 | .def_rw("packetBuffer_", &MemoryChannel::DeviceHandle::packetBuffer_) 49 | .def_prop_ro("raw", [](const MemoryChannel::DeviceHandle& self) -> nb::bytes { 50 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 51 | }); 52 | }; 53 | -------------------------------------------------------------------------------- /python/mscclpp/npkit_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | 11 | void register_npkit(nb::module_ &m) { 12 | nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions"); 13 | sub_m.def("init", &NpKit::Init); 14 | sub_m.def("dump", &NpKit::Dump); 15 | sub_m.def("shutdown", &NpKit::Shutdown); 16 | } 17 | -------------------------------------------------------------------------------- /python/mscclpp/numa_py.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | namespace nb = nanobind; 3 | 4 | namespace mscclpp { 5 | int getDeviceNumaNode(int cudaDev); 6 | void numaBind(int node); 7 | }; // namespace mscclpp 8 | 9 | void register_numa(nb::module_ &m) { 10 | nb::module_ sub_m = m.def_submodule("numa", "numa functions"); 11 | sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); 12 | sub_m.def("numa_bind", &mscclpp::numaBind); 13 | } 14 | -------------------------------------------------------------------------------- /python/mscclpp/nvls_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | namespace nb = nanobind; 15 | using namespace mscclpp; 16 | 17 | void register_nvls(nb::module_& m) { 18 | nb::class_(m, "DeviceMulticastPointer") 19 | .def("get_device_ptr", 20 | [](NvlsConnection::DeviceMulticastPointer* self) { return (uintptr_t)self->getDevicePtr(); }) 21 | .def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); 22 | 23 | nb::class_(m, "DeviceHandle") 24 | .def(nb::init<>()) 25 | .def_rw("devicePtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::devicePtr) 26 | .def_rw("mcPtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::mcPtr) 27 | .def_rw("size", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::bufferSize) 28 | .def_prop_ro("raw", [](const NvlsConnection::DeviceMulticastPointer::DeviceHandle& self) -> nb::bytes { 29 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 30 | }); 31 | 32 | nb::class_(m, "NvlsConnection") 33 | .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("devicePtr"), nb::arg("size")) 34 | .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity); 35 | 36 | m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"), 37 | nb::arg("bufferSize")); 38 | } 39 | -------------------------------------------------------------------------------- /python/mscclpp/port_channel_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace nb = nanobind; 11 | using namespace mscclpp; 12 | 13 | void register_port_channel(nb::module_& m) { 14 | nb::class_(m, "BaseProxyService") 15 | .def("start_proxy", &BaseProxyService::startProxy) 16 | .def("stop_proxy", &BaseProxyService::stopProxy); 17 | 18 | nb::class_(m, "ProxyService") 19 | .def(nb::init(), nb::arg("fifoSize") = DEFAULT_FIFO_SIZE) 20 | .def("start_proxy", &ProxyService::startProxy) 21 | .def("stop_proxy", &ProxyService::stopProxy) 22 | .def("build_and_add_semaphore", &ProxyService::buildAndAddSemaphore, nb::arg("comm"), nb::arg("connection")) 23 | .def("add_semaphore", &ProxyService::addSemaphore, nb::arg("semaphore")) 24 | .def("add_memory", &ProxyService::addMemory, nb::arg("memory")) 25 | .def("semaphore", &ProxyService::semaphore, nb::arg("id")) 26 | .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) 27 | .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); 28 | 29 | nb::class_(m, "BasePortChannel") 30 | .def(nb::init, std::shared_ptr>(), 31 | nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy")) 32 | .def("device_handle", &BasePortChannel::deviceHandle); 33 | 34 | nb::class_(m, "BasePortChannelDeviceHandle") 35 | .def(nb::init<>()) 36 | .def_rw("semaphoreId_", &BasePortChannel::DeviceHandle::semaphoreId_) 37 | .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) 38 | .def_rw("fifo_", &BasePortChannel::DeviceHandle::fifo_) 39 | .def_prop_ro("raw", [](const BasePortChannel::DeviceHandle& self) -> nb::bytes { 40 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 41 | }); 42 | 43 | nb::class_(m, "PortChannel") 44 | .def(nb::init, std::shared_ptr, MemoryId, MemoryId>(), 45 | nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), 
nb::arg("src")) 46 | .def("device_handle", &PortChannel::deviceHandle); 47 | 48 | nb::class_(m, "PortChannelDeviceHandle") 49 | .def(nb::init<>()) 50 | .def_rw("semaphoreId_", &PortChannel::DeviceHandle::semaphoreId_) 51 | .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) 52 | .def_rw("fifo_", &PortChannel::DeviceHandle::fifo_) 53 | .def_rw("src_", &PortChannel::DeviceHandle::src_) 54 | .def_rw("dst_", &PortChannel::DeviceHandle::dst_) 55 | .def_prop_ro("raw", [](const PortChannel::DeviceHandle& self) -> nb::bytes { 56 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 57 | }); 58 | }; 59 | -------------------------------------------------------------------------------- /python/mscclpp/semaphore_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | using namespace mscclpp; 11 | 12 | void register_semaphore(nb::module_& m) { 13 | nb::class_ host2DeviceSemaphore(m, "Host2DeviceSemaphore"); 14 | host2DeviceSemaphore 15 | .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) 16 | .def("connection", &Host2DeviceSemaphore::connection) 17 | .def("signal", &Host2DeviceSemaphore::signal) 18 | .def("device_handle", &Host2DeviceSemaphore::deviceHandle); 19 | 20 | nb::class_(host2DeviceSemaphore, "DeviceHandle") 21 | .def(nb::init<>()) 22 | .def_rw("inbound_semaphore_id", &Host2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) 23 | .def_rw("expected_inbound_semaphore_id", &Host2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) 24 | .def_prop_ro("raw", [](const Host2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { 25 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 26 | }); 27 | 28 | nb::class_(m, "Host2HostSemaphore") 29 | .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) 30 | .def("connection", &Host2HostSemaphore::connection) 31 | .def("signal", &Host2HostSemaphore::signal) 32 | .def("poll", &Host2HostSemaphore::poll) 33 | .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), 34 | nb::arg("max_spin_count") = 10000000); 35 | 36 | nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); 37 | memoryDevice2DeviceSemaphore 38 | .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) 39 | .def("device_handle", &MemoryDevice2DeviceSemaphore::deviceHandle); 40 | 41 | nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") 42 | .def(nb::init<>()) 43 | .def_rw("inboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) 44 | .def_rw("outboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundSemaphoreId) 45 | .def_rw("remoteInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundSemaphoreId) 46 | .def_rw("expectedInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) 47 | .def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { 48 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 49 | }); 50 | } 51 | -------------------------------------------------------------------------------- /python/mscclpp/utils_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | using namespace mscclpp; 11 | 12 | void register_utils(nb::module_& m) { 13 | nb::class_(m, "Timer") 14 | .def(nb::init(), nb::arg("timeout") = -1) 15 | .def("elapsed", &Timer::elapsed) 16 | .def("set", &Timer::set, nb::arg("timeout")) 17 | .def("reset", &Timer::reset) 18 | .def("print", &Timer::print, nb::arg("name")); 19 | 20 | nb::class_(m, "ScopedTimer").def(nb::init(), nb::arg("name")); 21 | 22 | m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim")); 23 | } 24 | -------------------------------------------------------------------------------- /python/mscclpp_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5 2 | -------------------------------------------------------------------------------- /python/mscclpp_benchmark/nccl_op.py: -------------------------------------------------------------------------------- 1 | import cupy.cuda.nccl as nccl 2 | from mpi4py import MPI 3 | import cupy as cp 4 | 5 | 6 | class NcclAllReduce: 7 | def __init__(self, nccl_comm: nccl.NcclCommunicator, memory: cp.ndarray): 8 | self.nccl_comm = nccl_comm 9 | self.memory = memory 10 | if memory.dtype == cp.float32: 11 | self.nccl_dtype = nccl.NCCL_FLOAT32 12 | elif memory.dtype == cp.float16: 13 | self.nccl_dtype = nccl.NCCL_FLOAT16 14 | elif memory.dtype == cp.int32: 15 | self.nccl_dtype = nccl.NCCL_INT32 16 | else: 17 | raise RuntimeError("Make sure that the data type is mapped to the correct NCCL data type") 18 | 19 | def __call__(self, stream): 20 | stream_ptr = stream.ptr if stream else 0 21 | self.nccl_comm.allReduce( 22 | self.memory.data.ptr, self.memory.data.ptr, self.memory.size, self.nccl_dtype, nccl.NCCL_SUM, stream_ptr 23 | ) 24 | return self.memory 25 | -------------------------------------------------------------------------------- /python/requirements_cuda11.txt: -------------------------------------------------------------------------------- 1 | mpi4py 2 | cupy-cuda11x 3 | prettytable 4 | netifaces 5 | pytest 6 | numpy 7 | matplotlib 8 | -------------------------------------------------------------------------------- /python/requirements_cuda12.txt: -------------------------------------------------------------------------------- 1 | mpi4py 2 | cupy-cuda12x 3 | prettytable 4 | netifaces 5 | pytest 6 | numpy 7 | matplotlib 8 | -------------------------------------------------------------------------------- /python/requirements_rocm6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/python/requirements_rocm6.txt -------------------------------------------------------------------------------- /python/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) 5 | include(FetchContent) 6 | FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) 7 | FetchContent_MakeAvailable(nanobind) 8 | 9 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) 10 | nanobind_add_module(mscclpp_py_test ${SOURCES}) 11 | set_target_properties(mscclpp_py_test PROPERTIES OUTPUT_NAME _ext) 12 | target_link_libraries(mscclpp_py_test PRIVATE mscclpp_static ${GPU_LIBRARIES}) 13 | target_include_directories(mscclpp_py_test SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) 14 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/python/test/__init__.py -------------------------------------------------------------------------------- /python/test/_cpp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/python/test/_cpp/__init__.py -------------------------------------------------------------------------------- /python/test/configs/mscclpp_lang_test_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "filename": "allgather_barrier.py", 4 | "args": ["8", "8"] 5 | }, 6 | { 7 | "filename": "allreduce_allpairs_packet.py", 8 | "args": ["8", "8"] 9 | }, 10 | { 11 | "filename": "allreduce_allpairs_get.py", 12 | "args": ["8", "8"] 13 | }, 14 | { 15 | "filename": "allreduce_allpairs.py", 16 | "args": ["8", "8"] 17 | }, 18 | { 19 | "filename": "allreduce_ring.py", 20 | "args": ["8", "8"] 21 | }, 22 | { 23 | "filename": "send_recv_packet.py", 24 | "args": ["2"] 25 | }, 26 | { 27 | "filename": "send_recv_proxy.py", 28 | "args": ["2"] 29 | }, 30 | { 31 | "filename": "allreduce_nvls.py", 32 | "args": ["8", "2"] 33 | }, 34 | { 35 | "filename": "allgather_allpairs_multinodes_packets.py", 36 | "args": ["16", "8", "1"] 37 | } 38 | ] 39 | -------------------------------------------------------------------------------- /python/test/d2d_semaphore_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | // be careful about using semaphore[my_rank] as it is an invalid semaphore and it is there just for simplicity of 7 | // indexing 8 | extern "C" __global__ void __launch_bounds__(1024, 1) 9 | d2d_semaphore(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { 10 | int tid = threadIdx.x; 11 | if (tid < nranks && tid != my_rank) { 12 | semaphores[tid].signal(); 13 | semaphores[tid].wait(); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /python/test/fifo_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | 6 | extern "C" __global__ void __launch_bounds__(1024, 1) fifo(mscclpp::FifoDeviceHandle fifo) { 7 | mscclpp::ProxyTrigger trigger; 8 | trigger.fst = 123; 9 | fifo.push(trigger); 10 | } 11 | -------------------------------------------------------------------------------- /python/test/h2d_semaphore_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | // be careful about using semaphore[my_rank] as it is an invalid semaphore and it is there just for simplicity of 7 | // indexing 8 | extern "C" __global__ void __launch_bounds__(1024, 1) 9 | h2d_semaphore(mscclpp::Host2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { 10 | int tid = threadIdx.x; 11 | if (tid < nranks && tid != my_rank) semaphores[tid].wait(); 12 | } 13 | -------------------------------------------------------------------------------- /python/test/memory_channel_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | // be careful about using channels[my_rank] as it is inavlie and it is there just for simplicity of indexing 7 | extern "C" __global__ void __launch_bounds__(1024, 1) 8 | memory_channel(mscclpp::MemoryChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements, 9 | int use_packet) { 10 | int tid = threadIdx.x; 11 | int bid = blockIdx.x; 12 | uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; 13 | uint64_t my_offset = size_per_rank * my_rank; 14 | uint64_t my_nghr_offset = size_per_rank * bid; 15 | int flag = 123; 16 | if (bid < nranks && bid != my_rank) { 17 | if (use_packet) { 18 | channels[bid].putPackets(2 * my_offset, my_offset, size_per_rank, tid, blockDim.x, flag); 19 | channels[bid].unpackPackets(2 * my_nghr_offset, my_nghr_offset, size_per_rank, tid, blockDim.x, flag); 20 | } else { 21 | channels[bid].put(my_offset, my_offset, size_per_rank, tid, blockDim.x); 22 | __syncthreads(); 23 | if (!use_packet && tid == 0) { 24 | channels[bid].signal(); 25 | channels[bid].wait(); 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /python/test/mscclpp_mpi.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | import atexit 5 | import logging 6 | 7 | import cupy as cp 8 | import mpi4py 9 | 10 | mpi4py.rc.initialize = False 11 | mpi4py.rc.finalize = False 12 | 13 | from mpi4py import MPI 14 | import pytest 15 | 16 | N_GPUS_PER_NODE = 8 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | def init_mpi(): 22 | if not MPI.Is_initialized(): 23 | MPI.Init() 24 | shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) 25 | N_GPUS_PER_NODE = shm_comm.size 26 | shm_comm.Free() 27 | cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() 28 | 29 | 30 | # Define a function to finalize MPI 31 | def finalize_mpi(): 32 | if MPI.Is_initialized(): 33 | MPI.Finalize() 34 | 35 | 36 | # Register the function to be called on exit 37 | atexit.register(finalize_mpi) 38 | 39 | 40 | class MpiGroup: 41 | def __init__(self, ranks: list = []): 42 | world_group = MPI.COMM_WORLD.group 43 | if len(ranks) == 0: 44 | self.comm = MPI.COMM_WORLD 45 | else: 46 | group = world_group.Incl(ranks) 47 | self.comm = MPI.COMM_WORLD.Create(group) 48 | 49 | 50 | @pytest.fixture 51 | def mpi_group(request: pytest.FixtureRequest): 52 | MPI.COMM_WORLD.barrier() 53 | if request.param is None: 54 | pytest.skip(f"Skip for rank {MPI.COMM_WORLD.rank}") 55 | yield request.param 56 | 57 | 58 | def parametrize_mpi_groups(*tuples: tuple): 59 | def decorator(func): 60 | mpi_groups = [] 61 | for group_size in list(tuples): 62 | if MPI.COMM_WORLD.size < group_size: 63 | logging.warning(f"MPI.COMM_WORLD.size < {group_size}, skip") 64 | continue 65 | mpi_group = MpiGroup(list(range(group_size))) 66 | if mpi_group.comm == MPI.COMM_NULL: 67 | mpi_groups.append(None) 68 | else: 69 | mpi_groups.append(mpi_group) 70 | return pytest.mark.parametrize("mpi_group", mpi_groups, indirect=True)(func) 71 | 72 | return decorator 73 | 74 | 75 | init_mpi() 76 | -------------------------------------------------------------------------------- /python/test/nvls_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | __device__ mscclpp::DeviceSyncer deviceSyncer; 10 | 11 | extern "C" __global__ void __launch_bounds__(1024, 1) 12 | nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, 13 | mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { 14 | int nelem = nbytes / sizeof(float); 15 | float* dev_ptr = (float*)nvlsPtrs.devicePtr; 16 | float* mc_ptr = (float*)nvlsPtrs.mcPtr; 17 | int tid = threadIdx.x; 18 | int bid = blockIdx.x; 19 | 20 | for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { 21 | dev_ptr[idx] = my_rank; 22 | } 23 | deviceSyncer.sync(gridDim.x); 24 | if (tid == 0 && bid == 0) { 25 | __threadfence_system(); 26 | } 27 | 28 | if (bid == 0) { 29 | if (tid < nranks && tid != my_rank) { 30 | semaphores[tid].signal(); 31 | semaphores[tid].wait(); 32 | } 33 | } 34 | deviceSyncer.sync(gridDim.x); 35 | 36 | int my_st = ((int64_t)nelem * (int64_t)my_rank) / (int64_t)nranks; 37 | int my_en = ((int64_t)nelem * (int64_t)(my_rank + 1)) / (int64_t)nranks; 38 | 39 | int my_offset = (tid + bid * blockDim.x) * 4; 40 | int my_step = blockDim.x * gridDim.x * 4; 41 | 42 | for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { 43 | uint4 val; 44 | mscclpp::DeviceMulticastPointerDeviceHandle::multimemLoadReduce(val, mc_ptr + idx); 45 | mscclpp::DeviceMulticastPointerDeviceHandle::multimemStore(val, mc_ptr + idx); 46 | } 47 | 48 | deviceSyncer.sync(gridDim.x); 49 | if (tid == 0 && bid == 0) { 50 | __threadfence_system(); 51 | } 52 | 53 | if (bid == 0) { 54 | if (tid < nranks && tid != my_rank) { 55 | semaphores[tid].signal(); 56 | semaphores[tid].wait(); 57 | } 58 | } 59 | deviceSyncer.sync(gridDim.x); 60 | 61 | for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { 62 | MSCCLPP_ASSERT_DEVICE(dev_ptr[idx] == ((nranks * (nranks - 1)) / 2), "dev_ptr[idx] != nranks"); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /python/test/port_channel_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | // be careful about using channels[my_rank] as it is inavlie and it is there just for simplicity of indexing 8 | extern "C" __global__ void __launch_bounds__(1024, 1) 9 | port_channel(mscclpp::PortChannelDeviceHandle* channels, int my_rank, int nranks, int* data, int* scratch, 10 | int num_elements, int use_packet) { 11 | int tid = threadIdx.x; 12 | int nthreads = blockDim.x; 13 | uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; 14 | uint64_t my_offset = size_per_rank * my_rank; 15 | int nthreads_per_rank = nthreads / nranks; 16 | int my_nghr = tid / nthreads_per_rank; 17 | uint64_t my_nghr_offset = size_per_rank * my_nghr; 18 | __syncthreads(); 19 | int flag = 123; 20 | if (use_packet) { 21 | mscclpp::copyToPackets((char*)scratch + 2 * my_offset, (char*)data + my_offset, size_per_rank, tid, nthreads, flag); 22 | __syncthreads(); 23 | if (tid < nranks && tid != my_rank) { 24 | channels[tid].put(2 * my_offset, 2 * my_offset, 2 * size_per_rank); 25 | } 26 | if (my_nghr != my_rank && my_nghr < nranks) 27 | mscclpp::copyFromPackets((char*)data + my_nghr_offset, (char*)scratch + 2 * my_nghr_offset, size_per_rank, 28 | tid % nthreads_per_rank, nthreads_per_rank, flag); 29 | } else { 30 | if (tid < nranks && tid != my_rank) { 31 | channels[tid].putWithSignalAndFlush(my_offset, my_offset, size_per_rank); 32 | channels[tid].wait(); 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /python/test/proxy_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | extern "C" __global__ void __launch_bounds__(1024, 1) proxy(int my_rank, int nranks, mscclpp::FifoDeviceHandle fifo, 8 | mscclpp::Host2DeviceSemaphoreDeviceHandle* semaphores) { 9 | int tid = threadIdx.x; 10 | if (tid == 0) { 11 | mscclpp::ProxyTrigger trigger; 12 | trigger.fst = 123; 13 | trigger.snd = 0; 14 | uint64_t currentFifoHead = fifo.push(trigger); 15 | // wait for the work to be done in cpu side 16 | fifo.sync(currentFifoHead); 17 | } 18 | __syncthreads(); 19 | if (tid < nranks && tid != my_rank) { 20 | semaphores[tid].wait(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /python/test/test_generate_mscclpp_lang_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import argparse 5 | import json 6 | from pathlib import Path 7 | import subprocess 8 | 9 | 10 | def run_examples(input_folder, configs, output_folder): 11 | for config in configs: 12 | file_name = config["filename"] 13 | args = config["args"] 14 | 15 | input_file_path = Path(input_folder) / file_name 16 | # Strip the ".py" from the filename and add ".output" 17 | base_file_name = file_name[:-3] if file_name.endswith(".py") else file_name 18 | base_file_name = base_file_name.replace("/", "_") 19 | output_file_path = Path(output_folder) / f"{base_file_name}.output" 20 | 21 | # Construct the command to run the Python script 22 | command = ["python3", str(input_file_path)] + args 23 | 24 | # Run the command and capture output 25 | with open(output_file_path, "w") as output_file: 26 | result = subprocess.run(command, stdout=output_file, stderr=subprocess.STDOUT, text=True) 27 | 28 | # Optional: Check the return code to handle errors 29 | if result.returncode != 0: 30 | print(f"Error running {file_name}. See {output_file_path} for details.") 31 | 32 | 33 | def main(input_folder, config_path, output_folder): 34 | with open(config_path, "r") as f: 35 | config = json.load(f) 36 | 37 | Path(output_folder).mkdir(parents=True, exist_ok=True) 38 | run_examples(input_folder, config, output_folder) 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser(description="Process files according to a configuration and save the results.") 43 | parser.add_argument("input_folder", type=str, help="Path to the folder containing the input files.") 44 | parser.add_argument("config", type=str, help="Path to the configuration file.") 45 | parser.add_argument("output_folder", type=str, help="Path to the folder where the processed files will be saved.") 46 | args = parser.parse_args() 47 | main(args.input_folder, args.config, args.output_folder) 48 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | # Python in-place installs move the .so files into the source directories. 2 | *.so 3 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu) 5 | target_sources(mscclpp_obj PRIVATE ${SOURCES}) 6 | target_include_directories(mscclpp_obj PRIVATE include) 7 | -------------------------------------------------------------------------------- /src/c_style_remnants.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include "api.h" 5 | #include "debug.h" 6 | 7 | MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) { mscclppDebugDefaultLogHandler(msg); } 8 | 9 | MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) { 10 | return mscclppDebugSetLogHandler(handler); 11 | } 12 | 13 | MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) { 14 | switch (code) { 15 | case mscclppSuccess: 16 | return "no error"; 17 | case mscclppUnhandledCudaError: 18 | return "unhandled cuda error"; 19 | case mscclppSystemError: 20 | return "unhandled system error"; 21 | case mscclppInternalError: 22 | return "internal error"; 23 | case mscclppInvalidArgument: 24 | return "invalid argument"; 25 | case mscclppInvalidUsage: 26 | return "invalid usage"; 27 | case mscclppRemoteError: 28 | return "remote process exited or there was a network error"; 29 | case mscclppInProgress: 30 | return "MSCCL++ operation in progress"; 31 | default: 32 | return "unknown result code"; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/context.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include "context.hpp" 5 | 6 | #include "api.h" 7 | #include "connection.hpp" 8 | #include "debug.h" 9 | #include "endpoint.hpp" 10 | #include "registered_memory.hpp" 11 | 12 | namespace mscclpp { 13 | 14 | Context::Impl::Impl() : ipcStream_(std::make_shared()) {} 15 | 16 | IbCtx* Context::Impl::getIbContext(Transport ibTransport) { 17 | // Find IB context or create it 18 | auto it = ibContexts_.find(ibTransport); 19 | if (it == ibContexts_.end()) { 20 | auto ibDev = getIBDeviceName(ibTransport); 21 | ibContexts_[ibTransport] = std::make_unique(ibDev); 22 | return ibContexts_[ibTransport].get(); 23 | } else { 24 | return it->second.get(); 25 | } 26 | } 27 | 28 | MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique()) {} 29 | 30 | MSCCLPP_API_CPP Context::~Context() = default; 31 | 32 | MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) { 33 | return RegisteredMemory(std::make_shared(ptr, size, transports, *pimpl_)); 34 | } 35 | 36 | MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { 37 | return Endpoint(std::make_shared(config, *pimpl_)); 38 | } 39 | 40 | MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpoint, Endpoint remoteEndpoint) { 41 | std::shared_ptr conn; 42 | if (localEndpoint.transport() == Transport::CudaIpc) { 43 | if (remoteEndpoint.transport() != Transport::CudaIpc) { 44 | throw mscclpp::Error("Local transport is CudaIpc but remote is not", ErrorCode::InvalidUsage); 45 | } 46 | conn = std::make_shared(localEndpoint, remoteEndpoint, pimpl_->ipcStream_); 47 | } else if (AllIBTransports.has(localEndpoint.transport())) { 48 | if (!AllIBTransports.has(remoteEndpoint.transport())) { 49 | throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); 50 | } 51 | conn = std::make_shared(localEndpoint, remoteEndpoint, *this); 52 | } else if (localEndpoint.transport() == Transport::Ethernet) { 53 | if (remoteEndpoint.transport() != Transport::Ethernet) { 54 | throw mscclpp::Error("Local transport is Ethernet but remote is not", ErrorCode::InvalidUsage); 55 | } 56 | conn = std::make_shared(localEndpoint, remoteEndpoint); 57 | } else { 58 | throw mscclpp::Error("Unsupported transport", 
ErrorCode::InternalError); 59 | } 60 | 61 | pimpl_->connections_.push_back(conn); 62 | return conn; 63 | } 64 | 65 | } // namespace mscclpp 66 | -------------------------------------------------------------------------------- /src/errors.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "api.h" 9 | 10 | namespace mscclpp { 11 | 12 | std::string errorToString(enum ErrorCode error) { 13 | switch (error) { 14 | case ErrorCode::SystemError: 15 | return "SystemError"; 16 | case ErrorCode::InternalError: 17 | return "InternalError"; 18 | case ErrorCode::InvalidUsage: 19 | return "InvalidUsage"; 20 | case ErrorCode::Timeout: 21 | return "Timeout"; 22 | case ErrorCode::Aborted: 23 | return "Aborted"; 24 | case ErrorCode::ExecutorError: 25 | return "ExecutorError"; 26 | default: 27 | return "UnknownError"; 28 | } 29 | } 30 | 31 | BaseError::BaseError(const std::string& message, int errorCode) 32 | : std::runtime_error(""), message_(message), errorCode_(errorCode) {} 33 | 34 | BaseError::BaseError(int errorCode) : std::runtime_error(""), errorCode_(errorCode) {} 35 | 36 | int BaseError::getErrorCode() const { return errorCode_; } 37 | 38 | const char* BaseError::what() const noexcept { return message_.c_str(); } 39 | 40 | MSCCLPP_API_CPP Error::Error(const std::string& message, ErrorCode errorCode) : BaseError(static_cast(errorCode)) { 41 | message_ = message + " (Mscclpp failure: " + errorToString(errorCode) + ")"; 42 | } 43 | 44 | MSCCLPP_API_CPP ErrorCode Error::getErrorCode() const { return static_cast(errorCode_); } 45 | 46 | MSCCLPP_API_CPP SysError::SysError(const std::string& message, int errorCode) : BaseError(errorCode) { 47 | message_ = message + " (System failure: " + std::strerror(errorCode) + ")"; 48 | } 49 | 50 | MSCCLPP_API_CPP CudaError::CudaError(const std::string& message, int errorCode) : BaseError(errorCode) { 51 | message_ = message + " (Cuda failure: " + cudaGetErrorString(static_cast(errorCode)) + ")"; 52 | } 53 | 54 | MSCCLPP_API_CPP CuError::CuError(const std::string& message, int errorCode) : BaseError(errorCode) { 55 | const char* errStr; 56 | if (cuGetErrorString(static_cast(errorCode), &errStr) != CUDA_SUCCESS) { 57 | errStr = "failed to get error string"; 58 | } 59 | message_ = message + " (Cu failure: " + errStr + ")"; 60 | } 61 | 62 | MSCCLPP_API_CPP IbError::IbError(const std::string& message, int errorCode) : BaseError(errorCode) { 63 | message_ = message + " (Ib failure: " + std::strerror(errorCode) + ")"; 64 | } 65 | 66 | }; // namespace mscclpp 67 | -------------------------------------------------------------------------------- /src/fifo.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include "api.h" 8 | #include "atomic.hpp" 9 | 10 | namespace mscclpp { 11 | 12 | struct Fifo::Impl { 13 | detail::UniqueGpuHostPtr triggers; 14 | detail::UniqueGpuPtr head; 15 | detail::UniqueGpuPtr tailReplica; 16 | const int size; 17 | 18 | // allocated on the host. Only accessed by the host. This is a copy of the 19 | // value pointed to by fifoTailDev and the invariant is that 20 | // *fifoTailDev <= hostTail. 
Meaning that the host's copy of the tail is 21 | // always ahead of the device's copy, and the host updates the device's copy 22 | // only when it is needed. Therefore, hostTail is the "true" tail 23 | // and fifoTailDev is a "stale" tail. See proxy.cc to understand how 24 | // these updates are pushed to the device. 25 | uint64_t hostTail; 26 | 27 | // for transferring fifo tail 28 | CudaStreamWithFlags stream; 29 | 30 | Impl(int size) 31 | : triggers(detail::gpuCallocHostUnique(size)), 32 | head(detail::gpuCallocUnique()), 33 | tailReplica(detail::gpuCallocUnique()), 34 | size(size), 35 | hostTail(0), 36 | stream(cudaStreamNonBlocking) {} 37 | }; 38 | 39 | MSCCLPP_API_CPP Fifo::Fifo(int size) : pimpl(std::make_unique(size)) {} 40 | MSCCLPP_API_CPP Fifo::~Fifo() = default; 41 | 42 | MSCCLPP_API_CPP ProxyTrigger Fifo::poll() { 43 | ProxyTrigger trigger; 44 | ProxyTrigger* ptr = &pimpl->triggers.get()[pimpl->hostTail % pimpl->size]; 45 | // We load fst first; if fst is non-zero then snd is also valid. 46 | trigger.fst = atomicLoad(&(ptr->fst), memoryOrderAcquire); 47 | trigger.snd = ptr->snd; 48 | return trigger; 49 | } 50 | 51 | MSCCLPP_API_CPP void Fifo::pop() { 52 | atomicStore(&(pimpl->triggers.get()[pimpl->hostTail % pimpl->size].fst), uint64_t{0}, memoryOrderRelease); 53 | (pimpl->hostTail)++; 54 | } 55 | 56 | MSCCLPP_API_CPP void Fifo::flushTail(bool sync) { 57 | // Flush the tail to device memory. The proxy triggers this every ProxyFlushPeriod so that the fifo can 58 | // make progress even without an explicit flush request; with sync=true the copy is awaited before returning. 59 | AvoidCudaGraphCaptureGuard cgcGuard; 60 | MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t), 61 | cudaMemcpyHostToDevice, pimpl->stream)); 62 | if (sync) { 63 | MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl->stream)); 64 | } 65 | } 66 | 67 | MSCCLPP_API_CPP int Fifo::size() const { return pimpl->size; } 68 | 69 | MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() { 70 | FifoDeviceHandle deviceHandle; 71 | deviceHandle.triggers = pimpl->triggers.get(); 72 | deviceHandle.head = pimpl->head.get(); 73 | deviceHandle.tailReplica = pimpl->tailReplica.get(); 74 | deviceHandle.size = pimpl->size; 75 | return deviceHandle; 76 | } 77 | 78 | } // namespace mscclpp 79 | -------------------------------------------------------------------------------- /src/include/api.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_API_H_ 5 | #define MSCCLPP_API_H_ 6 | 7 | #define MSCCLPP_API extern "C" __attribute__((visibility("default"))) 8 | #define MSCCLPP_API_CPP __attribute__((visibility("default"))) 9 | 10 | #endif // MSCCLPP_API_H_ 11 | -------------------------------------------------------------------------------- /src/include/atomic.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license.
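// Sketch of the host proxy loop built on the Fifo API in src/fifo.cc above (simplified;
// the real loop lives in proxy.cc and throttles flushTail with ProxyFlushPeriod):
//   while (running) {
//     ProxyTrigger t = fifo.poll();
//     if (t.fst == 0) continue;  // empty slot: nothing pushed yet
//     handleTrigger(t);          // hypothetical handler for the requested transfer
//     fifo.pop();                // clears fst and advances hostTail
//     fifo.flushTail();          // pushed to the device only periodically in practice
//   }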
-------------------------------------------------------------------------------- /src/include/api.h: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_API_H_
#define MSCCLPP_API_H_

#define MSCCLPP_API extern "C" __attribute__((visibility("default")))
#define MSCCLPP_API_CPP __attribute__((visibility("default")))

#endif  // MSCCLPP_API_H_
-------------------------------------------------------------------------------- /src/include/atomic.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_ATOMIC_HPP_
#define MSCCLPP_ATOMIC_HPP_

#if defined(MSCCLPP_USE_CUDA)
#define MSCCLPP_DEVICE_CUDA
#include <mscclpp/atomic_device.hpp>
#undef MSCCLPP_DEVICE_CUDA
#else  // !defined(MSCCLPP_USE_CUDA)
#define MSCCLPP_DEVICE_HIP
#include <mscclpp/atomic_device.hpp>
#undef MSCCLPP_DEVICE_HIP
#endif  // !defined(MSCCLPP_USE_CUDA)

#endif  // MSCCLPP_ATOMIC_HPP_
-------------------------------------------------------------------------------- /src/include/communicator.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_COMMUNICATOR_HPP_
#define MSCCLPP_COMMUNICATOR_HPP_

#include <future>
#include <memory>
#include <mscclpp/core.hpp>
#include <unordered_map>

#include "utils_internal.hpp"

namespace mscclpp {

class BaseRecvItem {
 public:
  virtual ~BaseRecvItem() = default;
  virtual void wait() = 0;
  virtual bool isReady() const = 0;
};

template <typename T>
class RecvItem : public BaseRecvItem {
 public:
  RecvItem(std::shared_future<T> future) : future_(future) {}

  void wait() { future_.wait(); }

  bool isReady() const { return future_.wait_for(std::chrono::seconds(0)) == std::future_status::ready; }

 private:
  std::shared_future<T> future_;
};

struct ConnectionInfo {
  int remoteRank;
  int tag;
};

struct Communicator::Impl {
  std::shared_ptr<Bootstrap> bootstrap_;
  std::shared_ptr<Context> context_;
  std::unordered_map<const Connection*, ConnectionInfo> connectionInfos_;
  std::shared_ptr<BaseRecvItem> lastRecvItem_;

  // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
  // If the RecvItem gets ready, it will be removed at the next call to getLastRecvItem.
  std::unordered_map<std::pair<int, int>, std::shared_ptr<BaseRecvItem>, PairHash> lastRecvItems_;

  Impl(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context);

  // Set the last RecvItem for a {remoteRank, tag} pair.
  // This is used to store the corresponding RecvItem of a future returned by recvMemory() or connect().
  void setLastRecvItem(int remoteRank, int tag, std::shared_ptr<BaseRecvItem> item);

  // Return the last RecvItem that is not ready.
  // If the item is ready, it will be removed from the map and nullptr will be returned.
  std::shared_ptr<BaseRecvItem> getLastRecvItem(int remoteRank, int tag);

  struct Connector;
};

}  // namespace mscclpp

#endif  // MSCCLPP_COMMUNICATOR_HPP_
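RecvItem type-erases a std::shared_future<T> behind BaseRecvItem so the Impl can track pending receives of different payload types in one map. A hedged sketch of how the pieces compose — the helper functions and their names are illustrative, not library API:

```cpp
#include <future>
#include <memory>

#include "communicator.hpp"

// Record a pending future as the "last receive" for {remoteRank, tag}; a later
// operation on the same pair can wait on it through the type-erased
// BaseRecvItem interface without knowing T.
template <typename T>
void trackPending(mscclpp::Communicator::Impl& impl, int remoteRank, int tag,
                  std::shared_future<T> future) {
  impl.setLastRecvItem(remoteRank, tag, std::make_shared<mscclpp::RecvItem<T>>(future));
}

// Before reusing the same {remoteRank, tag}, drain any still-pending item
// (getLastRecvItem returns nullptr once the item is ready).
void drainPending(mscclpp::Communicator::Impl& impl, int remoteRank, int tag) {
  if (auto pending = impl.getLastRecvItem(remoteRank, tag)) pending->wait();
}
```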
-------------------------------------------------------------------------------- /src/include/connection.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_CONNECTION_HPP_
#define MSCCLPP_CONNECTION_HPP_

#include <mscclpp/core.hpp>
#include <thread>

#include "communicator.hpp"
#include "context.hpp"
#include "ib.hpp"
#include "registered_memory.hpp"
#include "socket.h"

namespace mscclpp {

class CudaIpcConnection : public Connection {
  std::shared_ptr<CudaStreamWithFlags> stream_;

 public:
  CudaIpcConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, std::shared_ptr<CudaStreamWithFlags> stream);

  Transport transport() override;

  Transport remoteTransport() override;

  void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
             uint64_t size) override;
  void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;

  void flush(int64_t timeoutUsec) override;
};

class IBConnection : public Connection {
  Transport transport_;
  Transport remoteTransport_;
  IbQp* qp;
  std::unique_ptr<uint64_t> dummyAtomicSource_;  // not used anywhere but IB needs a source
  RegisteredMemory dummyAtomicSourceMem_;
  mscclpp::TransportInfo dstTransportInfo_;

 public:
  IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context);

  Transport transport() override;

  Transport remoteTransport() override;

  void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
             uint64_t size) override;
  void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;

  void flush(int64_t timeoutUsec) override;
};

class EthernetConnection : public Connection {
  std::unique_ptr<Socket> sendSocket_;
  std::unique_ptr<Socket> recvSocket_;
  std::thread threadRecvMessages_;
  volatile uint32_t* abortFlag_;
  const uint64_t sendBufferSize_;
  const uint64_t recvBufferSize_;
  std::vector<char> sendBuffer_;
  std::vector<char> recvBuffer_;

 public:
  EthernetConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, uint64_t sendBufferSize = 256 * 1024 * 1024,
                     uint64_t recvBufferSize = 256 * 1024 * 1024);

  ~EthernetConnection();

  Transport transport() override;

  Transport remoteTransport() override;

  void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
             uint64_t size) override;
  void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;

  void flush(int64_t timeoutUsec) override;

 private:
  void recvMessages();

  void sendMessage();
};

}  // namespace mscclpp

#endif  // MSCCLPP_CONNECTION_HPP_
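All three transports implement the same Connection interface, so callers can stay transport-agnostic. A hedged sketch of a one-sided put followed by a flush — the RegisteredMemory handles are assumed to be already registered and exchanged, and the timeout value is arbitrary:

```cpp
#include <mscclpp/core.hpp>

void putAndFlush(mscclpp::Connection& conn, mscclpp::RegisteredMemory dst,
                 mscclpp::RegisteredMemory src, uint64_t bytes) {
  // Asynchronously copy `bytes` from local src to remote dst; the mechanism is
  // a GPU-IPC copy, an IB RDMA write, or a socket send depending on the
  // concrete Connection subtype.
  conn.write(dst, /*dstOffset=*/0, src, /*srcOffset=*/0, bytes);
  // Block until previously issued writes on this connection are delivered.
  conn.flush(/*timeoutUsec=*/10 * 1000 * 1000);
}
```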
-------------------------------------------------------------------------------- /src/include/context.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_CONTEXT_HPP_
#define MSCCLPP_CONTEXT_HPP_

#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <unordered_map>

#include "ib.hpp"

namespace mscclpp {

struct Context::Impl {
  std::vector<std::shared_ptr<Connection>> connections_;
  std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
  std::shared_ptr<CudaStreamWithFlags> ipcStream_;
  CUmemGenericAllocationHandle mcHandle_;

  Impl();

  IbCtx* getIbContext(Transport ibTransport);
};

}  // namespace mscclpp

#endif  // MSCCLPP_CONTEXT_HPP_
-------------------------------------------------------------------------------- /src/include/endpoint.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_ENDPOINT_HPP_
#define MSCCLPP_ENDPOINT_HPP_

#include <mscclpp/core.hpp>
#include <vector>

#include "ib.hpp"
#include "socket.h"

#define MAX_IF_NAME_SIZE 16

namespace mscclpp {

struct Endpoint::Impl {
  Impl(EndpointConfig config, Context::Impl& contextImpl);
  Impl(const std::vector<char>& serialization);

  Transport transport_;
  uint64_t hostHash_;
  int maxWriteQueueSize_;

  // The following are only used for IB and are undefined for other transports.
  bool ibLocal_;
  IbQp* ibQp_;
  IbQpInfo ibQpInfo_;

  // The following are only used for Ethernet and are undefined for other transports.
  std::unique_ptr<Socket> socket_;
  SocketAddress socketAddress_;
  volatile uint32_t* abortFlag_;
  char netIfName_[MAX_IF_NAME_SIZE + 1];
};

}  // namespace mscclpp

#endif  // MSCCLPP_ENDPOINT_HPP_
-------------------------------------------------------------------------------- /src/include/registered_memory.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_REGISTERED_MEMORY_HPP_
#define MSCCLPP_REGISTERED_MEMORY_HPP_

#include <mscclpp/core.hpp>
#include <mscclpp/gpu.hpp>
#include <vector>

#include "communicator.hpp"
#include "ib.hpp"

namespace mscclpp {

struct TransportInfo {
  Transport transport;

  // TODO: rewrite this using std::variant or something
  bool ibLocal;
  union {
    struct {
      cudaIpcMemHandle_t cudaIpcBaseHandle;
      size_t cudaIpcOffsetFromBase;
    };
    struct {
      const IbMr* ibMr;
      IbMrInfo ibMrInfo;
    };
    struct {
      union {
        char shareableHandle[64];
        struct {
          // These are only defined for multicast (NVLS) capability
          pid_t rootPid;
          int fileDesc;
        };
      };
      size_t offsetFromBase;
    };
  };
};
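The TODO above points at the usual weakness of an untagged union: nothing stops a caller from reading the IB fields of a CUDA-IPC entry. A hedged sketch of what a std::variant-based rewrite could look like — the type names here are illustrative, not the library's, and the surrounding header's includes are assumed:

```cpp
#include <variant>

// One alternative per transport family; `transport` no longer has to act as a
// hand-maintained discriminator.
struct CudaIpcInfo {
  cudaIpcMemHandle_t baseHandle;
  size_t offsetFromBase;
};
struct IbInfo {
  bool isLocal;
  const IbMr* mr;
  IbMrInfo mrInfo;
};
struct NvlsInfo {
  pid_t rootPid;
  int fileDesc;
  size_t offsetFromBase;
};

using TransportDetail = std::variant<CudaIpcInfo, IbInfo, NvlsInfo>;
// std::holds_alternative<IbInfo>(detail) / std::get<IbInfo>(detail) would then
// replace the manual transport checks, and a mismatched access throws instead
// of silently reading garbage.
```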
struct RegisteredMemory::Impl {
  // This is the data pointer returned by RegisteredMemory::data(), which may be different from the original data
  // pointer for deserialized remote memory.
  void* data;
  // This is the original data pointer the RegisteredMemory was created with.
  void* originalDataPtr;
  size_t size;
  // This is the size returned by cuMemGetAddressRange of data
  size_t baseDataSize;
  uint64_t hostHash;
  uint64_t pidHash;
  bool isCuMemMapAlloc;
  TransportFlags transports;
  std::vector<TransportInfo> transportInfos;

  // For sharing memory handle via file descriptor
  int fileDesc = -1;

  Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl);
  /// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote
  /// memory.
  Impl(const std::vector<char>& data);
  ~Impl();

  const TransportInfo& getTransportInfo(Transport transport) const;
};

}  // namespace mscclpp

#endif  // MSCCLPP_REGISTERED_MEMORY_HPP_
-------------------------------------------------------------------------------- /src/include/utils_internal.hpp: --------------------------------------------------------------------------------
// Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
// Modifications Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#ifndef MSCCLPP_UTILS_INTERNAL_HPP_
#define MSCCLPP_UTILS_INTERNAL_HPP_

#include <cstdint>
#include <functional>
#include <string>
#include <utility>

namespace mscclpp {

// PCI Bus ID <-> int64 conversion functions
std::string int64ToBusId(int64_t id);
int64_t busIdToInt64(const std::string busId);

uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
void getRandomData(void* buffer, size_t bytes);

struct netIf {
  char prefix[64];
  int port;
};

int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

template <typename T>
inline void hashCombine(std::size_t& hash, const T& v) {
  std::hash<T> hasher;
  hash ^= hasher(v) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
}

struct PairHash {
 public:
  template <class T1, class T2>
  std::size_t operator()(const std::pair<T1, T2>& x) const {
    std::size_t hash = 0;
    hashCombine(hash, x.first);
    hashCombine(hash, x.second);
    return hash;
  }
};

}  // namespace mscclpp

#endif
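hashCombine folds each value into the running seed with the usual golden-ratio constant (0x9e3779b9), which is what lets PairHash serve as the hasher for pair-keyed maps such as Communicator::Impl::lastRecvItems_ above. A small usage sketch, assuming only what this header declares:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

#include "utils_internal.hpp"

// std::hash has no specialization for std::pair, so a pair-keyed map must
// supply a hasher explicitly; without PairHash this would not compile.
void pairHashDemo() {
  std::unordered_map<std::pair<int, int>, std::string, mscclpp::PairHash> byRankTag;
  byRankTag[{/*remoteRank=*/1, /*tag=*/0}] = "pending";
}
```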
-------------------------------------------------------------------------------- /src/memory_channel.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mscclpp/memory_channel.hpp>

#include "api.h"
#include "debug.h"

namespace mscclpp {

MSCCLPP_API_CPP BaseMemoryChannel::BaseMemoryChannel(std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore)
    : semaphore_(semaphore) {}

MSCCLPP_API_CPP MemoryChannel::MemoryChannel(std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore,
                                             RegisteredMemory dst, void* src, void* packetBuffer)
    : BaseMemoryChannel(semaphore), dst_(dst), src_(src), packetBuffer_(packetBuffer) {
  if (!dst.transports().has(Transport::CudaIpc)) {
    throw Error("MemoryChannel: dst must be registered with CudaIpc", ErrorCode::InvalidUsage);
  }
}

MSCCLPP_API_CPP BaseMemoryChannel::DeviceHandle BaseMemoryChannel::deviceHandle() const {
  return BaseMemoryChannel::DeviceHandle(semaphore_->deviceHandle());
}

MSCCLPP_API_CPP MemoryChannel::DeviceHandle MemoryChannel::deviceHandle() const {
  return MemoryChannel::DeviceHandle(semaphore_->deviceHandle(), dst_.data(), src_, packetBuffer_);
}

}  // namespace mscclpp
-------------------------------------------------------------------------------- /src/numa.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <numa.h>

#include <fstream>
#include <mscclpp/gpu_utils.hpp>

#include "api.h"

// Get the PCI bus ID string of a logical cudaDev index.
static const std::string getBusId(int cudaDev) {
  // On most systems, the PCI bus ID comes back in the 0000:00:00.0
  // format. Still need to allocate proper space in case the PCI domain goes
  // higher.
  char busIdChar[] = "00000000:00:00.0";
  MSCCLPP_CUDATHROW(cudaDeviceGetPCIBusId(busIdChar, sizeof(busIdChar), cudaDev));
  // we need the hex in lower case format
  for (size_t i = 0; i < sizeof(busIdChar); i++) {
    busIdChar[i] = std::tolower(busIdChar[i]);
  }
  return std::string(busIdChar);
}

namespace mscclpp {

MSCCLPP_API_CPP int getDeviceNumaNode(int cudaDev) {
  std::string busId = getBusId(cudaDev);
  std::string file_str = "/sys/bus/pci/devices/" + busId + "/numa_node";
  std::ifstream file(file_str);
  int numaNode;
  if (file.is_open()) {
    if (!(file >> numaNode)) {
      throw Error("Failed to read NUMA node from file: " + file_str, ErrorCode::SystemError);
    }
  } else {
    throw Error("Failed to open file: " + file_str, ErrorCode::SystemError);
  }
  return numaNode;
}

MSCCLPP_API_CPP void numaBind(int node) {
  int totalNumNumaNodes = numa_num_configured_nodes();
  if (node < 0 || node >= totalNumNumaNodes) {
    throw Error(
        "Invalid NUMA node " + std::to_string(node) + ", must be between 0 and " + std::to_string(totalNumNumaNodes),
        ErrorCode::InvalidUsage);
  }
  nodemask_t mask;
  nodemask_zero(&mask);
  nodemask_set_compat(&mask, node);
  numa_bind_compat(&mask);
}

}  // namespace mscclpp
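getDeviceNumaNode() resolves a GPU to its NUMA node through sysfs, and numaBind() then pins the calling thread's memory policy to that node. The typical pairing, as also exercised by numa_tests.cc and fifo_tests.cu below — header paths are assumptions:

```cpp
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>

// Bind the calling thread to the NUMA node closest to the current GPU before
// allocating host-side buffers that the GPU will poll across the bus.
void bindToGpuNumaNode() {
  int cudaDev;
  MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDev));
  mscclpp::numaBind(mscclpp::getDeviceNumaNode(cudaDev));
}
```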
-------------------------------------------------------------------------------- /src/utils.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <signal.h>
#include <unistd.h>

#include <chrono>
#include <iostream>
#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>
#include <string>

// Throw upon SIGALRM.
static void sigalrmTimeoutHandler(int) {
  signal(SIGALRM, SIG_IGN);
  throw mscclpp::Error("Timer timed out", mscclpp::ErrorCode::Timeout);
}

namespace mscclpp {

Timer::Timer(int timeout) { set(timeout); }

Timer::~Timer() {
  if (timeout_ > 0) {
    alarm(0);
    signal(SIGALRM, SIG_DFL);
  }
}

int64_t Timer::elapsed() const {
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - start_).count();
}

void Timer::set(int timeout) {
  timeout_ = timeout;
  if (timeout > 0) {
    signal(SIGALRM, sigalrmTimeoutHandler);
    alarm(timeout);
  }
  start_ = std::chrono::steady_clock::now();
}

void Timer::reset() { set(timeout_); }

void Timer::print(const std::string& name) {
  auto us = elapsed();
  std::stringstream ss;
  ss << name << ": " << us << " us\n";
  std::cout << ss.str();
}

ScopedTimer::ScopedTimer(const std::string& name) : name_(name) {}

ScopedTimer::~ScopedTimer() { print(name_); }

std::string getHostName(int maxlen, const char delim) {
  std::string hostname(maxlen + 1, '\0');
  if (gethostname(const_cast<char*>(hostname.data()), maxlen) != 0) {
    throw Error("gethostname failed", ErrorCode::SystemError);
  }
  int i = 0;
  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1)) i++;
  hostname[i] = '\0';
  return hostname.substr(0, i);
}

}  // namespace mscclpp
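The Timer timeout works by arming a process-wide SIGALRM whose handler throws mscclpp::Error with ErrorCode::Timeout, so it can bound any blocking host-side call; utils_tests.cc below relies on exactly this. A usage sketch — `blockingCall` is a placeholder, and since SIGALRM is process-wide, only one armed Timer should be live at a time:

```cpp
#include <mscclpp/utils.hpp>

void blockingCall();  // placeholder, e.g. a bootstrap handshake

void boundedWait() {
  mscclpp::Timer timer(/*timeout=*/5);  // arms alarm(5); the unit is seconds
  blockingCall();  // throws ErrorCode::Timeout if it exceeds 5 seconds
  // Timer's destructor disarms the alarm and restores the default handler.
}
```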
-------------------------------------------------------------------------------- /test/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

find_package(MPI)

set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads)
if(IBVERBS_FOUND)
  list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/include)

if(MSCCLPP_USE_ROCM)
  file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
  set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
endif()

function(add_test_executable name sources)
  add_executable(${name} ${sources})
  target_link_libraries(${name} ${TEST_LIBS_COMMON} MPI::MPI_CXX)
  if(IBVERBS_FOUND)
    target_compile_definitions(${name} PRIVATE USE_IBVERBS)
  endif()
  target_include_directories(${name} ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
  target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS)
  add_test(NAME ${name} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh ${name} 2)
endfunction()

add_test_executable(allgather_test_cpp allgather_test_cpp.cu)
add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu)
add_test_executable(nvls_test nvls_test.cu)
add_test_executable(executor_test executor_test.cc)

configure_file(run_mpi_test.sh.in run_mpi_test.sh)

include(CTest)
include(FetchContent)
FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip)
option(INSTALL_GTEST OFF)
FetchContent_MakeAvailable(googletest)
include(GoogleTest)

# Unit tests
add_executable(unit_tests)
target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST})
target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(unit)
gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)

# Multi-process unit tests
add_executable(mp_unit_tests)
target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX)
target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(mp_unit)
gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST)

# mscclpp-test
add_subdirectory(mscclpp-test)
-------------------------------------------------------------------------------- /test/deploy/config: --------------------------------------------------------------------------------
Host mscclit-000000
    Port 22345
    IdentityFile /root/mscclpp/sshkey
    StrictHostKeyChecking no
Host mscclit-000001
    Port 22345
    IdentityFile /root/mscclpp/sshkey
    StrictHostKeyChecking no
-------------------------------------------------------------------------------- /test/deploy/deploy.sh: --------------------------------------------------------------------------------
set -e

# get parameter from $1
TEST_NAME=$1

KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
  ROOT_DIR="${ROOT_DIR}/mscclpp"
  SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
fi
DST_DIR="/tmp/mscclpp"
if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
  HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
else
  HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
fi
SSH_OPTION="StrictHostKeyChecking=no"

chmod 400 ${KeyFilePath}
ssh-keygen -t rsa -f sshkey -P ""

while true; do
  set +e
  parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "hostname"
  if [ $? -eq 0 ]; then
    break
  fi
  echo "Waiting for sshd to start..."
  sleep 5
done

set -e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}

# force a pull of the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
  "sudo docker pull ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
  "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
  -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
  --entrypoint /bin/bash ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"
-------------------------------------------------------------------------------- /test/deploy/hostfile: --------------------------------------------------------------------------------
azureuser@mscclit-000000
azureuser@mscclit-000001
-------------------------------------------------------------------------------- /test/deploy/hostfile_ci: --------------------------------------------------------------------------------
azureuser@10.0.0.4
-------------------------------------------------------------------------------- /test/deploy/hostfile_mpi: --------------------------------------------------------------------------------
mscclit-000000
mscclit-000001
-------------------------------------------------------------------------------- /test/deploy/perf_ndmv4.jsonl: --------------------------------------------------------------------------------
{"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":291.52, "busBw":255.08, "size":1073741824, "time":3683.13, "target":"throughput"}
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":244.61, "busBw":229.33, "size":3221225472, "time":13168.31,"target":"throughput"}
{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1112, "busBw":0.0973, "size":8192, "time":73.63, "target":"latency"}
{"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.41, "busBw":243.96, "size":1073741824, "time":7701.98, "target":"throughput"}
{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":1.25, "busBw":2.19, "size":8192, "time":6.51, "target":"latency"}
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.44, "busBw":0.83, "size":8192, "time":18.42, "target":"latency"}
{"name":"allreduce", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":139.08, "busBw":243.40, "size":1073741824, "time":7719.85, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":106.98, "busBw":187.22, "size":16777216, "time":156.81, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":116.24, "busBw":203.42, "size":33554432, "time":288.65, "target":"throughput"}
"ranksPerNode":8, "algBw":116.24, "busBw":203.42, "size":33554432, "time":288.65, "target":"throughput"} 10 | {"name":"allreduce", "kernel":5, "ranks":8, "ranksPerNode":8, "algBw":126.52,"busBw":221.418,"size":50331648, "time":397.79, "target":"throughput"} 11 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"} 12 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"} 13 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"} 14 | {"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":80.84, "busBw":151.58, "size":25165824, "time":311.28, "target":"throughput"} 15 | {"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":97.27, "busBw":182.38, "size":50331648, "time":517.43, "target":"throughput"} 16 | {"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":125.99, "busBw":236.24, "size":3221225472, "time":25565.46,"target":"throughput"} 17 | {"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"} 18 | {"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"} 19 | {"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"} 20 | -------------------------------------------------------------------------------- /test/deploy/perf_ndmv5.jsonl: -------------------------------------------------------------------------------- 1 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"} 2 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"} 3 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"} -------------------------------------------------------------------------------- /test/deploy/pytest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [[ $OMPI_COMM_WORLD_RANK == 0 ]] 5 | then 6 | pytest /root/mscclpp/python/test/test_mscclpp.py -x -v 7 | else 8 | pytest /root/mscclpp/python/test/test_mscclpp.py -x 2>&1 >/dev/null 9 | fi 10 | -------------------------------------------------------------------------------- /test/deploy/setup.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | mkdir -p /root/.ssh 4 | mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys 5 | chown root:root /root/.ssh/authorized_keys 6 | mv /root/mscclpp/test/deploy/config /root/.ssh/config 7 | chown root:root /root/.ssh/config 8 | chmod 400 /root/mscclpp/sshkey 9 | chown root:root /root/mscclpp/sshkey 10 | 11 | nvidia-smi -pm 1 12 | for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do 13 | nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i 14 | done 15 | 16 | if [[ "${CUDA_VERSION}" == *"11."* ]]; then 17 | pip3 install -r /root/mscclpp/python/requirements_cuda11.txt 18 | else 19 | pip3 install -r 
fi

cd /root/mscclpp && pip3 install .

mkdir -p /var/run/sshd
/usr/sbin/sshd -p 22345
-------------------------------------------------------------------------------- /test/mp_unit/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

target_sources(mp_unit_tests PRIVATE
  mp_unit_tests.cc
  bootstrap_tests.cc
  ib_tests.cu
  communicator_tests.cu
  port_channel_tests.cu
  memory_channel_tests.cu
  executor_tests.cc
)
-------------------------------------------------------------------------------- /test/mp_unit/executor_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <unistd.h>

#include <filesystem>
#include <mscclpp/env.hpp>
#include <mscclpp/npkit/npkit.hpp>

#include "mp_unit_tests.hpp"

namespace {
std::string getExecutablePath() {
  char result[PATH_MAX];
  ssize_t count = readlink("/proc/self/exe", result, PATH_MAX);
  if (count == -1) {
    throw std::runtime_error("Failed to get executable path");
  }
  return std::string(result, count);
}
}  // namespace

void ExecutorTest::SetUp() {
  MultiProcessTest::SetUp();

  MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank)));
  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
  mscclpp::UniqueId id;
  bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
  if (gEnv->rank == 0) id = bootstrap->createUniqueId();
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
  bootstrap->initialize(id);
  std::shared_ptr<mscclpp::Communicator> communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
  executor = std::make_shared<mscclpp::Executor>(communicator);
  npkitDumpDir = mscclpp::env()->npkitDumpDir;
  if (npkitDumpDir != "") {
    NpKit::Init(gEnv->rank);
  }
}

void ExecutorTest::TearDown() {
  if (npkitDumpDir != "") {
    NpKit::Dump(npkitDumpDir);
    NpKit::Shutdown();
  }
  executor.reset();
  MultiProcessTest::TearDown();
}

TEST_F(ExecutorTest, TwoNodesAllreduce) {
  if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
    GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
    return;
  }
  std::string executablePath = getExecutablePath();
  std::filesystem::path path = executablePath;
  std::filesystem::path executionFilesPath =
      path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
  mscclpp::ExecutionPlan plan(executionFilesPath.string());
  const int bufferSize = 1024 * 1024;
  std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
  executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16,
                    plan, stream);
  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
-------------------------------------------------------------------------------- /test/mscclpp-test/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
FetchContent_MakeAvailable(json)

function(add_mscclpp_test_executable name sources)
  if(MSCCLPP_USE_ROCM)
    set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
  endif()
  add_executable(${name} ${sources} common.cc)
  target_link_libraries(${name} ${TEST_LIBS_COMMON} MPI::MPI_CXX nlohmann_json::nlohmann_json)
  target_include_directories(${name} ${TEST_INC_COMMON})
endfunction()

add_mscclpp_test_executable(sendrecv_test_perf sendrecv_test.cu)
add_mscclpp_test_executable(allgather_test_perf allgather_test.cu)
add_mscclpp_test_executable(allreduce_test_perf allreduce_test.cu)
add_mscclpp_test_executable(alltoall_test_perf alltoall_test.cu)
-------------------------------------------------------------------------------- /test/mscclpp-test/check_perf_result.py: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging


def load_perf_file(perf_file: str) -> dict:
    res = {}
    with open(perf_file, "r") as f:
        for line in f:
            data = json.loads(line)
            res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])] = {
                "algBw": data["algBw"],
                "busBw": data["busBw"],
                "time": data["time"],
            }
            if "target" in data:
                res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[
                    "target"
                ]
    return res


def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float, bandwidth_threshold: float) -> bool:
    res = True
    threshold = None
    for key, value in perf_result.items():
        if key not in baseline:
            continue
        if baseline[key]["target"] == "latency":
            threshold = time_threshold
        else:
            threshold = bandwidth_threshold
        if abs(value["time"] - baseline[key]["time"]) / baseline[key]["time"] > threshold:
            logging.error(
                "%s: time %f does not match baseline %f within threshold %f",
                str(key),
                value["time"],
                baseline[key]["time"],
                threshold,
            )
            res = False
    return res


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--perf-file", type=str, required=True)
    parser.add_argument("--baseline-file", type=str, required=True)
    # We use different thresholds for latency and bandwidth: latency runs use
    # small data sizes, which introduces more variance, while bandwidth runs are more stable.
    parser.add_argument("--time-threshold", type=float, default=0.15)
    parser.add_argument("--bandwidth-threshold", type=float, default=0.05)
    args = parser.parse_args()

    perf_result = load_perf_file(args.perf_file)
    baseline = load_perf_file(args.baseline_file)
    if check_perf_result(perf_result, baseline, args.time_threshold, args.bandwidth_threshold):
        print("PASS")
    else:
        print("FAIL")
        exit(1)
-------------------------------------------------------------------------------- /test/run_mpi_test.sh.in: --------------------------------------------------------------------------------
#!/bin/bash

if [ $# -lt 2 ]; then
  echo "Usage: $0 <test_name> <np> [test_args]"
  exit 1
fi
test_name=$1
np=$2
shift 2  # Pass the rest of the arguments to the test

mpirun --bind-to numa --tag-output -x MSCCLPP_DEBUG=INFO -x LD_LIBRARY_PATH=@CMAKE_BINARY_DIR@:$LD_LIBRARY_PATH -np $np @CMAKE_CURRENT_BINARY_DIR@/$test_name $@
-------------------------------------------------------------------------------- /test/unit/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

target_sources(unit_tests PRIVATE
  core_tests.cc
  cuda_utils_tests.cc
  errors_tests.cc
  fifo_tests.cu
  numa_tests.cc
  socket_tests.cc
  utils_tests.cc
  utils_internal_tests.cc
  compile_tests.cu
)
-------------------------------------------------------------------------------- /test/unit/compile_tests.cu: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <cassert>

// Force-enable assertions for this translation unit before pulling in the
// device-side assert header.
#undef NDEBUG
#ifndef DEBUG_BUILD
#define DEBUG_BUILD
#endif  // DEBUG_BUILD
#include <mscclpp/assert_device.hpp>

#include <gtest/gtest.h>

TEST(CompileTest, Assert) { assert(true); }
-------------------------------------------------------------------------------- /test/unit/core_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>
#include <mscclpp/core.hpp>

#include <memory>

class LocalCommunicatorTest : public ::testing::Test {
 protected:
  void SetUp() override {
    bootstrap = std::make_shared<mscclpp::TcpBootstrap>(0, 1);
    bootstrap->initialize(bootstrap->createUniqueId());
    comm = std::make_shared<mscclpp::Communicator>(bootstrap);
  }

  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
  std::shared_ptr<mscclpp::Communicator> comm;
};

TEST_F(LocalCommunicatorTest, RegisterMemory) {
  int dummy[42];
  auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
  EXPECT_EQ(memory.data(), &dummy);
  EXPECT_EQ(memory.size(), sizeof(dummy));
  EXPECT_EQ(memory.transports(), mscclpp::NoTransports);
}

TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
  int dummy[42];
  auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
  comm->sendMemory(memory, 0, 0);
  auto memoryFuture = comm->recvMemory(0, 0);
  auto sameMemory = memoryFuture.get();
  EXPECT_EQ(sameMemory.data(), memory.data());
  EXPECT_EQ(sameMemory.size(), memory.size());
  EXPECT_EQ(sameMemory.transports(), memory.transports());
}
-------------------------------------------------------------------------------- /test/unit/cuda_utils_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mscclpp/gpu_utils.hpp>

#include <gtest/gtest.h>

TEST(CudaUtilsTest, AllocShared) {
  auto p1 = mscclpp::detail::gpuCallocShared<int>();
  auto p2 = mscclpp::detail::gpuCallocShared<int>(5);
}

TEST(CudaUtilsTest, AllocUnique) {
  auto p1 = mscclpp::detail::gpuCallocUnique<int>();
  auto p2 = mscclpp::detail::gpuCallocUnique<int>(5);
}

TEST(CudaUtilsTest, MakeSharedHost) {
  auto p1 = mscclpp::detail::gpuCallocHostShared<int>();
  auto p2 = mscclpp::detail::gpuCallocHostShared<int>(5);
}

TEST(CudaUtilsTest, MakeUniqueHost) {
  auto p1 = mscclpp::detail::gpuCallocHostUnique<int>();
  auto p2 = mscclpp::detail::gpuCallocHostUnique<int>(5);
}

TEST(CudaUtilsTest, Memcpy) {
  const int nElem = 1024;
  std::vector<int> hostBuff(nElem);
  for (int i = 0; i < nElem; ++i) {
    hostBuff[i] = i + 1;
  }
  std::vector<int> hostBuffTmp(nElem, 0);
  auto devBuff = mscclpp::detail::gpuCallocShared<int>(nElem);
  mscclpp::gpuMemcpy(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice);
  mscclpp::gpuMemcpy(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost);

  for (int i = 0; i < nElem; ++i) {
    EXPECT_EQ(hostBuff[i], hostBuffTmp[i]);
  }
}
-------------------------------------------------------------------------------- /test/unit/errors_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mscclpp/errors.hpp>

#include <gtest/gtest.h>

TEST(ErrorsTest, SystemError) {
  mscclpp::Error error("test", mscclpp::ErrorCode::SystemError);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: SystemError)"));
}

TEST(ErrorsTest, InternalError) {
  mscclpp::Error error("test", mscclpp::ErrorCode::InternalError);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: InternalError)"));
}

TEST(ErrorsTest, InvalidUsage) {
  mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: InvalidUsage)"));
}

TEST(ErrorsTest, Timeout) {
  mscclpp::Error error("test", mscclpp::ErrorCode::Timeout);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: Timeout)"));
}
-------------------------------------------------------------------------------- /test/unit/fifo_tests.cu: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>

#include <iostream>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>

#define ITER 10000  // should be larger than the FIFO size for proper testing

__constant__ mscclpp::FifoDeviceHandle gFifoTestFifoDeviceHandle;
__global__ void kernelFifoTest() {
  if (threadIdx.x + blockIdx.x * blockDim.x != 0) return;

  mscclpp::FifoDeviceHandle& fifo = gFifoTestFifoDeviceHandle;
  mscclpp::ProxyTrigger trigger;
  for (uint64_t i = 1; i < ITER + 1; ++i) {
    trigger.fst = i;
    trigger.snd = i;
    uint64_t curFifoHead = fifo.push(trigger);
    if (i % fifo.size == 0) {
      fifo.sync(curFifoHead);
    }
  }
}

TEST(FifoTest, Fifo) {
  int cudaNum;
  MSCCLPP_CUDATHROW(cudaGetDevice(&cudaNum));
  int numaNode = mscclpp::getDeviceNumaNode(cudaNum);
  mscclpp::numaBind(numaNode);

  mscclpp::Fifo hostFifo;
  if (hostFifo.size() >= ITER) {
    FAIL() << "ITER is too small for proper testing.";
  }

  mscclpp::FifoDeviceHandle devFifo = hostFifo.deviceHandle();
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gFifoTestFifoDeviceHandle, &devFifo, sizeof(devFifo)));

  kernelFifoTest<<<1, 1>>>();
  MSCCLPP_CUDATHROW(cudaGetLastError());

  mscclpp::ProxyTrigger trigger;
  trigger.fst = 0;
  trigger.snd = 0;

  uint64_t spin = 0;
  uint64_t flushCnt = 0;
  mscclpp::Timer timer(3);
  for (uint64_t i = 0; i < ITER; ++i) {
    trigger = hostFifo.poll();
    while (trigger.fst == 0 || trigger.snd == 0) {
      trigger = hostFifo.poll();

      if (spin++ > 1000000) {
        FAIL() << "Polling is stuck.";
      }
    }
    // see `src/proxy.cc` for the reason of this line
    trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
    ASSERT_TRUE(trigger.fst == (i + 1));
    ASSERT_TRUE(trigger.snd == (i + 1));
    hostFifo.pop();
    if ((++flushCnt % hostFifo.size()) == 0) {
      hostFifo.flushTail();
    }
    spin = 0;
  }
  hostFifo.flushTail(true);

  std::stringstream ss;
  ss << "FifoTest.Fifo: " << (float)timer.elapsed() / ITER << " us/iter\n";
  std::cout << ss.str();
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
}
-------------------------------------------------------------------------------- /test/unit/numa_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>

#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>

TEST(NumaTest, Basic) {
  int num;
  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num));
  if (num == 0) {
    return;
  }
  for (int i = 0; i < num; i++) {
    int numaNode = mscclpp::getDeviceNumaNode(i);
    EXPECT_GE(numaNode, 0);
    mscclpp::numaBind(numaNode);
  }
}
-------------------------------------------------------------------------------- /test/unit/socket_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>

#include <mscclpp/utils.hpp>
#include <thread>

#include "socket.h"

TEST(Socket, ListenAndConnect) {
  mscclpp::Timer timeout(3);

  std::string ipPortPair = "127.0.0.1:51512";
  mscclpp::SocketAddress listenAddr;

  ASSERT_NO_THROW(mscclpp::SocketGetAddrFromString(&listenAddr, ipPortPair.c_str()));

  mscclpp::Socket listenSock(&listenAddr);
  listenSock.bindAndListen();

  std::thread clientThread([&listenAddr]() {
    mscclpp::Socket sock(&listenAddr);
    sock.connect();
  });

  mscclpp::Socket sock;
  sock.accept(&listenSock);

  clientThread.join();
}
-------------------------------------------------------------------------------- /test/unit/utils_internal_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <gtest/gtest.h>

#include <thread>

#include "utils_internal.hpp"

TEST(UtilsInternalTest, getHostHash) {
  uint64_t hash1 = mscclpp::getHostHash();
  uint64_t hash2;

  std::thread th([&hash2]() { hash2 = mscclpp::getHostHash(); });

  ASSERT_TRUE(th.joinable());
  th.join();

  EXPECT_EQ(hash1, hash2);
}
-------------------------------------------------------------------------------- /test/unit/utils_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>
#include <unistd.h>

#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <thread>

TEST(UtilsTest, Timer) {
  mscclpp::Timer timer;
  sleep(1);
  int64_t elapsed = timer.elapsed();
  EXPECT_GE(elapsed, 1000000);

  timer.reset();
  sleep(1);
  elapsed = timer.elapsed();
  EXPECT_GE(elapsed, 1000000);
  EXPECT_LT(elapsed, 1100000);
}

TEST(UtilsTest, TimerTimeout) {
  mscclpp::Timer timer(1);
  ASSERT_THROW(sleep(2), mscclpp::Error);
}

TEST(UtilsTest, TimerTimeoutReset) {
  mscclpp::Timer timer(3);
  sleep(2);
  // Resetting the timer should prevent the timeout.
  timer.reset();
  ASSERT_NO_THROW(sleep(2));

  // Elapsed time should be slightly larger than 2 seconds.
  EXPECT_GT(timer.elapsed(), 2000000);
  EXPECT_LT(timer.elapsed(), 2100000);
}

TEST(UtilsTest, ScopedTimer) {
  mscclpp::ScopedTimer timerA("UtilsTest.ScopedTimer.A");
  mscclpp::ScopedTimer timerB("UtilsTest.ScopedTimer.B");
  sleep(1);
  int64_t elapsedA = timerA.elapsed();
  int64_t elapsedB = timerB.elapsed();
  EXPECT_GE(elapsedA, 1000000);
  EXPECT_GE(elapsedB, 1000000);
}

TEST(UtilsTest, getHostName) {
  std::string hostname1 = mscclpp::getHostName(1024, '.');
  EXPECT_FALSE(hostname1.empty());
  EXPECT_LE(hostname1.size(), 1024);

  EXPECT_EQ(mscclpp::getHostName(1024, hostname1[0]).size(), 0);

  std::string hostname2;

  std::thread th([&hostname2]() { hostname2 = mscclpp::getHostName(1024, '.'); });

  ASSERT_TRUE(th.joinable());
  th.join();

  EXPECT_EQ(hostname1, hostname2);
}
-------------------------------------------------------------------------------- /tools/npkit/build_and_run_npkit.sh: --------------------------------------------------------------------------------
set -e

MSCCLPP_SRC_DIR="/mnt/mscclpp"
NPKIT_RUN_DIR="/mnt/npkit_run"
MPI_HOME="/usr/local/mpi"
HOSTFILE="hostfile"
LEADER_IP_PORT="10.6.0.4:50000"

cd ${MSCCLPP_SRC_DIR}
make clean
MPI_HOME=${MPI_HOME} make -j NPKIT=1

parallel-ssh -h ${HOSTFILE} "rm -rf ${NPKIT_RUN_DIR}"
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}"
parallel-scp -r -h ${HOSTFILE} ${MSCCLPP_SRC_DIR} ${NPKIT_RUN_DIR}
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}/npkit_dump"
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}/npkit_trace"

# --bind-to numa is required because hardware timers from different cores (or core groups) can be unsynchronized.
mpirun --allow-run-as-root -hostfile ${HOSTFILE} -map-by ppr:8:node --bind-to numa -x LD_PRELOAD=${NPKIT_RUN_DIR}/mscclpp/build/lib/libmscclpp.so -x MSCCLPP_DEBUG=WARN -x MSCCLPP_NPKIT_DUMP_DIR=${NPKIT_RUN_DIR}/npkit_dump ${NPKIT_RUN_DIR}/mscclpp/build/bin/tests/allgather_test -ip_port ${LEADER_IP_PORT} -kernel 0

parallel-ssh -h ${HOSTFILE} "cd ${NPKIT_RUN_DIR}/mscclpp/tools/npkit && python npkit_trace_generator.py --npkit_dump_dir ${NPKIT_RUN_DIR}/npkit_dump --npkit_event_header_path ${NPKIT_RUN_DIR}/mscclpp/src/include/npkit/npkit_event.h --output_dir ${NPKIT_RUN_DIR}/npkit_trace"
--------------------------------------------------------------------------------