├── .azure-pipelines ├── integration-test-rocm.yml ├── integration-test.yml ├── multi-nodes-test.yml ├── nccl-api-test.yaml ├── templates │ ├── integration-test.yaml │ ├── nccl-test.yaml │ ├── ut-npkit.yaml │ └── ut.yaml └── ut.yml ├── .clang-format ├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-improvement.md │ ├── feature_request.md │ └── perf_improvement.md └── workflows │ ├── codeql-analysis.yml │ ├── doc-build.yaml │ ├── gh-pages.yml │ ├── integration-test-backup.yml │ ├── lint.yml │ ├── mscclpp-lang.yml │ ├── update-version.yml │ └── ut-backup.yml ├── .gitignore ├── .readthedocs.yaml ├── CITATION.cff ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── VERSION ├── apps └── nccl │ ├── CMakeLists.txt │ ├── include │ └── nccl.h │ ├── src │ ├── allgather.hpp │ ├── allreduce.hpp │ ├── broadcast.hpp │ ├── common.hpp │ └── nccl.cu │ └── test │ ├── CMakeLists.txt │ └── nccl_api_test.cc ├── cmake ├── AddFormatTargets.cmake ├── CheckAmdGpu.cmake ├── CheckNvidiaGpu.cmake ├── FindGDRCopy.cmake ├── FindIBVerbs.cmake ├── FindNUMA.cmake ├── check_amd_gpu.hip └── check_nvidia_gpu.cu ├── docker ├── base-dev-x.dockerfile ├── base-x-rocm.dockerfile ├── base-x.dockerfile └── build.sh ├── docs ├── .gitignore ├── Doxyfile ├── Makefile ├── README.md ├── api │ └── index.rst ├── conf.py ├── design │ ├── design.md │ ├── mscclpp-dsl.md │ └── nccl-over-mscclpp.md ├── figs │ ├── abstractions.png │ ├── mscclpp_vs_nccl_comparison_num_nodes_1.jpeg │ ├── mscclpp_vs_nccl_comparison_num_nodes_2.jpeg │ └── size_boundary_diagram.png ├── getting-started │ ├── quickstart.md │ └── tutorials │ │ ├── customized-proxy-service.md │ │ ├── index.rst │ │ ├── initialization.md │ │ ├── memory-channel.md │ │ ├── packet-api.md │ │ ├── port-channel.md │ │ └── python-api.md ├── index.rst ├── make.bat ├── performance │ └── performance-ndmv4.md └── requirements.txt ├── include ├── CMakeLists.txt └── mscclpp │ ├── assert_device.hpp │ ├── atomic_device.hpp │ ├── concurrency_device.hpp │ ├── copy_device.hpp │ ├── core.hpp │ ├── device.hpp │ ├── env.hpp │ ├── errors.hpp │ ├── executor.hpp │ ├── fifo.hpp │ ├── fifo_device.hpp │ ├── gpu.hpp │ ├── gpu_data_types.hpp │ ├── gpu_utils.hpp │ ├── memory_channel.hpp │ ├── memory_channel_device.hpp │ ├── npkit │ ├── npkit.hpp │ ├── npkit_event.hpp │ └── npkit_struct.hpp │ ├── numa.hpp │ ├── nvls.hpp │ ├── nvls_device.hpp │ ├── packet_device.hpp │ ├── poll_device.hpp │ ├── port_channel.hpp │ ├── port_channel_device.hpp │ ├── proxy.hpp │ ├── semaphore.hpp │ ├── semaphore_device.hpp │ └── utils.hpp ├── pyproject.toml ├── python ├── CMakeLists.txt ├── examples │ ├── allgather_allpairs_multinodes_packets.py │ ├── allgather_barrier.py │ ├── allreduce_allpairs.py │ ├── allreduce_allpairs_get.py │ ├── allreduce_allpairs_packet.py │ ├── allreduce_nvls.py │ ├── allreduce_ring.py │ ├── send_recv_packet.py │ └── send_recv_proxy.py ├── mscclpp │ ├── CMakeLists.txt │ ├── __init__.py │ ├── comm.py │ ├── core_py.cpp │ ├── env_py.cpp │ ├── error_py.cpp │ ├── executor_py.cpp │ ├── fifo_py.cpp │ ├── gpu_utils_py.cpp │ ├── language │ │ ├── __init__.py │ │ ├── buffer.py │ │ ├── chunk.py │ │ ├── collective_checker.py │ │ ├── collectives.py │ │ ├── dag │ │ │ ├── __init__.py │ │ │ ├── instruction_dag.py │ │ │ ├── lower.py │ │ │ └── optimizer.py │ │ ├── ir.py │ │ ├── program.py │ │ ├── rank.py │ │ ├── topo_sort.py │ │ ├── types.py │ │ └── utils.py │ ├── memory_channel_py.cpp │ ├── npkit_py.cpp │ ├── 
numa_py.cpp │ ├── nvls_py.cpp │ ├── port_channel_py.cpp │ ├── semaphore_py.cpp │ ├── utils.py │ └── utils_py.cpp ├── mscclpp_benchmark │ ├── __init__.py │ ├── allreduce.cu │ ├── allreduce_bench.py │ ├── mscclpp_op.py │ └── nccl_op.py ├── requirements_cuda11.txt ├── requirements_cuda12.txt ├── requirements_rocm6.txt └── test │ ├── CMakeLists.txt │ ├── __init__.py │ ├── _cpp │ ├── __init__.py │ └── proxy_test.cpp │ ├── configs │ └── mscclpp_lang_test_config.json │ ├── d2d_semaphore_test.cu │ ├── executor_test.py │ ├── executor_test_verifier.cu │ ├── fifo_test.cu │ ├── h2d_semaphore_test.cu │ ├── memory_channel_test.cu │ ├── mscclpp_mpi.py │ ├── nvls_test.cu │ ├── port_channel_test.cu │ ├── proxy_test.cu │ ├── test_generate_mscclpp_lang_result.py │ └── test_mscclpp.py ├── src ├── .gitignore ├── CMakeLists.txt ├── bootstrap │ ├── bootstrap.cc │ └── socket.cc ├── c_style_remnants.cc ├── communicator.cc ├── connection.cc ├── context.cc ├── core.cc ├── debug.cc ├── endpoint.cc ├── env.cpp ├── errors.cc ├── executor │ ├── execution_kernel.cu │ ├── execution_plan.cc │ └── executor.cc ├── fifo.cc ├── gpu_utils.cc ├── ib.cc ├── include │ ├── api.h │ ├── atomic.hpp │ ├── communicator.hpp │ ├── connection.hpp │ ├── context.hpp │ ├── debug.h │ ├── endpoint.hpp │ ├── execution_common.hpp │ ├── execution_kernel.hpp │ ├── execution_plan.hpp │ ├── ib.hpp │ ├── ibverbs_wrapper.hpp │ ├── registered_memory.hpp │ ├── socket.h │ └── utils_internal.hpp ├── memory_channel.cc ├── npkit │ └── npkit.cc ├── numa.cc ├── nvls.cc ├── port_channel.cc ├── proxy.cc ├── registered_memory.cc ├── semaphore.cc ├── utils.cc └── utils_internal.cc ├── test ├── CMakeLists.txt ├── allgather_test_cpp.cu ├── allgather_test_host_offloading.cu ├── deploy │ ├── config │ ├── deploy.sh │ ├── hostfile │ ├── hostfile_ci │ ├── hostfile_mpi │ ├── perf_ndmv4.jsonl │ ├── perf_ndmv5.jsonl │ ├── pytest.sh │ ├── run_tests.sh │ └── setup.sh ├── execution-files │ ├── allreduce.json │ ├── allreduce_nvls.json │ ├── allreduce_packet.json │ ├── sendrecv.json │ └── sendrecv_packet.json ├── executor_test.cc ├── mp_unit │ ├── CMakeLists.txt │ ├── bootstrap_tests.cc │ ├── communicator_tests.cu │ ├── executor_tests.cc │ ├── ib_tests.cu │ ├── memory_channel_tests.cu │ ├── mp_unit_tests.cc │ ├── mp_unit_tests.hpp │ └── port_channel_tests.cu ├── mscclpp-test │ ├── CMakeLists.txt │ ├── allgather_test.cu │ ├── allreduce_test.cu │ ├── alltoall_test.cu │ ├── check_perf_result.py │ ├── common.cc │ ├── common.hpp │ └── sendrecv_test.cu ├── nvls_test.cu ├── run_mpi_test.sh.in └── unit │ ├── CMakeLists.txt │ ├── compile_tests.cu │ ├── core_tests.cc │ ├── cuda_utils_tests.cc │ ├── errors_tests.cc │ ├── fifo_tests.cu │ ├── numa_tests.cc │ ├── socket_tests.cc │ ├── utils_internal_tests.cc │ └── utils_tests.cc └── tools └── npkit ├── build_and_run_npkit.sh └── npkit_trace_generator.py /.azure-pipelines/integration-test.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | - release/* 4 | 5 | pr: 6 | branches: 7 | include: 8 | - main 9 | - release/* 10 | drafts: false 11 | 12 | jobs: 13 | - job: IntegrationTestA100 14 | displayName: Integration test A100 15 | strategy: 16 | matrix: 17 | cuda11: 18 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 19 | cuda12: 20 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 21 | 22 | pool: 23 | name: msccl-ci 24 | container: 25 | image: $(containerImage) 26 | 27 | steps: 28 | - template: templates/integration-test.yaml 29 | parameters: 30 
| subscription: mscclpp-ci 31 | vmssName: mscclpp-ci 32 | sshKeySecureFile: mscclpp.pem 33 | 34 | - job: IntegrationTestH100 35 | displayName: Integration test H100 36 | strategy: 37 | matrix: 38 | cuda12: 39 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 40 | 41 | pool: 42 | name: msccl-ci-h100 43 | container: 44 | image: $(containerImage) 45 | 46 | steps: 47 | - template: templates/integration-test.yaml 48 | parameters: 49 | subscription: mscclpp-ci-h100 50 | vmssName: mscclpp-h100-ci 51 | sshKeySecureFile: mscclpp.pem 52 | perfBaselineFile: test/deploy/perf_ndmv5.jsonl 53 | -------------------------------------------------------------------------------- /.azure-pipelines/nccl-api-test.yaml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | - release/* 4 | 5 | pr: 6 | branches: 7 | include: 8 | - main 9 | - release/* 10 | drafts: false 11 | 12 | jobs: 13 | - job: NcclTestA100 14 | displayName: Run MSCCLPP over NCCL Test (A100) 15 | pool: 16 | name: msccl-ci 17 | 18 | strategy: 19 | matrix: 20 | cuda11: 21 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 22 | cuda12: 23 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 24 | 25 | container: 26 | image: $(containerImage) 27 | 28 | steps: 29 | - template: templates/nccl-test.yaml 30 | parameters: 31 | subscription: mscclpp-ci 32 | vmssName: mscclpp-ci 33 | sshKeySecureFile: mscclpp.pem 34 | nvccGencode: "-gencode=arch=compute_80,code=sm_80" 35 | 36 | - job: NcclTestH100 37 | displayName: Run MSCCLPP over NCCL Test (H100) 38 | pool: 39 | name: msccl-ci-h100 40 | 41 | strategy: 42 | matrix: 43 | cuda12: 44 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 45 | 46 | container: 47 | image: $(containerImage) 48 | 49 | steps: 50 | - template: templates/nccl-test.yaml 51 | parameters: 52 | subscription: mscclpp-ci-h100 53 | vmssName: mscclpp-h100-ci 54 | sshKeySecureFile: mscclpp.pem 55 | nvccGencode: "-gencode=arch=compute_90,code=sm_90" -------------------------------------------------------------------------------- /.azure-pipelines/ut.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | - release/* 4 | 5 | pr: 6 | branches: 7 | include: 8 | - main 9 | - release/* 10 | drafts: false 11 | 12 | jobs: 13 | - job: UnitTestA100 14 | timeoutInMinutes: 40 15 | pool: 16 | name: msccl-ci 17 | strategy: 18 | matrix: 19 | cuda11: 20 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 21 | cuda12: 22 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 23 | 24 | container: 25 | image: $(containerImage) 26 | 27 | steps: 28 | - template: templates/ut.yaml 29 | parameters: 30 | subscription: mscclpp-ci 31 | vmssName: mscclpp-ci 32 | sshKeySecureFile: mscclpp.pem 33 | 34 | - job: UnitTestWithNpKitA100 35 | timeoutInMinutes: 30 36 | pool: 37 | name: msccl-ci 38 | strategy: 39 | matrix: 40 | cuda11: 41 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 42 | cuda12: 43 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 44 | 45 | container: 46 | image: $(containerImage) 47 | 48 | steps: 49 | - template: templates/ut-npkit.yaml 50 | parameters: 51 | subscription: mscclpp-ci 52 | vmssName: mscclpp-ci 53 | sshKeySecureFile: mscclpp.pem 54 | 55 | - job: UnitTestH100 56 | timeoutInMinutes: 40 57 | pool: 58 | name: msccl-ci-h100 59 | strategy: 60 | matrix: 61 | cuda12: 62 | containerImage: 
ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 63 | 64 | container: 65 | image: $(containerImage) 66 | 67 | steps: 68 | - template: templates/ut.yaml 69 | parameters: 70 | subscription: mscclpp-ci-h100 71 | vmssName: mscclpp-h100-ci 72 | sshKeySecureFile: mscclpp.pem 73 | 74 | - job: UnitTestWithNpKitH100 75 | timeoutInMinutes: 30 76 | pool: 77 | name: msccl-ci-h100 78 | strategy: 79 | matrix: 80 | cuda12: 81 | containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 82 | 83 | container: 84 | image: $(containerImage) 85 | 86 | steps: 87 | - template: templates/ut-npkit.yaml 88 | parameters: 89 | subscription: mscclpp-ci-h100 90 | vmssName: mscclpp-h100-ci 91 | sshKeySecureFile: mscclpp.pem 92 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 120 3 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | ARG USERNAME=mscclpp 4 | ARG USER_UID=1000 5 | ARG USER_GID=$USER_UID 6 | 7 | # Create the user 8 | RUN groupadd --gid $USER_GID $USERNAME && \ 9 | useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \ 10 | echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME && \ 11 | chmod 0440 /etc/sudoers.d/$USERNAME 12 | 13 | USER $USERNAME 14 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MSCCL++ Dev Container", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | "args": { 6 | "BASE_IMAGE": "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8" 7 | } 8 | }, 9 | "remoteUser": "mscclpp", 10 | "customizations": { 11 | "vscode": { 12 | "extensions": [ 13 | // Python 14 | "ms-python.python", 15 | "ms-python.vscode-pylance", 16 | // C++ 17 | "ms-vscode.cpptools", 18 | "ms-vscode.cpptools-extension-pack", 19 | "ms-vscode.cmake-tools" 20 | ] 21 | } 22 | }, 23 | "privileged": true, 24 | "runArgs": [ 25 | "--net=host", 26 | "--ipc=host", 27 | "--gpus=all", 28 | "--ulimit=memlock=-1:-1" 29 | ], 30 | "workspaceFolder": "/home/mscclpp/mscclpp", 31 | "workspaceMount": "source=${localWorkspaceFolder},target=/home/mscclpp/mscclpp,type=bind,consistency=cached" 32 | } 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us fix 4 | title: "[Bug]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation improvement 3 | about: Enhance or fix documentation 4 | title: "[Doc]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 
title: "[Feature]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/perf_improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Performance improvement 3 | about: Discuss on performance issues 4 | title: "[Perf]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - release/* 8 | pull_request: 9 | branches: 10 | - main 11 | - release/* 12 | schedule: 13 | - cron: "30 1 * * 1" 14 | 15 | jobs: 16 | analyze-cuda: 17 | name: Analyze (CUDA) 18 | runs-on: 'ubuntu-latest' 19 | container: 20 | image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} 21 | 22 | permissions: 23 | actions: read 24 | contents: read 25 | security-events: write 26 | 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | language: [ 'cpp', 'python' ] 31 | version: [ 'cuda11.8', 'cuda12.8' ] 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v4 36 | 37 | - name: Check disk space 38 | run: | 39 | df -h 40 | 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v2 43 | with: 44 | languages: ${{ matrix.language }} 45 | 46 | - name: Dubious ownership exception 47 | run: | 48 | git config --global --add safe.directory /__w/mscclpp/mscclpp 49 | 50 | - name: Build 51 | run: | 52 | rm -rf build && mkdir build && cd build 53 | cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. 54 | make -j 55 | 56 | - name: Perform CodeQL Analysis 57 | uses: github/codeql-action/analyze@v2 58 | with: 59 | category: "/language:${{matrix.language}}/version:${{matrix.version}}" 60 | 61 | analyze-rocm: 62 | name: Analyze (ROCm) 63 | runs-on: 'ubuntu-latest' 64 | container: 65 | image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} 66 | 67 | permissions: 68 | actions: read 69 | contents: read 70 | security-events: write 71 | 72 | strategy: 73 | fail-fast: false 74 | matrix: 75 | language: [ 'cpp', 'python' ] 76 | version: [ 'rocm6.2' ] 77 | 78 | steps: 79 | - name: Checkout repository 80 | uses: actions/checkout@v4 81 | 82 | - name: Check disk space 83 | run: | 84 | df -h 85 | 86 | - name: Initialize CodeQL 87 | uses: github/codeql-action/init@v2 88 | with: 89 | languages: ${{ matrix.language }} 90 | 91 | - name: Dubious ownership exception 92 | run: | 93 | git config --global --add safe.directory /__w/mscclpp/mscclpp 94 | 95 | - name: Build 96 | run: | 97 | rm -rf build && mkdir build && cd build 98 | CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. 
99 | make -j 100 | 101 | - name: Perform CodeQL Analysis 102 | uses: github/codeql-action/analyze@v2 103 | with: 104 | category: "/language:${{matrix.language}}/version:${{matrix.version}}" 105 | -------------------------------------------------------------------------------- /.github/workflows/doc-build.yaml: -------------------------------------------------------------------------------- 1 | name: Docs Build 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | 18 | - name: Setup Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Install dependencies 24 | run: | 25 | sudo apt-get update 26 | sudo apt-get install -y doxygen graphviz 27 | pip install -r docs/requirements.txt 28 | 29 | - name: Build docs 30 | run: | 31 | cd docs 32 | doxygen 33 | make html 34 | touch _build/html/.nojekyll 35 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 12 | permissions: 13 | contents: read 14 | pages: write 15 | id-token: write 16 | 17 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 18 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: false 22 | 23 | jobs: 24 | build: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | - name: Setup python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: '3.10' 33 | - name: Install dependencies 34 | run: | 35 | sudo apt-get update 36 | sudo apt-get install -y doxygen graphviz 37 | pip install -r docs/requirements.txt 38 | - name: Build docs 39 | run: | 40 | cd docs 41 | doxygen 42 | make html 43 | touch _build/html/.nojekyll 44 | - name: Upload artifacts 45 | uses: actions/upload-pages-artifact@v3 46 | with: 47 | path: docs/_build/html 48 | 49 | deploy: 50 | environment: 51 | name: github-pages 52 | url: ${{ steps.deployment.outputs.page_url }} 53 | runs-on: ubuntu-latest 54 | needs: build 55 | steps: 56 | - name: Deploy to GitHub Pages 57 | id: deployment 58 | uses: actions/deploy-pages@v4 59 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | cpplint: 10 | runs-on: ubuntu-22.04 11 | 12 | steps: 13 | - name: Check out Git repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Install ClangFormat 17 | run: | 18 | sudo apt-get update 19 | sudo apt-get install -y clang-format 20 | 21 | - name: Run cpplint 22 | run: | 23 | CPPSOURCES=$(find ./src ./include ./python ./test ./apps -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') 24 | clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES} 25 | 26 | pylint: 27 | runs-on: ubuntu-22.04 28 | 29 | steps: 30 | - name: Check out Git 
repository 31 | uses: actions/checkout@v4 32 | 33 | - name: Set up Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: 3 37 | 38 | - name: Install Python dependencies 39 | run: python3 -m pip install black 40 | 41 | - name: Run black 42 | run: python3 -m black --check --config pyproject.toml . 43 | 44 | spelling: 45 | runs-on: ubuntu-22.04 46 | 47 | steps: 48 | - name: Check out Git repository 49 | uses: actions/checkout@v4 50 | 51 | - name: Download misspell 52 | run: | 53 | curl -L https://github.com/client9/misspell/releases/download/v0.3.4/misspell_0.3.4_linux_64bit.tar.gz -o /tmp/misspell_0.3.4_linux_64bit.tar.gz 54 | tar -xzf /tmp/misspell_0.3.4_linux_64bit.tar.gz -C . 55 | 56 | - name: Check spelling 57 | run: | 58 | ./misspell -error . 59 | -------------------------------------------------------------------------------- /.github/workflows/mscclpp-lang.yml: -------------------------------------------------------------------------------- 1 | name: MSCCLPPLang 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - release/* 8 | 9 | jobs: 10 | compare-diffs: 11 | runs-on: 'ubuntu-latest' 12 | container: 13 | image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} 14 | 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | version: [ 'cuda11.8', 'cuda12.8' ] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Set environment variable 24 | run: echo "LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64" >> $GITHUB_ENV 25 | 26 | - name: Install mscclpp 27 | run: | 28 | CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install . 29 | 30 | - name: Copy test script/config to temp directory 31 | run: | 32 | cp python/test/test_generate_mscclpp_lang_result.py $RUNNER_TEMP/ 33 | cp python/test/configs/mscclpp_lang_test_config.json $RUNNER_TEMP/ 34 | - name: generate outputs 35 | run: | 36 | python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/pr-outputs/ 37 | - name: Checkout main branch 38 | uses: actions/checkout@v4 39 | if: github.event_name == 'pull_request' || github.event_name == 'push' 40 | with: 41 | ref: main 42 | - name: Install msccl and dependencies 43 | run: | 44 | CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install . 
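      # The main branch was checked out and reinstalled above; the step below
      # regenerates the baseline outputs, and the final step diffs them against
      # the PR outputs generated earlier.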
45 | - name: generate outputs 46 | run: | 47 | python3 $RUNNER_TEMP/test_generate_mscclpp_lang_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/main-outputs/ 48 | - name: Compare outputs 49 | run: | 50 | diff -rw $RUNNER_TEMP/tests/main-outputs/ $RUNNER_TEMP/tests/pr-outputs/ -------------------------------------------------------------------------------- /.github/workflows/update-version.yml: -------------------------------------------------------------------------------- 1 | name: Update Version 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | - release/** 7 | paths: 8 | - 'VERSION' 9 | 10 | permissions: 11 | contents: write 12 | 13 | jobs: 14 | update-version: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | with: 20 | ref: ${{ github.head_ref }} 21 | fetch-depth: 0 22 | 23 | - name: Read version 24 | id: read_version 25 | run: echo "VERSION=$(cat VERSION)" >> $GITHUB_ENV 26 | 27 | - name: Update Version in Files 28 | run: | 29 | VERSION=${{ env.VERSION }} 30 | sed -i "s/^version: .*/version: ${VERSION}/" CITATION.cff 31 | sed -i "s/^release = \".*\"/release = \"v${VERSION}\"/" docs/conf.py 32 | sed -i "s/^version = \".*\"/version = \"${VERSION}\"/" pyproject.toml 33 | 34 | IFS='.' read -ra VER <<< "$VERSION" 35 | MAJOR=${VER[0]} 36 | MINOR=${VER[1]} 37 | PATCH=${VER[2]} 38 | 39 | # Update CMakeLists.txt 40 | sed -i "s/set(MSCCLPP_MAJOR \".*\")/set(MSCCLPP_MAJOR \"${MAJOR}\")/" CMakeLists.txt 41 | sed -i "s/set(MSCCLPP_MINOR \".*\")/set(MSCCLPP_MINOR \"${MINOR}\")/" CMakeLists.txt 42 | sed -i "s/set(MSCCLPP_PATCH \".*\")/set(MSCCLPP_PATCH \"${PATCH}\")/" CMakeLists.txt 43 | 44 | # Update header files 45 | sed -i "s/#define MSCCLPP_MAJOR .*/#define MSCCLPP_MAJOR ${MAJOR}/" include/mscclpp/core.hpp 46 | sed -i "s/#define MSCCLPP_MINOR .*/#define MSCCLPP_MINOR ${MINOR}/" include/mscclpp/core.hpp 47 | sed -i "s/#define MSCCLPP_PATCH .*/#define MSCCLPP_PATCH ${PATCH}/" include/mscclpp/core.hpp 48 | 49 | - name: Commit and Push Changes 50 | run: | 51 | git config user.name "github-actions" 52 | git config user.email "github-actions@github.com" 53 | git add CITATION.cff docs/conf.py include/mscclpp/core.hpp pyproject.toml || true 54 | if git diff --cached --exit-code; then 55 | echo "No changes to commit." 56 | else 57 | git commit -m "Update version to ${{ env.VERSION }}" 58 | git push 59 | fi 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 62 | -------------------------------------------------------------------------------- /.github/workflows/ut-backup.yml: -------------------------------------------------------------------------------- 1 | name: UnitTest 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | UnitTest: 7 | runs-on: [ self-hosted, A100 ] 8 | defaults: 9 | run: 10 | shell: bash 11 | timeout-minutes: 30 12 | strategy: 13 | matrix: 14 | cuda: [ cuda11.8, cuda12.2 ] 15 | 16 | container: 17 | image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" 18 | options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 19 | 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Build 25 | run: | 26 | mkdir build && cd build 27 | cmake -DCMAKE_BUILD_TYPE=Release .. 
28 | make -j 29 | working-directory: ${{ github.workspace }} 30 | 31 | - name: LockGPUClock 32 | run: | 33 | sudo nvidia-smi -pm 1 34 | for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do 35 | sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i 36 | done 37 | 38 | - name: UnitTests 39 | run: | 40 | ./build/test/unit_tests 41 | 42 | - name: MpUnitTests 43 | run: | 44 | set -e 45 | mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests 46 | mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests 47 | mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests 48 | 49 | - name: PyTests 50 | run: | 51 | set -e 52 | mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .hypothesis/ 3 | build/ 4 | dist/ 5 | __pycache__ 6 | .*.swp 7 | .idea/ 8 | *.so 9 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | apt_packages: 11 | - doxygen 12 | tools: 13 | python: "3.12" 14 | jobs: 15 | pre_build: 16 | - cd docs && doxygen 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 22 | # builder: "dirhtml" 23 | # Fail on all warnings to avoid broken references 24 | # fail_on_warning: true 25 | 26 | # Optionally build your docs in additional formats such as PDF and ePub 27 | # formats: 28 | # - pdf 29 | # - epub 30 | 31 | # Optional but recommended, declare the Python requirements required 32 | # to build your documentation 33 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 34 | python: 35 | install: 36 | - requirements: docs/requirements.txt 37 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: >- 3 | MSCCL++: Rethinking GPU Communication Abstractions for 4 | Cutting-edge AI Applications 5 | message: >- 6 | If you use this software, please cite it using the 7 | metadata from this file. 
8 | type: software 9 | authors: 10 | - given-names: Aashaka 11 | family-names: Shah 12 | affiliation: Microsoft Research 13 | - given-names: Abhinav 14 | family-names: Jangda 15 | affiliation: Microsoft Research 16 | - given-names: Binyang 17 | family-names: Li 18 | affiliation: Microsoft Azure 19 | - given-names: Caio 20 | family-names: Rocha 21 | affiliation: Microsoft Azure 22 | - given-names: Changho 23 | family-names: Hwang 24 | affiliation: Microsoft Research 25 | - given-names: Jithin 26 | family-names: Jose 27 | affiliation: Microsoft Azure 28 | - given-names: Madan 29 | family-names: Musuvathi 30 | affiliation: Microsoft Research 31 | - given-names: Olli 32 | family-names: Saarikivi 33 | affiliation: Microsoft Research 34 | - given-names: Peng 35 | family-names: Cheng 36 | affiliation: Microsoft Research 37 | - given-names: Qinghua 38 | family-names: Zhou 39 | affiliation: Microsoft Azure 40 | - given-names: Roshan 41 | family-names: Dathathri 42 | affiliation: Microsoft Research 43 | - given-names: Saeed 44 | family-names: Maleki 45 | affiliation: Microsoft Research 46 | - given-names: Ziyue 47 | family-names: Yang 48 | affiliation: Microsoft Research 49 | identifiers: 50 | - type: other 51 | value: 'arxiv:2504.09014' 52 | repository-code: 'https://github.com/microsoft/mscclpp' 53 | url: 'https://microsoft.github.io/mscclpp/index.html' 54 | abstract: >- 55 | MSCCL++ redefines the interface for inter-GPU communication, thereby 56 | delivering a highly efficient and customizable communication stack 57 | tailored for distributed GPU applications. 58 | license: MIT 59 | license-url: https://github.com/microsoft/mscclpp/blob/main/LICENSE 60 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please file them as new Issues. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.6.0 2 | -------------------------------------------------------------------------------- /apps/nccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*) 5 | file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/nccl.h) 6 | 7 | if(MSCCLPP_USE_ROCM) 8 | set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CXX) 9 | endif() 10 | 11 | add_library(mscclpp_nccl_obj OBJECT) 12 | target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES}) 13 | target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) 14 | target_include_directories(mscclpp_nccl_obj PRIVATE include ${PROJECT_SOURCE_DIR}/src/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) 15 | target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} PUBLIC mscclpp_obj) 16 | set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) 17 | if(MSCCLPP_USE_CUDA) 18 | target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_CUDA) 19 | elseif(MSCCLPP_USE_ROCM) 20 | target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM) 21 | endif() 22 | if(MSCCLPP_NPKIT_FLAGS) 23 | target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS}) 24 | endif() 25 | add_library(mscclpp_nccl SHARED) 26 | target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj) 27 | set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) 28 | add_library(mscclpp_nccl_static STATIC) 29 | target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_obj mscclpp_nccl_obj) 30 | set_target_properties(mscclpp_nccl_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) 31 | 32 | install(TARGETS mscclpp_nccl_obj 33 | FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include) 34 | install(TARGETS mscclpp_nccl 35 | LIBRARY DESTINATION ${INSTALL_PREFIX}/lib) 36 | install(TARGETS mscclpp_nccl_static 37 | ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib) 38 | 39 | if(MSCCLPP_BUILD_TESTS) 40 | add_subdirectory(test) 41 | endif() 42 | -------------------------------------------------------------------------------- /apps/nccl/src/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
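// Shared tuning constants and device-side globals for the NCCL-over-MSCCL++
// shim; the constants below assume an 8-GPU node (NRANKS_PER_NODE == 8).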

#ifndef NCCL_COMMON_HPP_
#define NCCL_COMMON_HPP_

#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/semaphore_device.hpp>

#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64
#define __syncwarp() __builtin_amdgcn_wave_barrier()
#else
#define WARP_SIZE 32
#endif

constexpr int NUM_NVLS_CONNECTION = 8;
constexpr int NUM_SEMAPHORES = 64;

constexpr int NRANKS_PER_NODE = 8;
constexpr int NPEERS = 7;

constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
static bool mscclppDisableChannelCache = mscclpp::env()->disableChannelCache;

__device__ mscclpp::DeviceSyncer deviceSyncer;
__constant__ mscclpp::DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES];

#endif  // NCCL_COMMON_HPP_
-------------------------------------------------------------------------------- /apps/nccl/test/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

find_package(MPI)

add_executable(nccl_api_test nccl_api_test.cc)
target_link_libraries(nccl_api_test mscclpp mscclpp_nccl ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX)
if(IBVERBS_FOUND)
  target_link_libraries(nccl_api_test ${IBVERBS_LIBRARIES})
  target_compile_definitions(nccl_api_test PRIVATE USE_IBVERBS)
endif()
target_include_directories(nccl_api_test PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include)
-------------------------------------------------------------------------------- /cmake/AddFormatTargets.cmake: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
3 | 4 | # Add targets to run clang-format and black 5 | 6 | add_custom_target(check-format) 7 | add_custom_target(format) 8 | 9 | find_program(CLANG_FORMAT clang-format) 10 | if(CLANG_FORMAT) 11 | message(STATUS "Found clang-format: ${CLANG_FORMAT}") 12 | set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps/nccl/src) 13 | add_custom_target(check-format-cpp ALL 14 | COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` 15 | ) 16 | add_dependencies(check-format check-format-cpp) 17 | add_custom_target(format-cpp 18 | COMMAND ${CLANG_FORMAT} -style=file -i `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` 19 | ) 20 | add_dependencies(format format-cpp) 21 | else() 22 | message(STATUS "clang-format not found.") 23 | endif() 24 | 25 | find_program(BLACK black) 26 | if (BLACK) 27 | message(STATUS "Found black: ${BLACK}") 28 | add_custom_target(check-format-py 29 | COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR} 30 | ) 31 | add_dependencies(check-format check-format-py) 32 | add_custom_target(format-py 33 | COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} 34 | ) 35 | add_dependencies(format format-py) 36 | else() 37 | message(STATUS "black not found.") 38 | endif() 39 | -------------------------------------------------------------------------------- /cmake/CheckAmdGpu.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | set(AMD_FOUND "FALSE") 5 | 6 | set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}") 7 | # Temporal fix for rocm5.6 8 | set(ENV{amd_comgr_DIR} "/opt/rocm/lib/cmake/amd_comgr") 9 | set(ENV{AMDDeviceLibs_DIR} "/opt/rocm/lib/cmake/AMDDeviceLibs") 10 | 11 | find_package(hip QUIET) 12 | 13 | if(NOT hip_FOUND) 14 | return() 15 | endif() 16 | 17 | enable_language(HIP) 18 | 19 | set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_amd_gpu.hip") 20 | 21 | try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC}) 22 | 23 | if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0) 24 | set(AMD_FOUND "TRUE") 25 | endif() 26 | -------------------------------------------------------------------------------- /cmake/CheckNvidiaGpu.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
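# Probes for a usable NVIDIA GPU: compiles and runs cmake/check_nvidia_gpu.cu via
# try_run below, and reports NVIDIA_FOUND as "TRUE" only if the probe both builds
# and exits successfully.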
3 | 4 | set(NVIDIA_FOUND "FALSE") 5 | 6 | find_package(CUDAToolkit) 7 | 8 | if(NOT CUDAToolkit_FOUND) 9 | return() 10 | endif() 11 | 12 | set(CMAKE_CUDA_ARCHITECTURES "60") 13 | if(NOT CMAKE_CUDA_COMPILER) 14 | # In case the CUDA Toolkit directory is not in the PATH 15 | find_program(CUDA_COMPILER 16 | NAMES nvcc 17 | PATHS ${CUDAToolkit_BIN_DIR}) 18 | if(NOT CUDA_COMPILER) 19 | message(WARNING "Could not find nvcc in ${CUDAToolkit_BIN_DIR}") 20 | unset(CMAKE_CUDA_ARCHITECTURES) 21 | return() 22 | endif() 23 | set(CMAKE_CUDA_COMPILER "${CUDA_COMPILER}") 24 | endif() 25 | enable_language(CUDA) 26 | 27 | set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_nvidia_gpu.cu") 28 | 29 | try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC}) 30 | 31 | if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0) 32 | set(NVIDIA_FOUND "TRUE") 33 | else() 34 | unset(CMAKE_CUDA_ARCHITECTURES) 35 | unset(CMAKE_CUDA_COMPILER) 36 | endif() 37 | -------------------------------------------------------------------------------- /cmake/FindGDRCopy.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | # Find the GDRCopy libraries 5 | # 6 | # The following variables are optionally searched for defaults 7 | # GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found 8 | # GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found 9 | # GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found 10 | 11 | # The following are set after configuration is done: 12 | # GDRCOPY_FOUND 13 | # GDRCOPY_INCLUDE_DIRS 14 | # GDRCOPY_LIBRARIES 15 | 16 | # An imported target MSCCLPP::gdrcopy is created if the library is found. 17 | 18 | find_path(GDRCOPY_INCLUDE_DIRS 19 | NAMES gdrapi.h 20 | HINTS 21 | ${GDRCOPY_INCLUDE_DIR} 22 | ${GDRCOPY_ROOT_DIR} 23 | ${GDRCOPY_ROOT_DIR}/include) 24 | 25 | find_library(GDRCOPY_LIBRARIES 26 | NAMES gdrapi 27 | HINTS 28 | ${GDRCOPY_LIB_DIR} 29 | ${GDRCOPY_ROOT_DIR} 30 | ${GDRCOPY_ROOT_DIR}/lib) 31 | 32 | include(FindPackageHandleStandardArgs) 33 | find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) 34 | mark_as_advanced(GDRCOPY_INCLUDE_DIR GDRCOPY_LIBRARIES) 35 | 36 | if(GDRCOPY_FOUND) 37 | if(NOT TARGET MSCCLPP::gdrcopy) 38 | add_library(MSCCLPP::gdrcopy UNKNOWN IMPORTED) 39 | endif() 40 | set_target_properties(MSCCLPP::gdrcopy PROPERTIES 41 | INTERFACE_INCLUDE_DIRECTORIES "${GDRCOPY_INCLUDE_DIR}" 42 | IMPORTED_LINK_INTERFACE_LANGUAGES "C" 43 | IMPORTED_LOCATION "${GDRCOPY_LIBRARIES}") 44 | endif() -------------------------------------------------------------------------------- /cmake/FindIBVerbs.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | # Find the IB Verbs libraries 5 | # 6 | # The following variables are optionally searched for defaults 7 | # IBVERBS_ROOT_DIR: Base directory where all ibverbs components are found 8 | # IBVERBS_INCLUDE_DIR: Directory where ibverbs headers are found 9 | # IBVERBS_LIB_DIR: Directory where ibverbs libraries are found 10 | 11 | # The following are set after configuration is done: 12 | # IBVERBS_FOUND 13 | # IBVERBS_INCLUDE_DIRS 14 | # IBVERBS_LIBRARIES 15 | 16 | # An imported target MSCCLPP::ibverbs is created if the library is found. 
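#
# A typical usage sketch (the path and target name here are placeholders):
#   cmake -DIBVERBS_ROOT_DIR=/path/to/rdma-core ..
# and, in CMake code:
#   find_package(IBVerbs)
#   if(IBVERBS_FOUND)
#     target_link_libraries(my_target PRIVATE MSCCLPP::ibverbs)
#   endif()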

find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
  ${IBVERBS_INCLUDE_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/include)

find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
  ${IBVERBS_LIB_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
mark_as_advanced(IBVERBS_INCLUDE_DIR IBVERBS_LIBRARIES)

if(IBVERBS_FOUND)
  if(NOT TARGET MSCCLPP::ibverbs)
    add_library(MSCCLPP::ibverbs UNKNOWN IMPORTED)
  endif()
  set_target_properties(MSCCLPP::ibverbs PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${IBVERBS_LIBRARIES}")
endif()
-------------------------------------------------------------------------------- /cmake/FindNUMA.cmake: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# Find the numa libraries
#
# The following variables are optionally searched for defaults
#  NUMA_ROOT_DIR: Base directory where all numa components are found
#  NUMA_INCLUDE_DIR: Directory where numa headers are found
#  NUMA_LIB_DIR: Directory where numa libraries are found

# The following are set after configuration is done:
#  NUMA_FOUND
#  NUMA_INCLUDE_DIRS
#  NUMA_LIBRARIES

# An imported target MSCCLPP::numa is created if the library is found.

find_path(NUMA_INCLUDE_DIRS
  NAMES numa.h
  HINTS
  ${NUMA_INCLUDE_DIR}
  ${NUMA_ROOT_DIR}
  ${NUMA_ROOT_DIR}/include)

find_library(NUMA_LIBRARIES
  NAMES numa
  HINTS
  ${NUMA_LIB_DIR}
  ${NUMA_ROOT_DIR}
  ${NUMA_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_INCLUDE_DIRS NUMA_LIBRARIES)
mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARIES)

if(NUMA_FOUND)
  if(NOT TARGET MSCCLPP::numa)
    add_library(MSCCLPP::numa UNKNOWN IMPORTED)
  endif()
  set_target_properties(MSCCLPP::numa PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${NUMA_LIBRARIES}")
endif()
-------------------------------------------------------------------------------- /cmake/check_amd_gpu.hip: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <hip/hip_runtime.h>

__global__ void kernel() {}

int main() {
  int cnt;
  hipError_t err = hipGetDeviceCount(&cnt);
  if (err != hipSuccess || cnt == 0) {
    return 1;
  }
  return 0;
}
-------------------------------------------------------------------------------- /cmake/check_nvidia_gpu.cu: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
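// Configure-time probe used by cmake/CheckNvidiaGpu.cmake: exits 0 (success)
// only when at least one CUDA device is visible.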

#include <cuda_runtime.h>

__global__ void kernel() {}

int main() {
  int cnt;
  cudaError_t err = cudaGetDeviceCount(&cnt);
  if (err != cudaSuccess || cnt == 0) {
    return 1;
  }
  return 0;
}
-------------------------------------------------------------------------------- /docker/base-dev-x.dockerfile: --------------------------------------------------------------------------------
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        htop \
        lcov \
        vim \
        && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.26.4
RUN ARCH=$(uname -m) && \
    CMAKE_VERSION="3.26.4" && \
    CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-${ARCH}" && \
    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${ARCH}.tar.gz" && \
    curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
    rm -rf ${CMAKE_HOME}.tar.gz && \
    ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${ARCH}/bin/* /usr/bin/

# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

# Cleanup
RUN rm -rf /tmp/mscclpp
WORKDIR /
-------------------------------------------------------------------------------- /docker/base-x-rocm.dockerfile: --------------------------------------------------------------------------------
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp

ENV DEBIAN_FRONTEND=noninteractive

ENV RCCL_VERSION=rocm-6.2.0
ARG ARCH=gfx942
ENV ARCH_TARGET=${ARCH}
RUN cd /tmp && \
    git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
    cd rccl && \
    ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
    cd ..
&& \ 17 | rm -rf /tmp/rccl 18 | 19 | WORKDIR / 20 | -------------------------------------------------------------------------------- /docker/base-x.dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | 4 | LABEL maintainer="MSCCL++" 5 | LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp 6 | 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | USER root 9 | 10 | RUN rm -rf /opt/nvidia 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | ca-certificates \ 16 | curl \ 17 | git \ 18 | libcap2 \ 19 | libnuma-dev \ 20 | lsb-release \ 21 | openssh-client \ 22 | openssh-server \ 23 | python3-dev \ 24 | python3-pip \ 25 | python3-setuptools \ 26 | python3-wheel \ 27 | sudo \ 28 | wget 29 | 30 | # Install OFED 31 | ARG OFED_VERSION=5.2-2.2.3.0 32 | RUN cd /tmp && \ 33 | ARCH=$(uname -m) && \ 34 | OS_VERSION=$(lsb_release -rs) && \ 35 | OS_VERSION=ubuntu${OS_VERSION} && \ 36 | wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \ 37 | tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \ 38 | MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ 39 | rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* 40 | 41 | # Install OpenMPI (should be done after the OFED installation) & clean apt cache 42 | RUN apt-get update && \ 43 | apt-get install -y --no-install-recommends \ 44 | libopenmpi-dev \ 45 | && \ 46 | apt-get autoremove -y && \ 47 | apt-get clean && \ 48 | rm -rf /var/lib/apt/lists/* /tmp/* 49 | 50 | # OpenMPI short link (for compatibility with old images) 51 | RUN ln -s /usr/lib/x86_64-linux-gnu/openmpi /usr/local/mpi 52 | 53 | ARG EXTRA_LD_PATH= 54 | ENV LD_LIBRARY_PATH="${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" 55 | RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment 56 | 57 | ENTRYPOINT [] 58 | WORKDIR / 59 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | declare -A baseImageTable 6 | baseImageTable=( 7 | ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" 8 | ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" 9 | ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" 10 | ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" 11 | ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" 12 | ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" 13 | ["rocm6.2"]="rocm/rocm-terminal:6.2.1" 14 | ) 15 | 16 | declare -A extraLdPathTable 17 | extraLdPathTable=( 18 | ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" 19 | ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" 20 | ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" 21 | ["rocm6.2"]="/opt/rocm/lib" 22 | ) 23 | 24 | declare -A ofedVersionTable 25 | ofedVersionTable=( 26 | ["cuda12.4"]="23.07-0.5.1.2" 27 | ["cuda12.8"]="24.10-1.1.4.0" 28 | ) 29 | 30 | GHCR="ghcr.io/microsoft/mscclpp/mscclpp" 31 | TARGET=${1} 32 | 33 | print_usage() { 34 | echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|rocm6.2]" 35 | } 36 | 37 | if [[ ! 
-v "baseImageTable[${TARGET}]" ]]; then
  echo "Invalid target: ${TARGET}"
  print_usage
  exit 1
fi
echo "Target: ${TARGET}"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

cd ${SCRIPT_DIR}/..

DEFAULT_OFED_VERSION="5.2-2.2.3.0"
OFED_VERSION=${ofedVersionTable[${TARGET}]}
if [[ -z ${OFED_VERSION} ]]; then
  OFED_VERSION=${DEFAULT_OFED_VERSION}
fi

docker build -t ${GHCR}-common:base-${TARGET} \
  -f docker/base-x.dockerfile \
  --build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \
  --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
  --build-arg TARGET=${TARGET} \
  --build-arg OFED_VERSION=${OFED_VERSION} .

if [[ ${TARGET} == rocm* ]]; then
  echo "Building ROCm base image..."
  docker build -t ${GHCR}:base-${TARGET} \
    -f docker/base-x-rocm.dockerfile \
    --build-arg BASE_IMAGE=${GHCR}-common:base-${TARGET} \
    --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
    --build-arg TARGET=${TARGET} \
    --build-arg ARCH="gfx942" .
  docker rmi ${GHCR}-common:base-${TARGET}
else
  echo "Building CUDA base image..."
  docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET}
  docker rmi --no-prune ${GHCR}-common:base-${TARGET}
fi

docker build -t ${GHCR}:base-dev-${TARGET} \
  -f docker/base-dev-x.dockerfile \
  --build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \
  --build-arg TARGET=${TARGET} .
-------------------------------------------------------------------------------- /docs/.gitignore: --------------------------------------------------------------------------------
doxygen/
_build/
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS   ?=
SPHINXBUILD  ?= sphinx-build
SOURCEDIR    = .
BUILDDIR     = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-------------------------------------------------------------------------------- /docs/README.md: --------------------------------------------------------------------------------
## How to build docs

1. Install `doxygen` and `graphviz`.

   ```bash
   $ sudo apt-get install doxygen graphviz
   ```

2. Install the Python packages below. If you install them into the user-local prefix, add `~/.local/bin` to `$PATH` (so that `sphinx-build` can be found).

   ```bash
   $ sudo python3 -m pip install -r ./requirements.txt
   ```

3. Generate the Doxygen documentation.

   ```bash
   $ doxygen
   ```

4. Build the Sphinx documentation.

   ```bash
   $ make html
   ```

5. Done. The HTML files will be in the `_build/html/` directory.
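
6. (Optional) Preview the result with any static file server, for example Python's built-in one:

   ```bash
   $ python3 -m http.server 8000 --directory _build/html
   ```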
28 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. doxygennamespace:: mscclpp 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "mscclpp" 10 | copyright = "2024, MSCCL++ Team" 11 | author = "MSCCL++ Team" 12 | release = "v0.6.0" 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = ["breathe", "myst_parser"] 18 | 19 | templates_path = ["_templates"] 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | # Breathe configuration 23 | breathe_projects = {"mscclpp": "./doxygen/xml"} 24 | breathe_default_project = "mscclpp" 25 | 26 | # -- Options for HTML output ------------------------------------------------- 27 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 28 | 29 | html_theme = "sphinx_rtd_theme" 30 | html_static_path = ["_static"] 31 | -------------------------------------------------------------------------------- /docs/design/nccl-over-mscclpp.md: -------------------------------------------------------------------------------- 1 | # NCCL Over MSCCL++ 2 | 3 | (limitations)= 4 | ## Limitations 5 | 6 | The current NCCL over MSCCL++ implementation has a few limitations. 7 | 8 | * We do not cover all APIs yet. See the [API Support Table](#api-support-table) for details. 9 | * Multi-node communication is not supported yet. 10 | * Currently, collective communication functions may not work correctly if the buffer address differs from that of previous function calls while sharing the same base address (returned by [cuMemGetAddressRange](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g64fee5711274a2a0573a789c94d8299b)). This is because the current implementation performs zero-copy communication over user buffers, and it is difficult to efficiently inform all ranks if the buffer address changes dynamically. 11 | 12 | (api-support-table)= 13 | ## API Support Table 14 | 15 | The table below lists all NCCL APIs (v2.21). We may cover more APIs in the future. 
16 | 17 | | API Name | Supported | 18 | | :----------------------- | :-------: | 19 | | ncclGetLastError | X | 20 | | ncclGetErrorString | O | 21 | | ncclGetVersion | O | 22 | | ncclGetUniqueId | O | 23 | | ncclCommInitRank | O | 24 | | ncclCommInitAll | X | 25 | | ncclCommInitRankConfig | X | 26 | | ncclCommSplit | X | 27 | | ncclCommFinalize | O | 28 | | ncclCommDestroy | O | 29 | | ncclCommAbort | X | 30 | | ncclCommGetAsyncError | O | 31 | | ncclCommCount | O | 32 | | ncclCommCuDevice | O | 33 | | ncclCommUserRank | O | 34 | | ncclCommRegister | X | 35 | | ncclCommDeregister | X | 36 | | ncclMemAlloc | X | 37 | | ncclMemFree | X | 38 | | ncclAllReduce | O | 39 | | ncclBroadcast | X | 40 | | ncclReduce | X | 41 | | ncclAllGather | O | 42 | | ncclReduceScatter | X | 43 | | ncclGroupStart | O | 44 | | ncclGroupEnd | O | 45 | | ncclSend | X | 46 | | ncclRecv | X | 47 | | ncclRedOpCreatePreMulSum | X | 48 | | ncclRedOpDestroy | X | 49 | 50 | ## Executor Support 51 | 52 | The executor is a versatile tool for specifying how mscclpp executes algorithms. Currently, only the allReduce operation allows for algorithm customization. The following environment variable can be set: 53 | 54 | - MSCCLPP_EXECUTION_PLAN_DIR: Specifies the directory where the executor will look for execution-plan JSON files. 55 | 56 | ```{figure} ../figs/size_boundary_diagram.png 57 | :name: size-boundary-diagram 58 | :alt: Size boundary diagram 59 | :align: center 60 | 61 | Decision Flowchart for Message Size-Based Algorithm Execution 62 | ``` 63 | 64 | The following is an example of running the NCCL API test with the executor: 65 | ``` bash 66 | mpirun -np 8 -x MSCCLPP_EXECUTION_PLAN_DIR=/root/azure-mscclpp/nccl/test/execution-files ./apps/nccl/test/nccl_api_test 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/figs/abstractions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/abstractions.png -------------------------------------------------------------------------------- /docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg -------------------------------------------------------------------------------- /docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg -------------------------------------------------------------------------------- /docs/figs/size_boundary_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/docs/figs/size_boundary_diagram.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/customized-proxy-service.md: -------------------------------------------------------------------------------- 1 | # Customize the Proxy Service 2 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/index.rst: 
-------------------------------------------------------------------------------- 1 | Tutorials 2 | ---------- 3 | 4 | This tutorial section provides a step-by-step guide to help you get started with the C++/Python API. 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Tutorials 9 | :hidden: 10 | 11 | initialization 12 | port-channel 13 | memory-channel 14 | packet-api 15 | customized-proxy-service 16 | python-api 17 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/initialization.md: -------------------------------------------------------------------------------- 1 | # Communication initialization with the MSCCL++ API 2 | 3 | In this tutorial, you will write a simple program to initialize communication between eight GPUs using the MSCCL++ C++ API. You will also learn how to use the Python API to initialize communication. 4 | 5 | ## Prerequisites 6 | A system with eight GPUs is required to run this tutorial. 7 | 8 | Also make sure that you have installed MSCCL++ on your system. If not, please follow the [quick start](../quickstart.md). 9 | 10 | ## Initialize Communication with C++ API 11 | We will set up a mesh topology with eight GPUs. Each GPU will be connected to its neighbors. The following code shows how to initialize communication with the MSCCL++ C++ API. 12 | 13 | ```cpp 14 | #include <memory> 15 | #include <stdexcept> 16 | #include <vector> 17 | 18 | #include <mscclpp/core.hpp> 19 | #include <mscclpp/gpu_utils.hpp> 20 | #include <mscclpp/port_channel.hpp> 21 | 22 | template <class T> 23 | using DeviceHandle = mscclpp::DeviceHandle<T>; 24 | __constant__ DeviceHandle<mscclpp::PortChannel> constPortChans[8]; 25 | 26 | void setupMeshTopology(int rank, int world_size, void* data, size_t dataSize) { 27 | std::string ip_port = "10.0.0.4:50000"; 28 | auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, world_size); 29 | bootstrap->initialize(ip_port); 30 | mscclpp::Communicator comm(bootstrap); 31 | mscclpp::ProxyService proxyService; 32 | 33 | std::vector<mscclpp::SemaphoreId> semaphoreIds; 34 | std::vector<mscclpp::RegisteredMemory> localMemories; 35 | std::vector<std::shared_future<std::shared_ptr<mscclpp::Connection>>> connections(world_size); 36 | std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteMemories; 37 | 38 | for (int r = 0; r < world_size; ++r) { 39 | if (r == rank) continue; 40 | mscclpp::Transport transport = mscclpp::Transport::CudaIpc; 41 | // Connect with all other ranks 42 | connections[r] = comm.connect(r, 0, transport); 43 | auto memory = comm.registerMemory(data, dataSize, transport); 44 | localMemories.push_back(memory); 45 | comm.sendMemory(memory, r, 0); 46 | remoteMemories.push_back(comm.recvMemory(r, 0)); 47 | } 48 | 49 | for (int r = 0; r < world_size; ++r) { 50 | if (r == rank) continue; 51 | semaphoreIds.push_back(proxyService.buildAndAddSemaphore(comm, connections[r].get())); 52 | } 53 | 54 | std::vector<DeviceHandle<mscclpp::PortChannel>> portChannels; 55 | for (size_t i = 0; i < semaphoreIds.size(); ++i) { 56 | portChannels.push_back(mscclpp::deviceHandle(mscclpp::PortChannel( 57 | proxyService.portChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()), 58 | proxyService.addMemory(localMemories[i])))); 59 | } 60 | 61 | if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle<mscclpp::PortChannel>)) { 62 | throw std::runtime_error("unexpected error"); 63 | } 64 | CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(), 65 | sizeof(DeviceHandle<mscclpp::PortChannel>) * portChannels.size())); 66 | } 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/memory-channel.md: -------------------------------------------------------------------------------- 1 | # Using MemoryChannel for Intra-Node Communication 2 | 3 | TBU 4 | 
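Until this page is written, the sketch below illustrates the general shape of device-side `MemoryChannel` usage. It is illustrative only: it assumes a `mscclpp::MemoryChannel` has already been constructed on the host (as in the initialization tutorial) and its device handle copied into a `__constant__` symbol; the kernel name, offsets, and exact method signatures are assumptions rather than verbatim API excerpts.

```cpp
#include <mscclpp/memory_channel.hpp>

// Illustrative device handle; in a real program the host fills this in via
// cudaMemcpyToSymbol after setting up the channel.
__constant__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> constMemChan;

__global__ void putSignalWaitKernel(size_t nBytes) {
  // All threads in the block cooperate on copying nBytes from the local
  // source buffer into the peer's registered destination buffer.
  constMemChan.put(0 /*dstOffset*/, 0 /*srcOffset*/, nBytes, threadIdx.x, blockDim.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    constMemChan.signal();  // tell the peer the data has landed
    constMemChan.wait();    // block until the peer signals back
  }
}
```

Unlike `PortChannel`, which delegates transfers to a host-side proxy, this path performs the copy directly from GPU threads.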
-------------------------------------------------------------------------------- /docs/getting-started/tutorials/packet-api.md: -------------------------------------------------------------------------------- 1 | # Packet API for latency-sensitive applications 2 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/port-channel.md: -------------------------------------------------------------------------------- 1 | # Offload communication to CPU with PortChannel 2 | 3 | TBU 4 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. MSCCL++ documentation master file, created by 2 | sphinx-quickstart on Tue Sep 5 13:03:46 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to MSCCL++'s documentation! 7 | =================================== 8 | 9 | MSCCL++ is a GPU-driven communication stack for scalable AI applications. It is designed to provide a high-performance, scalable, and customizable communication stack for distributed GPU applications. 10 | 11 | Getting Started 12 | --------------- 13 | - Follow the :doc:`quick start <getting-started/quickstart>` for your platform of choice. 14 | - Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first mscclpp program. 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Getting Started 19 | :hidden: 20 | 21 | getting-started/quickstart 22 | getting-started/tutorials/index 23 | 24 | Design 25 | ------- 26 | - :doc:`Design <design/design>` doc for those who want to understand the internals of MSCCL++. 27 | - :doc:`NCCL over MSCCL++ <design/nccl-over-mscclpp>` doc for those who want to understand how to use NCCL over MSCCL++. 28 | - :doc:`MSCCL++ DSL <design/mscclpp-dsl>` doc for those who want to understand the MSCCL++ DSL. 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: Design 33 | :hidden: 34 | 35 | design/design 36 | design/nccl-over-mscclpp 37 | design/mscclpp-dsl 38 | 39 | Performance 40 | --------------- 41 | - We evaluate the performance of MSCCL++ on A100 and H100 GPUs. Here are some :doc:`performance results <performance/performance-ndmv4>` for all-reduce operations. 42 | 43 | .. toctree:: 44 | :maxdepth: 1 45 | :caption: Performance 46 | :hidden: 47 | 48 | performance/performance-ndmv4 49 | 50 | C++ API 51 | --------------- 52 | - :doc:`mscclpp <api/index>` 53 | 54 | 55 | .. toctree:: 56 | :maxdepth: 1 57 | :caption: C++ API 58 | :hidden: 59 | 60 | api/index 61 | 62 | Indices and tables 63 | ================== 64 | 65 | * :ref:`genindex` 66 | * :ref:`modindex` 67 | * :ref:`search` 68 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/performance/performance-ndmv4.md: -------------------------------------------------------------------------------- 1 | # NDmv4 Performance 2 | 3 | TBU 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | breathe 2 | sphinx_rtd_theme 3 | myst_parser 4 | -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS *.hpp) 5 | target_sources(mscclpp_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) 6 | -------------------------------------------------------------------------------- /include/mscclpp/assert_device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_ASSERT_DEVICE_HPP_ 5 | #define MSCCLPP_ASSERT_DEVICE_HPP_ 6 | 7 | #include "device.hpp" 8 | 9 | #if defined(MSCCLPP_DEVICE_COMPILE) 10 | 11 | #include 12 | 13 | #if !defined(DEBUG_BUILD) 14 | 15 | #define MSCCLPP_ASSERT_DEVICE(__cond, __msg) 16 | 17 | #else // defined(DEBUG_BUILD) 18 | 19 | #if defined(MSCCLPP_DEVICE_HIP) 20 | extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 21 | const char *__function); 22 | #else // !defined(MSCCLPP_DEVICE_HIP) 23 | extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, 24 | const char *__function) __THROW; 25 | #endif // !defined(MSCCLPP_DEVICE_HIP) 26 | 27 | #define MSCCLPP_ASSERT_DEVICE(__cond, __msg) \ 28 | do { \ 29 | if (!(__cond)) { \ 30 | __assert_fail(__msg, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ 31 | } \ 32 | } while (0) 33 | 34 | #endif // !defined(DEBUG_BUILD) 35 | 36 | #endif // defined(MSCCLPP_DEVICE_COMPILE) 37 | 38 | #endif // MSCCLPP_ASSERT_DEVICE_HPP_ 39 | -------------------------------------------------------------------------------- /include/mscclpp/atomic_device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
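// Usage sketch (hypothetical pointer and values; illustrative only):
//   uint64_t v = mscclpp::atomicLoad(flagPtr, mscclpp::memoryOrderAcquire);
//   mscclpp::atomicStore(flagPtr, v + 1, mscclpp::memoryOrderRelease);
// On CUDA these helpers wrap cuda::atomic_ref; on HIP they wrap the __atomic builtins.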
3 | 4 | #ifndef MSCCLPP_ATOMIC_DEVICE_HPP_ 5 | #define MSCCLPP_ATOMIC_DEVICE_HPP_ 6 | 7 | #include "device.hpp" 8 | 9 | #if defined(MSCCLPP_DEVICE_CUDA) 10 | #include <cuda/atomic> 11 | #endif // defined(MSCCLPP_DEVICE_CUDA) 12 | 13 | namespace mscclpp { 14 | 15 | #if defined(MSCCLPP_DEVICE_CUDA) 16 | 17 | constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed; 18 | constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire; 19 | constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release; 20 | constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel; 21 | constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst; 22 | 23 | constexpr cuda::thread_scope scopeSystem = cuda::thread_scope_system; 24 | constexpr cuda::thread_scope scopeDevice = cuda::thread_scope_device; 25 | 26 | template <typename T, cuda::thread_scope Scope = scopeSystem> 27 | MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) { 28 | return cuda::atomic_ref<T, Scope>{*ptr}.load(memoryOrder); 29 | } 30 | 31 | template <typename T, cuda::thread_scope Scope = scopeSystem> 32 | MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) { 33 | cuda::atomic_ref<T, Scope>{*ptr}.store(val, memoryOrder); 34 | } 35 | 36 | template <typename T, cuda::thread_scope Scope = scopeSystem> 37 | MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) { 38 | return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder); 39 | } 40 | 41 | #elif defined(MSCCLPP_DEVICE_HIP) 42 | 43 | constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED; 44 | constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE; 45 | constexpr auto memoryOrderRelease = __ATOMIC_RELEASE; 46 | constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL; 47 | constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST; 48 | 49 | // HIP does not have thread scope enums like CUDA 50 | constexpr auto scopeSystem = 0; 51 | constexpr auto scopeDevice = 0; 52 | 53 | template <typename T, int Scope = scopeSystem> 54 | MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) { 55 | return __atomic_load_n(ptr, memoryOrder); 56 | } 57 | 58 | template <typename T, int Scope = scopeSystem> 59 | MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) { 60 | __atomic_store_n(ptr, val, memoryOrder); 61 | } 62 | 63 | template <typename T, int Scope = scopeSystem> 64 | MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) { 65 | return __atomic_fetch_add(ptr, val, memoryOrder); 66 | } 67 | 68 | #endif // defined(MSCCLPP_DEVICE_HIP) 69 | 70 | } // namespace mscclpp 71 | 72 | #endif // MSCCLPP_ATOMIC_DEVICE_HPP_ 73 | -------------------------------------------------------------------------------- /include/mscclpp/device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
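// Usage sketch (hypothetical helper; illustrative only): the macros below let a
// function compile for both host and device builds:
//   MSCCLPP_HOST_DEVICE_INLINE int roundUp(int x, int align) { return ((x + align - 1) / align) * align; }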
3 | 4 | #ifndef MSCCLPP_DEVICE_HPP_ 5 | #define MSCCLPP_DEVICE_HPP_ 6 | 7 | #if defined(__HIP_PLATFORM_AMD__) 8 | #include 9 | #endif // defined(__HIP_PLATFORM_AMD__) 10 | 11 | #if (defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)) 12 | 13 | #define MSCCLPP_DEVICE_COMPILE 14 | #define MSCCLPP_INLINE __forceinline__ 15 | #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__ 16 | #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 17 | #if defined(__HIP_PLATFORM_AMD__) 18 | #define MSCCLPP_DEVICE_HIP 19 | #else // !(defined(__HIP_PLATFORM_AMD__) 20 | #define MSCCLPP_DEVICE_CUDA 21 | #endif // !(defined(__HIP_PLATFORM_AMD__)) 22 | 23 | #else // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)) 24 | 25 | #define MSCCLPP_HOST_COMPILE 26 | #define MSCCLPP_INLINE inline 27 | #define MSCCLPP_HOST_DEVICE_INLINE inline 28 | 29 | #endif // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)) 30 | 31 | #endif // MSCCLPP_DEVICE_HPP_ 32 | -------------------------------------------------------------------------------- /include/mscclpp/env.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_ENV_HPP_ 5 | #define MSCCLPP_ENV_HPP_ 6 | 7 | #include 8 | #include 9 | 10 | namespace mscclpp { 11 | 12 | class Env; 13 | 14 | /// Get the MSCCL++ environment. 15 | /// @return A reference to the global environment object. 16 | std::shared_ptr env(); 17 | 18 | /// The MSCCL++ environment. The constructor reads environment variables and sets the corresponding fields. 19 | /// Use the @ref env() function to get the environment object. 20 | class Env { 21 | public: 22 | const std::string debug; 23 | const std::string debugSubsys; 24 | const std::string debugFile; 25 | const std::string hcaDevices; 26 | const std::string hostid; 27 | const std::string socketFamily; 28 | const std::string socketIfname; 29 | const std::string commId; 30 | const std::string executionPlanDir; 31 | const std::string npkitDumpDir; 32 | const bool cudaIpcUseDefaultStream; 33 | const std::string ncclSharedLibPath; 34 | const std::string forceNcclFallbackOperation; 35 | const bool enableNcclFallback; 36 | const bool disableChannelCache; 37 | const bool forceDisableNvls; 38 | 39 | private: 40 | Env(); 41 | 42 | friend std::shared_ptr env(); 43 | }; 44 | 45 | } // namespace mscclpp 46 | 47 | #endif // MSCCLPP_ENV_HPP_ 48 | -------------------------------------------------------------------------------- /include/mscclpp/errors.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_ERRORS_HPP_ 5 | #define MSCCLPP_ERRORS_HPP_ 6 | 7 | #include 8 | 9 | namespace mscclpp { 10 | 11 | /// Enumeration of error codes used by MSCCL++. 12 | enum class ErrorCode { 13 | SystemError, // A system error occurred. 14 | InternalError, // An MSCCL++ internal error occurred. 15 | RemoteError, // An error occurred on a remote system. 16 | InvalidUsage, // The function was used incorrectly. 17 | Timeout, // The operation timed out. 18 | Aborted, // The operation was aborted. 19 | ExecutorError, // An error occurred in the MSCCL++ executor. 20 | }; 21 | 22 | /// Convert an error code to a string. 23 | /// 24 | /// @param error The error code to convert. 25 | /// @return The string representation of the error code. 
26 | std::string errorToString(enum ErrorCode error); 27 | 28 | /// Base class for all errors thrown by MSCCL++. 29 | class BaseError : public std::runtime_error { 30 | public: 31 | /// Constructor for @ref BaseError. 32 | /// 33 | /// @param message The error message. 34 | /// @param errorCode The error code. 35 | BaseError(const std::string& message, int errorCode); 36 | 37 | /// Constructor for @ref BaseError. 38 | /// 39 | /// @param errorCode The error code. 40 | explicit BaseError(int errorCode); 41 | 42 | /// Virtual destructor for BaseError. 43 | virtual ~BaseError() = default; 44 | 45 | /// Get the error code. 46 | /// 47 | /// @return The error code. 48 | int getErrorCode() const; 49 | 50 | /// Get the error message. 51 | /// 52 | /// @return The error message. 53 | const char* what() const noexcept override; 54 | 55 | protected: 56 | std::string message_; 57 | int errorCode_; 58 | }; 59 | 60 | /// A generic error. 61 | class Error : public BaseError { 62 | public: 63 | Error(const std::string& message, ErrorCode errorCode); 64 | virtual ~Error() = default; 65 | ErrorCode getErrorCode() const; 66 | }; 67 | 68 | /// An error from a system call that sets `errno`. 69 | class SysError : public BaseError { 70 | public: 71 | SysError(const std::string& message, int errorCode); 72 | virtual ~SysError() = default; 73 | }; 74 | 75 | /// An error from a CUDA runtime library call. 76 | class CudaError : public BaseError { 77 | public: 78 | CudaError(const std::string& message, int errorCode); 79 | virtual ~CudaError() = default; 80 | }; 81 | 82 | /// An error from a CUDA driver library call. 83 | class CuError : public BaseError { 84 | public: 85 | CuError(const std::string& message, int errorCode); 86 | virtual ~CuError() = default; 87 | }; 88 | 89 | /// An error from an ibverbs library call. 90 | class IbError : public BaseError { 91 | public: 92 | IbError(const std::string& message, int errorCode); 93 | virtual ~IbError() = default; 94 | }; 95 | 96 | }; // namespace mscclpp 97 | 98 | #endif // MSCCLPP_ERRORS_HPP_ 99 | -------------------------------------------------------------------------------- /include/mscclpp/executor.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
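// Usage sketch (hypothetical plan path, buffers, and stream; see execute() below
// for the authoritative parameter list):
//   mscclpp::ExecutionPlan plan("/path/to/allreduce_plan.json");
//   mscclpp::Executor executor(comm);  // comm is a std::shared_ptr<mscclpp::Communicator>
//   executor.execute(rank, buf, buf, bytes, bytes, mscclpp::DataType::FLOAT16, plan, stream);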
3 | 4 | #ifndef MSCCLPP_EXECUTOR_HPP_ 5 | #define MSCCLPP_EXECUTOR_HPP_ 6 | 7 | #include <memory> 8 | #include <string> 9 | #include <mscclpp/core.hpp> 10 | #include <mscclpp/gpu.hpp> 11 | 12 | namespace mscclpp { 13 | 14 | enum class DataType { 15 | INT32, 16 | UINT32, 17 | FLOAT16, 18 | FLOAT32, 19 | BFLOAT16, 20 | }; 21 | 22 | enum class PacketType { 23 | LL8, 24 | LL16, 25 | }; 26 | 27 | class ExecutionPlan { 28 | public: 29 | ExecutionPlan(const std::string& planPath); 30 | ~ExecutionPlan() = default; 31 | 32 | std::string name() const; 33 | std::string collective() const; 34 | size_t minMessageSize() const; 35 | size_t maxMessageSize() const; 36 | bool isInPlace() const; 37 | 38 | private: 39 | struct Impl; 40 | std::shared_ptr<Impl> impl_; 41 | 42 | friend class Executor; 43 | }; 44 | 45 | class Executor { 46 | public: 47 | Executor(std::shared_ptr<Communicator> comm); 48 | Executor(const Executor&) = delete; 49 | Executor& operator=(const Executor&) = delete; 50 | ~Executor(); 51 | 52 | void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType, 53 | const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType = PacketType::LL16); 54 | 55 | private: 56 | struct Impl; 57 | std::unique_ptr<Impl> impl_; 58 | }; 59 | } // namespace mscclpp 60 | 61 | #endif // MSCCLPP_EXECUTOR_HPP_ 62 | -------------------------------------------------------------------------------- /include/mscclpp/fifo.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_FIFO_HPP_ 5 | #define MSCCLPP_FIFO_HPP_ 6 | 7 | #include <cstddef> 8 | #include <cstdint> 9 | #include <memory> 10 | 11 | #include "fifo_device.hpp" 12 | 13 | namespace mscclpp { 14 | 15 | constexpr size_t DEFAULT_FIFO_SIZE = 128; 16 | 17 | /// A class representing a host proxy FIFO that can consume work elements pushed by device threads. 18 | class Fifo { 19 | public: 20 | /// Constructs a new @ref Fifo object. 21 | /// @param size The number of entries in the FIFO. 22 | Fifo(int size = DEFAULT_FIFO_SIZE); 23 | 24 | /// Destroys the @ref Fifo object. 25 | ~Fifo(); 26 | 27 | /// Polls the FIFO for a trigger. 28 | /// 29 | /// Returns @ref ProxyTrigger which is the trigger at the head of the FIFO. 30 | ProxyTrigger poll(); 31 | 32 | /// Pops a trigger from the FIFO. 33 | void pop(); 34 | 35 | /// Flushes the tail of the FIFO. 36 | /// 37 | /// @param sync If true, waits for the flush to complete before returning. 38 | void flushTail(bool sync = false); 39 | 40 | /// Return the FIFO size. 41 | /// @return The FIFO size. 42 | int size() const; 43 | 44 | /// Returns a @ref FifoDeviceHandle object representing the device FIFO. 45 | /// 46 | /// @return A @ref FifoDeviceHandle object representing the device FIFO. 47 | FifoDeviceHandle deviceHandle(); 48 | 49 | private: 50 | struct Impl; 51 | std::unique_ptr<Impl> pimpl; 52 | }; 53 | 54 | } // namespace mscclpp 55 | 56 | #endif // MSCCLPP_FIFO_HPP_ 57 | -------------------------------------------------------------------------------- /include/mscclpp/gpu_data_types.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #ifndef MSCCLPP_GPU_DATA_TYPES_HPP_ 5 | #define MSCCLPP_GPU_DATA_TYPES_HPP_ 6 | 7 | #if defined(__HIP_PLATFORM_AMD__) 8 | 9 | #include 10 | #include 11 | 12 | using __bfloat16 = __hip_bfloat16; 13 | using __bfloat162 = __hip_bfloat162; 14 | #define __CUDA_BF16_TYPES_EXIST__ 15 | 16 | #else 17 | 18 | #include 19 | #include 20 | #if (CUDART_VERSION >= 11000) 21 | #include 22 | #endif 23 | #if (CUDART_VERSION >= 11080) 24 | #include 25 | #endif 26 | 27 | using __bfloat16 = __nv_bfloat16; 28 | using __bfloat162 = __nv_bfloat162; 29 | 30 | #endif 31 | 32 | #endif // MSCCLPP_GPU_DATA_TYPES_HPP_ 33 | -------------------------------------------------------------------------------- /include/mscclpp/memory_channel.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_MEMORY_CHANNEL_HPP_ 5 | #define MSCCLPP_MEMORY_CHANNEL_HPP_ 6 | 7 | #include 8 | 9 | #include "core.hpp" 10 | #include "memory_channel_device.hpp" 11 | #include "semaphore.hpp" 12 | 13 | namespace mscclpp { 14 | 15 | /// Memory channel without specifying source/destination memory regions. 16 | struct BaseMemoryChannel { 17 | protected: 18 | std::shared_ptr semaphore_; 19 | 20 | public: 21 | /// Default constructor. 22 | BaseMemoryChannel() = default; 23 | 24 | /// Constructor. 25 | /// @param semaphore The semaphore used to synchronize the communication. 26 | BaseMemoryChannel(std::shared_ptr semaphore); 27 | 28 | BaseMemoryChannel(const BaseMemoryChannel& other) = default; 29 | 30 | BaseMemoryChannel& operator=(BaseMemoryChannel& other) = default; 31 | 32 | /// Device-side handle for @ref BaseMemoryChannel. 33 | using DeviceHandle = BaseMemoryChannelDeviceHandle; 34 | 35 | /// Returns the device-side handle. 36 | /// 37 | /// User should make sure the BaseMemoryChannel is not released when using the returned handle. 38 | /// 39 | DeviceHandle deviceHandle() const; 40 | }; 41 | 42 | /// Channel for accessing peer memory directly from GPU threads. 43 | struct MemoryChannel : public BaseMemoryChannel { 44 | private: 45 | RegisteredMemory dst_; 46 | void* src_; 47 | void* packetBuffer_; 48 | 49 | public: 50 | /// Default constructor. 51 | MemoryChannel() = default; 52 | 53 | /// Constructor. 54 | /// @param semaphore The semaphore used to synchronize the communication. 55 | /// @param dst Registered memory of the destination. 56 | /// @param src The source memory address. 57 | /// @param packetBuffer A buffer used to store packets. @p packetBuffer is optional and if it is nullptr, 58 | /// unpackPacket() and unpackPackets() methods are not available. 59 | MemoryChannel(std::shared_ptr semaphore, RegisteredMemory dst, void* src, 60 | void* packetBuffer = nullptr); 61 | 62 | /// Device-side handle for @ref MemoryChannel. 63 | using DeviceHandle = MemoryChannelDeviceHandle; 64 | 65 | /// Returns the device-side handle. 66 | /// 67 | /// User should make sure the MemoryChannel is not released when using the returned handle. 68 | /// 69 | DeviceHandle deviceHandle() const; 70 | }; 71 | 72 | /// @deprecated Use @ref MemoryChannel instead. 
73 | [[deprecated("Use MemoryChannel instead.")]] typedef MemoryChannel SmChannel; 74 | 75 | } // namespace mscclpp 76 | 77 | #endif // MSCCLPP_MEMORY_CHANNEL_HPP_ 78 | -------------------------------------------------------------------------------- /include/mscclpp/npkit/npkit_event.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef NPKIT_EVENT_H_ 5 | #define NPKIT_EVENT_H_ 6 | 7 | #define NPKIT_EVENT_INVALID 0x0 8 | 9 | #define NPKIT_EVENT_TIME_SYNC_GPU 0x1 10 | #define NPKIT_EVENT_TIME_SYNC_CPU 0x2 11 | 12 | #define NPKIT_EVENT_CONN_CUDA_IPC_WRITE_ENTRY 0x3 13 | #define NPKIT_EVENT_CONN_CUDA_IPC_WRITE_EXIT 0x4 14 | #define NPKIT_EVENT_CONN_CUDA_IPC_UPDATE_AND_SYNC_ENTRY 0x5 15 | #define NPKIT_EVENT_CONN_CUDA_IPC_UPDATE_AND_SYNC_EXIT 0x6 16 | #define NPKIT_EVENT_CONN_CUDA_IPC_FLUSH_ENTRY 0x7 17 | #define NPKIT_EVENT_CONN_CUDA_IPC_FLUSH_EXIT 0x8 18 | 19 | #define NPKIT_EVENT_CONN_IB_WRITE_ENTRY 0x9 20 | #define NPKIT_EVENT_CONN_IB_WRITE_EXIT 0xA 21 | #define NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_ENTRY 0xB 22 | #define NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT 0xC 23 | #define NPKIT_EVENT_CONN_IB_FLUSH_ENTRY 0xD 24 | #define NPKIT_EVENT_CONN_IB_FLUSH_EXIT 0xE 25 | 26 | #define NPKIT_EVENT_CONN_ETH_WRITE_ENTRY 0xF 27 | #define NPKIT_EVENT_CONN_ETH_WRITE_EXIT 0x10 28 | #define NPKIT_EVENT_CONN_ETH_UPDATE_AND_SYNC_ENTRY 0x11 29 | #define NPKIT_EVENT_CONN_ETH_UPDATE_AND_SYNC_EXIT 0x12 30 | #define NPKIT_EVENT_CONN_ETH_FLUSH_ENTRY 0x13 31 | #define NPKIT_EVENT_CONN_ETH_FLUSH_EXIT 0x14 32 | #define NPKIT_EVENT_CONN_ETH_RECV_META_ENTRY 0x15 33 | #define NPKIT_EVENT_CONN_ETH_RECV_META_EXIT 0x16 34 | #define NPKIT_EVENT_CONN_ETH_RECV_DATA_ENTRY 0x17 35 | #define NPKIT_EVENT_CONN_ETH_RECV_DATA_EXIT 0x18 36 | 37 | #define NPKIT_EVENT_EXECUTOR_INIT_ENTRY 0x19 38 | #define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x1A 39 | 40 | #define NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY 0x1B 41 | #define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x1C 42 | 43 | #define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1D 44 | #define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x31 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /include/mscclpp/npkit/npkit_struct.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef NPKIT_STRUCT_H_ 5 | #define NPKIT_STRUCT_H_ 6 | 7 | #include 8 | 9 | #pragma pack(push, 1) 10 | 11 | union NpKitEvent { 12 | uint64_t bits[2]; 13 | struct { 14 | uint64_t type : 8; 15 | uint64_t size : 32; 16 | uint64_t rsvd : 24; 17 | uint64_t timestamp; 18 | } fields; 19 | }; 20 | 21 | struct NpKitEventCollectContext { 22 | NpKitEvent* event_buffer; 23 | uint64_t event_buffer_head; 24 | }; 25 | 26 | #pragma pack(pop) 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /include/mscclpp/numa.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
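// Usage sketch (hypothetical device index; illustrative only): bind the calling
// thread to the NUMA node closest to a GPU before allocating host buffers for it:
//   mscclpp::numaBind(mscclpp::getDeviceNumaNode(cudaDev));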
3 | 4 | #ifndef MSCCLPP_NUMA_HPP_ 5 | #define MSCCLPP_NUMA_HPP_ 6 | 7 | namespace mscclpp { 8 | 9 | int getDeviceNumaNode(int cudaDev); 10 | void numaBind(int node); 11 | 12 | } // namespace mscclpp 13 | 14 | #endif // MSCCLPP_NUMA_HPP_ 15 | -------------------------------------------------------------------------------- /include/mscclpp/nvls.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_NVLS_HPP_ 5 | #define MSCCLPP_NVLS_HPP_ 6 | 7 | #include <mscclpp/gpu_utils.hpp> 8 | #include <mscclpp/nvls_device.hpp> 9 | 10 | namespace mscclpp { 11 | 12 | class NvlsConnection { 13 | public: 14 | NvlsConnection(size_t bufferSize, int numDevices); 15 | NvlsConnection(const std::vector<char>& data); 16 | NvlsConnection() = delete; 17 | std::vector<char> serialize(); 18 | 19 | // Everyone needs to synchronize after creating an NVLS connection before adding devices 20 | void addDevice(); 21 | void addDevice(int cudaDeviceId); 22 | 23 | struct DeviceMulticastPointer { 24 | private: 25 | void* devicePtr_; 26 | std::shared_ptr<char> mcPtr_; 27 | size_t bufferSize_; 28 | 29 | public: 30 | using DeviceHandle = DeviceMulticastPointerDeviceHandle; 31 | DeviceMulticastPointer(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize) 32 | : devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {} 33 | DeviceHandle deviceHandle() const; 34 | void* getDevicePtr(); 35 | 36 | friend class NvlsConnection; 37 | }; 38 | 39 | /// @brief Bind the memory allocated via @ref mscclpp::GpuBuffer to the multicast handle. The behavior 40 | /// is undefined if the devicePtr is not allocated by @ref mscclpp::GpuBuffer. 41 | /// @param devicePtr The device pointer returned by `mscclpp::GpuBuffer::data()`. 42 | /// @param size The bytes of the memory to bind to the multicast handle. 43 | /// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize 44 | DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size); 45 | 46 | size_t getMultiCastMinGranularity(); 47 | 48 | private: 49 | class Impl; 50 | std::shared_ptr<Impl> pimpl_; 51 | }; 52 | 53 | class Communicator; 54 | 55 | /// Connect to NVLS on setup. 56 | /// 57 | /// This function is used to connect to NVLS on setup. NVLS collectives use multicast operations to send/recv data. 58 | /// All involved ranks must be put into the collective group. 59 | /// 60 | /// @param comm The communicator. 61 | /// @param allRanks The ranks of all processes involved in the collective. 62 | /// @param bufferSize The size in bytes of the multicast buffer. 63 | /// @return A shared pointer to the NVLS connection. 64 | std::shared_ptr<NvlsConnection> connectNvlsCollective(std::shared_ptr<Communicator> comm, std::vector<int> allRanks, 65 | size_t bufferSize); 66 | 67 | } // namespace mscclpp 68 | 69 | #endif // MSCCLPP_NVLS_HPP_ 70 | -------------------------------------------------------------------------------- /include/mscclpp/poll_device.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_POLL_DEVICE_HPP_ 5 | #define MSCCLPP_POLL_DEVICE_HPP_ 6 | 7 | #include "assert_device.hpp" 8 | 9 | #if defined(MSCCLPP_DEVICE_COMPILE) 10 | 11 | // If a spin exceeds its maximum count, fail a device-side assertion in debug builds (release builds keep spinning). 
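// Example (hypothetical flag pointer; illustrative only): spin until *flag
// becomes nonzero, tripping the device assertion after 1M spins in debug builds:
//   POLL_MAYBE_JAILBREAK((atomicLoad(flag, memoryOrderAcquire) == 0), 1000000);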
12 | #define POLL_MAYBE_JAILBREAK(__cond, __max_spin_cnt) \ 13 | do { \ 14 | [[maybe_unused]] int64_t __spin_cnt = 0; \ 15 | while (__cond) { \ 16 | MSCCLPP_ASSERT_DEVICE((__max_spin_cnt < 0 || __spin_cnt++ != __max_spin_cnt), #__cond); \ 17 | } \ 18 | } while (0); 19 | 20 | // Same as POLL_MAYBE_JAILBREAK, except that __cond1 is checked before __cond2. 21 | // This is especially useful when __cond1 is faster to check. 22 | #define OR_POLL_MAYBE_JAILBREAK(__cond1, __cond2, __max_spin_cnt) \ 23 | do { \ 24 | [[maybe_unused]] int64_t __spin_cnt = 0; \ 25 | while (true) { \ 26 | if (!(__cond1)) { \ 27 | break; \ 28 | } else if (!(__cond2)) { \ 29 | break; \ 30 | } \ 31 | MSCCLPP_ASSERT_DEVICE((__max_spin_cnt < 0 || __spin_cnt++ != __max_spin_cnt), #__cond1 #__cond2); \ 32 | } \ 33 | } while (0); 34 | 35 | #endif // defined(MSCCLPP_DEVICE_COMPILE) 36 | 37 | #endif // MSCCLPP_POLL_DEVICE_HPP_ 38 | -------------------------------------------------------------------------------- /include/mscclpp/proxy.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_PROXY_HPP_ 5 | #define MSCCLPP_PROXY_HPP_ 6 | 7 | #include <functional> 8 | #include <memory> 9 | 10 | #include "fifo.hpp" 11 | 12 | namespace mscclpp { 13 | 14 | enum class ProxyHandlerResult { 15 | Continue, 16 | FlushFifoTailAndContinue, 17 | Stop, 18 | }; 19 | 20 | class Proxy; 21 | using ProxyHandler = std::function<ProxyHandlerResult(ProxyTrigger)>; 22 | 23 | class Proxy { 24 | public: 25 | Proxy(ProxyHandler handler, std::function<void()> threadInit, size_t fifoSize = DEFAULT_FIFO_SIZE); 26 | Proxy(ProxyHandler handler, size_t fifoSize = DEFAULT_FIFO_SIZE); 27 | ~Proxy(); 28 | 29 | void start(); 30 | void stop(); 31 | 32 | /// This is a concurrent FIFO to which multiple device threads can produce 33 | /// work elements and from which the sole proxy thread consumes them. 34 | /// @return the fifo 35 | Fifo& fifo(); 36 | 37 | private: 38 | struct Impl; 39 | std::unique_ptr<Impl> pimpl; 40 | }; 41 | 42 | } // namespace mscclpp 43 | 44 | #endif // MSCCLPP_PROXY_HPP_ 45 | -------------------------------------------------------------------------------- /include/mscclpp/utils.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_UTILS_HPP_ 5 | #define MSCCLPP_UTILS_HPP_ 6 | 7 | #include <chrono> 8 | #include <cstdint> 9 | #include <string> 10 | 11 | namespace mscclpp { 12 | 13 | struct Timer { 14 | std::chrono::steady_clock::time_point start_; 15 | int timeout_; 16 | 17 | Timer(int timeout = -1); 18 | 19 | ~Timer(); 20 | 21 | /// Returns the elapsed time in microseconds. 22 | int64_t elapsed() const; 23 | 24 | void set(int timeout); 25 | 26 | void reset(); 27 | 28 | void print(const std::string& name); 29 | }; 30 | 31 | struct ScopedTimer : public Timer { 32 | const std::string name_; 33 | 34 | ScopedTimer(const std::string& name); 35 | 36 | ~ScopedTimer(); 37 | }; 38 | 39 | std::string getHostName(int maxlen, const char delim); 40 | 41 | /// Get the number of available InfiniBand devices. 42 | /// 43 | /// @return The number of available InfiniBand devices. 44 | int getIBDeviceCount(); 45 | 46 | /// Get the name of the InfiniBand device associated with the specified transport. 47 | /// 48 | /// @param ibTransport The InfiniBand transport to get the device name for. 49 | /// @return The name of the InfiniBand device associated with the specified transport. 
50 | std::string getIBDeviceName(Transport ibTransport); 51 | 52 | /// Get the InfiniBand transport associated with the specified device name. 53 | /// 54 | /// @param ibDeviceName The name of the InfiniBand device to get the transport for. 55 | /// @return The InfiniBand transport associated with the specified device name. 56 | Transport getIBTransportByDeviceName(const std::string& ibDeviceName); 57 | 58 | } // namespace mscclpp 59 | 60 | #endif // MSCCLPP_UTILS_HPP_ 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | [build-system] 5 | requires = ["scikit-build-core"] 6 | build-backend = "scikit_build_core.build" 7 | 8 | [project] 9 | name = "mscclpp" 10 | version = "0.6.0" 11 | 12 | [tool.scikit-build] 13 | cmake.version = ">=3.25.0" 14 | cmake.build-type = "Release" 15 | # for dlpack issue: https://github.com/microsoft/vcpkg/pull/44679 16 | cmake.args = ["-DCMAKE_POLICY_VERSION_MINIMUM=3.5"] 17 | build-dir = "build/{wheel_tag}" 18 | wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"] 19 | wheel.install-dir = "mscclpp" 20 | 21 | [tool.scikit-build.cmake.define] 22 | MSCCLPP_BUILD_PYTHON_BINDINGS = "ON" 23 | MSCCLPP_BUILD_TESTS = "OFF" 24 | 25 | [tool.black] 26 | line-length = 120 27 | target-version = ['py38'] 28 | include = '\.pyi?$' 29 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | add_subdirectory(mscclpp) 5 | add_subdirectory(test) 6 | 7 | add_custom_target(pytest_lib_copy ALL 8 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 9 | ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so 10 | ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp 11 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 12 | ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so 13 | ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp 14 | DEPENDS mscclpp_py mscclpp_py_test 15 | ) 16 | -------------------------------------------------------------------------------- /python/examples/allgather_allpairs_multinodes_packets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllGather 7 | from mscclpp.language.buffer import Buffer 8 | from mscclpp.language.types import ChannelType, ReplicationPolicy 9 | 10 | 11 | def allgather_multinodes_allpair(gpus, gpus_per_node, instances): 12 | """ 13 | Implements a multi-node allgather collective using an allpairs algorithm with MSCCL++ DSL. 14 | @param gpus: Total number of GPUs 15 | @param gpus_per_node: Number of GPUs per node 16 | Steps: 17 | 1. Each rank sends a chunk to all other ranks' scratch buffers using packet format. 18 | 2. Copy the chunk from the scratch buffer to the output buffer using packet format. 
19 | """ 20 | collective = AllGather(gpus, 1, True) 21 | with MSCCLPPProgram( 22 | "allgather_multinodes_allpair", 23 | collective, 24 | gpus, 25 | instances, 26 | protocol="LL", 27 | replication_policy=ReplicationPolicy.interleaved, 28 | num_threads_per_block=1024, 29 | ): 30 | for g in range(gpus): 31 | src_rank = g 32 | c = chunk(src_rank, Buffer.input, 0, 1) 33 | for peer in range(1, gpus): 34 | dst_rank = (src_rank + peer) % gpus 35 | tb = dst_rank if dst_rank < src_rank else dst_rank - 1 36 | if src_rank // gpus_per_node == dst_rank // gpus_per_node: 37 | c.put_packet(dst_rank, Buffer.scratch, index=src_rank, sendtb=tb) 38 | else: 39 | c.put_packet( 40 | dst_rank, 41 | Buffer.scratch, 42 | index=src_rank, 43 | sendtb=tb, 44 | chan_type=ChannelType.port, 45 | temp_buffer=Buffer.scratch, 46 | temp_buffer_index=src_rank, 47 | ) 48 | 49 | # Copying packet from local scratch buffer to local buffer 50 | for g in range(gpus): 51 | src_rank = g 52 | src_offset = src_rank 53 | for peer in range(1, gpus): 54 | dst_rank = (g + peer) % gpus 55 | tb = src_offset if src_offset < dst_rank else src_offset - 1 56 | c = chunk(dst_rank, Buffer.scratch, src_offset, 1) 57 | c.copy_packet(dst_rank, Buffer.output, src_offset, sendtb=tb + gpus - 1) 58 | 59 | Json() 60 | Check() 61 | 62 | 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("num_gpus", type=int, help="number of gpus") 65 | parser.add_argument("gpus_per_node", type=int, help="number of gpus") 66 | parser.add_argument("instances", type=int, help="number of instances") 67 | 68 | args = parser.parse_args() 69 | 70 | allgather_multinodes_allpair( 71 | args.num_gpus, 72 | args.gpus_per_node, 73 | args.instances, 74 | ) 75 | -------------------------------------------------------------------------------- /python/examples/allgather_barrier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mscclpp.language import * 3 | from mscclpp.language.buffer import Buffer 4 | from mscclpp.language.collectives import AllGather 5 | from mscclpp.language.types import ChannelType, ReplicationPolicy 6 | 7 | 8 | def allgather_test(gpus, instances): 9 | """ 10 | Demonstrates how to use barrier in the MSCCL++ DSL with an allgather collective. 11 | This example uses an allpairs algorithm for the allgather operation. 12 | Steps: 13 | 1. Each rank sends a chunk to all other ranks' output buffers and copies the chunk to its own output buffer. 14 | 2. A barrier is called to synchronize the send and copy operations, and signal peers that the data has been sent. 15 | 3. Wait for all the chunks from other ranks to be received. 
16 | """ 17 | size = gpus 18 | collective = AllGather(size, 1, False) 19 | with MSCCLPPProgram( 20 | "allgather_with_barrier", 21 | collective, 22 | size, 23 | instances, 24 | protocol="Simple", 25 | replication_policy=ReplicationPolicy.interleaved, 26 | ): 27 | for n in range(gpus): 28 | c = chunk(n, Buffer.input, 0, 1) 29 | for peer in range(gpus): 30 | if n != peer: 31 | c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) 32 | else: 33 | c.copy(n, Buffer.output, n, sendtb=peer) 34 | # explicit barrier 35 | r = rank(n) 36 | r.barrier(tb_list=list(range(gpus))) 37 | for peer in range(gpus): 38 | if n != peer: 39 | c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) 40 | 41 | for n in range(gpus): 42 | for peer in range(gpus): 43 | c = chunk(n, Buffer.output, peer, 1) 44 | if n != peer: 45 | c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.memory) 46 | 47 | Json() 48 | Check() 49 | 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("num_gpus", type=int, help="number of gpus") 53 | parser.add_argument("instances", type=int, help="number of instances") 54 | args = parser.parse_args() 55 | allgather_test(args.num_gpus, args.instances) 56 | -------------------------------------------------------------------------------- /python/examples/allreduce_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_allpairs(gpus, instances, protocol): 11 | """ 12 | Demonstrate allreduce with all pairs algorithm using put semantics. 13 | Steps: 14 | 1. Sync all ranks to ensure the data is ready. 15 | 2. Each rank reads chunks from all peers and reduces the data. 16 | 3. Put the reduced data to all peers. 17 | 4. Sync all ranks to ensure the data is received. 
18 | """ 19 | size = gpus 20 | chunksperloop = gpus * gpus 21 | collective = AllReduce(size, chunksperloop, True) 22 | with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol=protocol): 23 | for rank in range(size): 24 | for tb in range(size): 25 | index = rank * size 26 | c = chunk(rank, Buffer.input, index + tb) 27 | # step1 make sure the data is ready 28 | for nghr in range(size): 29 | peer_index = nghr * size 30 | if rank != nghr: 31 | # signal peer the buffer is ready 32 | c_peer = chunk(rank, Buffer.input, peer_index + tb) 33 | c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb) 34 | for nghr in range(size): 35 | if rank != nghr: 36 | c.wait(nghr, Buffer.input, index + tb, recvtb=tb) 37 | # step2 reduce the chunks and send to peers 38 | for nghr in range(size): 39 | if rank != nghr: 40 | c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb) 41 | for nghr in range(size): 42 | if rank != nghr: 43 | c.put(nghr, Buffer.input, index + tb, sendtb=tb) 44 | # step3 signal the peers buffer is ready 45 | for nghr in range(size): 46 | if rank != nghr: 47 | c.signal(nghr, Buffer.input, index + tb, sendtb=tb) 48 | for nghr in range(size): 49 | if rank != nghr: 50 | peer_index = nghr * size 51 | c_peer = chunk(rank, Buffer.input, peer_index + tb) 52 | c_peer.wait(nghr, Buffer.input, peer_index + tb, recvtb=tb) 53 | 54 | Json() 55 | Check() 56 | 57 | 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("num_gpus", type=int, help="number of gpus") 60 | parser.add_argument("instances", type=int, help="number of instances") 61 | parser.add_argument("--protocol", type=str, default="Simple", choices=["Simple"], help="Protocol") 62 | 63 | args = parser.parse_args() 64 | 65 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) 66 | -------------------------------------------------------------------------------- /python/examples/allreduce_allpairs_packet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_allpairs(gpus, instances): 11 | """ 12 | AllReduce with all pairs algorithm using packets format. 13 | Steps: 14 | 1. Each rank sends its nth chunk to the nth rank's scratch space. 15 | 2. Each rank performs a local reduction on its nth chunk using data from all other ranks' scratch spaces. 16 | 3. Each rank sends the reduced data to all other ranks' scratch spaces. 17 | 4. Each rank retrieves the final reduced result from the scratch space. 
18 | """ 19 | size = gpus 20 | chunksperloop = gpus * gpus 21 | collective = AllReduce(size, chunksperloop, True) 22 | with MSCCLPPProgram( 23 | "allreduce_packets", 24 | collective, 25 | size, 26 | instances, 27 | protocol="LL", 28 | use_double_scratch_buffer=True, 29 | ): 30 | # Each rank sends the nth chunk to the nth rank into scratch space 31 | for r1 in range(size): 32 | for tb in range(size): 33 | if tb == r1: 34 | continue 35 | remote_rank = tb 36 | index = remote_rank * size 37 | c = chunk(r1, Buffer.input, index, size) 38 | c.put_packet(remote_rank, Buffer.scratch, index=r1 * size, sendtb=tb) 39 | 40 | # Each rank performs a local reduction on the nth chunk 41 | # Utilize 8 threadblocks for this reduction for better parallelism 42 | for r in range(size): 43 | for index in range(size): 44 | c = chunk(r, Buffer.input, r * size + index) 45 | for peer in range(size): 46 | if peer != r: 47 | c.reduce_packet(chunk(r, Buffer.scratch, peer * size + index), recvtb=index) 48 | for peer in range(size): 49 | if peer != r: 50 | c.put_packet(peer, Buffer.scratch, (size * size) + r * size + index, sendtb=index) 51 | 52 | # Each rank get final result from scratch space 53 | for r in range(size): 54 | for peer in range(size): 55 | if peer != r: 56 | c = chunk(r, Buffer.scratch, size * size + peer * size, size) 57 | c.copy_packet(r, Buffer.input, peer * size, sendtb=peer) 58 | 59 | Json() 60 | Check() 61 | 62 | 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("num_gpus", type=int, help="number of gpus") 65 | parser.add_argument("instances", type=int, help="number of instances") 66 | 67 | args = parser.parse_args() 68 | 69 | allreduce_allpairs(args.num_gpus, args.instances) 70 | -------------------------------------------------------------------------------- /python/examples/allreduce_nvls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_nvls(gpus, instances): 11 | """ 12 | Allreduce via NVLS channel 13 | Steps: 14 | 1. Sync all the ranks to make sure the data is ready. 15 | 2. Call group_load_reduce to reduce the data. 16 | 3. Call group_store to propagate the data to all the ranks. 
17 | """ 18 | size = gpus 19 | chunksperloop = gpus 20 | collective = AllReduce(size, chunksperloop, True) 21 | with MSCCLPPProgram( 22 | "allreduce_nvls", 23 | collective, 24 | size, 25 | instances, 26 | ): 27 | # Each rank sends the nth chunk to the nth rank into scratch space 28 | for rank in range(size): 29 | index = rank 30 | c = chunk(rank, Buffer.input, index) 31 | reduce_chunks = [] 32 | # make sure the data is ready 33 | for nghr in range(size): 34 | if rank != nghr: 35 | c_peer = chunk(nghr, Buffer.input, index) 36 | reduce_chunks.append(c_peer) 37 | c.signal(nghr, Buffer.input, index, sendtb=0) 38 | for nghr in range(size): 39 | if rank != nghr: 40 | c.wait(nghr, Buffer.input, index, recvtb=0) 41 | c = c.group_load_reduce(reduce_chunks, recvtb=0) 42 | ngbrs = [nghr for nghr in range(size) if nghr != rank] 43 | c.group_store(ngbrs, sendtb=0) 44 | 45 | Json() 46 | Check() 47 | 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("num_gpus", type=int, help="number of gpus") 51 | parser.add_argument("instances", type=int, help="number of instances") 52 | 53 | args = parser.parse_args() 54 | 55 | allreduce_nvls(args.num_gpus, args.instances) 56 | -------------------------------------------------------------------------------- /python/examples/allreduce_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import AllReduce 7 | from mscclpp.language.buffer import Buffer 8 | 9 | 10 | def allreduce_ring(size, instances): 11 | """ 12 | Implements a ring based allreduce. 13 | Steps: 14 | 1. Send signal to next rank and wait for signal from previous rank. Make sure the data is ready in previous rank. 15 | 2. Reduce the data and send to next rank. 16 | 3. After all the data is reduced, propagate the data to all the ranks. 
17 | """ 18 | collective = AllReduce(size, size, True) 19 | with MSCCLPPProgram( 20 | f"allreduce_ring", 21 | collective, 22 | size, 23 | instances, 24 | protocol="Simple", 25 | ): 26 | # Reduce ring 27 | for step in range(0, size - 1): 28 | for index in range(0, size): 29 | rank = (index + step) % size 30 | next_rank = (index + step + 1) % size 31 | c = chunk(rank, Buffer.input, index) 32 | c.signal(next_rank, Buffer.input, index, 0) 33 | prev_rank = (index + step - 1) % size 34 | c = chunk(rank, Buffer.input, (index + size - 1) % size) 35 | c.wait(prev_rank, Buffer.input, (index + size - 1) % size, 0) 36 | c.reduce(chunk(prev_rank, Buffer.input, (index + size - 1) % size), recvtb=0) 37 | 38 | # Propagate ring 39 | for step in range(-1, size - 2): 40 | for index in range(0, size): 41 | rank = (index + step) % size 42 | c = chunk(rank, Buffer.input, index) 43 | next_rank = (index + step + 1) % size 44 | c.put(next_rank, Buffer.input, index, sendtb=0) 45 | c.signal(next_rank, Buffer.input, index, 0) 46 | prev_rank = (index + step - 1) % size 47 | c = chunk(rank, Buffer.input, (index + size - 1) % size) 48 | c.wait(prev_rank, Buffer.input, (index + size - 1) % size, 0) 49 | 50 | Json() 51 | Check() 52 | 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("num_gpus", type=int, help="number of gpus") 56 | parser.add_argument("instances", type=int, help="number of instances") 57 | args = parser.parse_args() 58 | 59 | allreduce_ring(args.num_gpus, args.instances) 60 | -------------------------------------------------------------------------------- /python/examples/send_recv_packet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.collectives import SendRecv 7 | from mscclpp.language.buffer import Buffer 8 | from mscclpp.language.types import ChannelType 9 | 10 | 11 | def send_recv(instances): 12 | """ 13 | Send and receive data between two ranks using port channels, with LL protocol and double scratch buffer. 14 | Steps: 15 | 1. Each rank sends a chunk to every other rank's scratch buffer with packet format via port channel. 16 | 2. Wait for the data to be received, then copy it to the output buffer. 17 | """ 18 | size = 2 19 | chunksperloop = 1 20 | collective = SendRecv(size, chunksperloop, False) 21 | with MSCCLPPProgram( 22 | "send_recv", 23 | collective, 24 | size, 25 | instances, 26 | protocol="LL", 27 | use_double_scratch_buffer=True, 28 | ): 29 | for r in range(size): 30 | for nghr in range(size): 31 | if nghr == r: 32 | continue 33 | c = chunk(r, Buffer.input, 0) 34 | c.put_packet( 35 | nghr, 36 | Buffer.scratch, 37 | 1, 38 | sendtb=0, 39 | chan_type=ChannelType.port, 40 | temp_buffer=Buffer.scratch, 41 | temp_buffer_index=0, 42 | ) 43 | 44 | for r in range(size): 45 | c = chunk(r, Buffer.scratch, 1) 46 | c.copy_packet(r, Buffer.output, 0, sendtb=0) 47 | 48 | Json() 49 | Check() 50 | 51 | 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("instances", type=int, help="number of instances") 54 | 55 | args = parser.parse_args() 56 | 57 | send_recv(args.instances) 58 | -------------------------------------------------------------------------------- /python/examples/send_recv_proxy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import argparse 5 | from mscclpp.language import * 6 | from mscclpp.language.buffer import Buffer 7 | from mscclpp.language.collectives import SendRecv 8 | from mscclpp.language.types import ChannelType 9 | 10 | 11 | def send_recv(instances): 12 | """ 13 | Send and receive data between two ranks using port channels. 14 | steps: 15 | 1. Each rank sends a chunk to the other rank's scratch buffer and signals the other rank that the data has been sent. 16 | 2. Wait for the data to be received then copy it to the output buffer. 17 | """ 18 | size = 2 19 | chunksperloop = 1 20 | collective = SendRecv(size, chunksperloop, False) 21 | with MSCCLPPProgram( 22 | "send_recv", 23 | collective, 24 | size, 25 | instances, 26 | ): 27 | for r in range(size): 28 | for nghr in range(size): 29 | if nghr == r: 30 | continue 31 | c = chunk(r, Buffer.input, 0) 32 | c.put( 33 | nghr, 34 | Buffer.scratch, 35 | 1, 36 | sendtb=0, 37 | chan_type=ChannelType.port, 38 | ) 39 | c.signal(nghr, Buffer.scratch, 1, sendtb=0, chan_type=ChannelType.port) 40 | c.flush(nghr, Buffer.scratch, 1, sendtb=0, chan_type=ChannelType.port) 41 | 42 | for r in range(size): 43 | c = chunk(r, Buffer.scratch, 1) 44 | c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.port) 45 | c.copy(r, Buffer.output, 0, sendtb=0) 46 | 47 | Json() 48 | Check() 49 | 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("instances", type=int, help="number of instances") 53 | 54 | args = parser.parse_args() 55 | 56 | send_recv(args.instances) 57 | -------------------------------------------------------------------------------- /python/mscclpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) 5 | include(FetchContent) 6 | FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) 7 | FetchContent_MakeAvailable(nanobind) 8 | 9 | FetchContent_Declare(dlpack GIT_REPOSITORY https://github.com/dmlc/dlpack.git GIT_TAG v1.1) 10 | FetchContent_MakeAvailable(dlpack) 11 | 12 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) 13 | nanobind_add_module(mscclpp_py ${SOURCES}) 14 | set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp) 15 | target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp_static ${GPU_LIBRARIES}) 16 | target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) 17 | install(TARGETS mscclpp_py LIBRARY DESTINATION .) 18 | -------------------------------------------------------------------------------- /python/mscclpp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | import os 5 | import warnings 6 | from functools import wraps 7 | 8 | from ._mscclpp import ( 9 | Env, 10 | ErrorCode, 11 | BaseError, 12 | Error, 13 | SysError, 14 | CudaError, 15 | CuError, 16 | IbError, 17 | Communicator, 18 | Connection, 19 | connect_nvls_collective, 20 | EndpointConfig, 21 | Fifo, 22 | Host2DeviceSemaphore, 23 | Host2HostSemaphore, 24 | numa, 25 | ProxyService, 26 | RegisteredMemory, 27 | PortChannel, 28 | MemoryChannel, 29 | MemoryDevice2DeviceSemaphore, 30 | TcpBootstrap, 31 | Transport, 32 | TransportFlags, 33 | DataType, 34 | Executor, 35 | ExecutionPlan, 36 | PacketType, 37 | RawGpuBuffer, 38 | env, 39 | version, 40 | is_nvls_supported, 41 | npkit, 42 | ) 43 | 44 | 45 | __all__ = [ 46 | "Communicator", 47 | "Connection", 48 | "connect_nvls_collective", 49 | "EndpointConfig", 50 | "Fifo", 51 | "Host2DeviceSemaphore", 52 | "Host2HostSemaphore", 53 | "numa", 54 | "ProxyService", 55 | "RegisteredMemory", 56 | "PortChannel", 57 | "MemoryChannel", 58 | "MemoryDevice2DeviceSemaphore", 59 | "TcpBootstrap", 60 | "Transport", 61 | "TransportFlags", 62 | "DataType", 63 | "Executor", 64 | "ExecutionPlan", 65 | "PacketType", 66 | "version", 67 | "is_nvls_supported", 68 | "alloc_shared_physical_cuda", 69 | "npkit", 70 | "__version__", 71 | "get_include", 72 | "get_lib", 73 | ### Deprecated ### 74 | "ProxyChannel", 75 | "SmChannel", 76 | "SmDevice2DeviceSemaphore", 77 | ] 78 | 79 | __version__: str = str(version()) 80 | 81 | if os.environ.get("MSCCLPP_HOME", None) is None: 82 | os.environ["MSCCLPP_HOME"] = os.path.abspath(os.path.dirname(__file__)) 83 | 84 | 85 | def get_include() -> str: 86 | """Return the directory that contains the MSCCL++ headers.""" 87 | return os.path.join(os.path.dirname(__file__), "include") 88 | 89 | 90 | def get_lib() -> str: 91 | """Return the directory that contains the MSCCL++ headers.""" 92 | return os.path.join(os.path.dirname(__file__), "lib") 93 | 94 | 95 | def deprecated(new_cls): 96 | def decorator(old_cls): 97 | @wraps(old_cls) 98 | def wrapper(*args, **kwargs): 99 | warnings.warn( 100 | f"{old_cls.__name__} is deprecated, use {new_cls.__name__} instead.", 101 | DeprecationWarning, 102 | ) 103 | return new_cls(*args, **kwargs) 104 | 105 | return wrapper 106 | 107 | return decorator 108 | 109 | 110 | @deprecated(PortChannel) 111 | class ProxyChannel(PortChannel): 112 | pass 113 | 114 | 115 | @deprecated(MemoryChannel) 116 | class SmChannel(MemoryChannel): 117 | pass 118 | 119 | 120 | @deprecated(MemoryDevice2DeviceSemaphore) 121 | class SmDevice2DeviceSemaphore(MemoryDevice2DeviceSemaphore): 122 | pass 123 | -------------------------------------------------------------------------------- /python/mscclpp/env_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace nb = nanobind; 11 | using namespace mscclpp; 12 | 13 | void register_env(nb::module_& m) { 14 | nb::class_(m, "Env") 15 | .def_ro("debug", &Env::debug) 16 | .def_ro("debug_subsys", &Env::debugSubsys) 17 | .def_ro("debug_file", &Env::debugFile) 18 | .def_ro("hca_devices", &Env::hcaDevices) 19 | .def_ro("hostid", &Env::hostid) 20 | .def_ro("socket_family", &Env::socketFamily) 21 | .def_ro("socket_ifname", &Env::socketIfname) 22 | .def_ro("comm_id", &Env::commId) 23 | .def_ro("execution_plan_dir", &Env::executionPlanDir) 24 | .def_ro("npkit_dump_dir", &Env::npkitDumpDir) 25 | .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); 26 | 27 | m.def("env", &env); 28 | } 29 | -------------------------------------------------------------------------------- /python/mscclpp/error_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | using namespace mscclpp; 11 | 12 | #define REGISTER_EXCEPTION_TRANSLATOR(name_) \ 13 | nb::register_exception_translator( \ 14 | [](const std::exception_ptr &p, void *payload) { \ 15 | try { \ 16 | std::rethrow_exception(p); \ 17 | } catch (const name_ &e) { \ 18 | PyErr_SetObject(reinterpret_cast(payload), \ 19 | PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \ 20 | } \ 21 | }, \ 22 | m.attr(#name_).ptr()); 23 | 24 | void register_error(nb::module_ &m) { 25 | nb::enum_(m, "ErrorCode") 26 | .value("SystemError", ErrorCode::SystemError) 27 | .value("InternalError", ErrorCode::InternalError) 28 | .value("RemoteError", ErrorCode::RemoteError) 29 | .value("InvalidUsage", ErrorCode::InvalidUsage) 30 | .value("Timeout", ErrorCode::Timeout) 31 | .value("Aborted", ErrorCode::Aborted) 32 | .value("ExecutorError", ErrorCode::ExecutorError); 33 | 34 | nb::exception(m, "BaseError"); 35 | REGISTER_EXCEPTION_TRANSLATOR(BaseError); 36 | 37 | nb::exception(m, "Error", m.attr("BaseError").ptr()); 38 | REGISTER_EXCEPTION_TRANSLATOR(Error); 39 | 40 | nb::exception(m, "SysError", m.attr("BaseError").ptr()); 41 | REGISTER_EXCEPTION_TRANSLATOR(SysError); 42 | 43 | nb::exception(m, "CudaError", m.attr("BaseError").ptr()); 44 | REGISTER_EXCEPTION_TRANSLATOR(CudaError); 45 | 46 | nb::exception(m, "CuError", m.attr("BaseError").ptr()); 47 | REGISTER_EXCEPTION_TRANSLATOR(CuError); 48 | 49 | nb::exception(m, "IbError", m.attr("BaseError").ptr()); 50 | REGISTER_EXCEPTION_TRANSLATOR(IbError); 51 | } 52 | -------------------------------------------------------------------------------- /python/mscclpp/executor_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace nb = nanobind; 12 | using namespace mscclpp; 13 | 14 | void register_executor(nb::module_& m) { 15 | nb::enum_(m, "DataType") 16 | .value("int32", DataType::INT32) 17 | .value("uint32", DataType::UINT32) 18 | .value("float16", DataType::FLOAT16) 19 | .value("float32", DataType::FLOAT32) 20 | .value("bfloat16", DataType::BFLOAT16); 21 | 22 | nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); 23 | 24 | nb::class_(m, "ExecutionPlan") 25 | .def(nb::init(), nb::arg("planPath")) 26 | .def("name", &ExecutionPlan::name) 27 | .def("collective", &ExecutionPlan::collective) 28 | .def("min_message_size", &ExecutionPlan::minMessageSize) 29 | .def("max_message_size", &ExecutionPlan::maxMessageSize); 30 | 31 | nb::class_(m, "Executor") 32 | .def(nb::init>(), nb::arg("comm")) 33 | .def( 34 | "execute", 35 | [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize, 36 | DataType dataType, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) { 37 | self->execute(rank, reinterpret_cast(sendbuff), reinterpret_cast(recvBuff), sendBuffSize, 38 | recvBuffSize, dataType, plan, (cudaStream_t)stream, packetType); 39 | }, 40 | nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"), 41 | nb::arg("dataType"), nb::arg("plan"), nb::arg("stream"), nb::arg("packetType") = PacketType::LL16); 42 | } 43 | -------------------------------------------------------------------------------- /python/mscclpp/fifo_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace nb = nanobind; 9 | using namespace mscclpp; 10 | 11 | void register_fifo(nb::module_& m) { 12 | nb::class_(m, "ProxyTrigger").def_rw("fst", &ProxyTrigger::fst).def_rw("snd", &ProxyTrigger::snd); 13 | 14 | nb::class_(m, "FifoDeviceHandle") 15 | .def_rw("triggers", &FifoDeviceHandle::triggers) 16 | .def_rw("tail_replica", &FifoDeviceHandle::tailReplica) 17 | .def_rw("head", &FifoDeviceHandle::head) 18 | .def_rw("size", &FifoDeviceHandle::size) 19 | .def_prop_ro("raw", [](const FifoDeviceHandle& self) -> nb::bytes { 20 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 21 | }); 22 | 23 | nb::class_(m, "Fifo") 24 | .def(nb::init(), nb::arg("size") = DEFAULT_FIFO_SIZE) 25 | .def("poll", &Fifo::poll) 26 | .def("pop", &Fifo::pop) 27 | .def("flush_tail", &Fifo::flushTail, nb::arg("sync") = false) 28 | .def("size", &Fifo::size) 29 | .def("device_handle", &Fifo::deviceHandle); 30 | } 31 | -------------------------------------------------------------------------------- /python/mscclpp/language/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from mscclpp.language.program import MSCCLPPProgram, Json, Check, chunk, rank 5 | -------------------------------------------------------------------------------- /python/mscclpp/language/buffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from enum import Enum 5 | 6 | 7 | # Scratch buffer slice with manual indexing 8 | class BufferSlice: 9 | def __init__(self, buf, name): 10 | self.name = name 11 | self.buf = buf 12 | self.offset = -1 # Offset into the global scratch buffer 13 | self.chunks = [] 14 | 15 | # Returns the global index into the scratch buffer 16 | def get_global_index(self, index): 17 | assert self.offset > -1, "set_offset needs to be called first" 18 | return self.offset + index 19 | 20 | def get_buffer(self): 21 | return self.buf 22 | 23 | def instance_size(self): 24 | return len(self.chunks) 25 | 26 | def set_offset(self, offset): 27 | self.offset = offset 28 | 29 | def __getitem__(self, index): 30 | return self.chunks[index] 31 | 32 | def __setitem__(self, index, value): 33 | current_size = len(self.chunks) 34 | while index > current_size: 35 | self.chunks.append(None) 36 | current_size = len(self.chunks) 37 | if index == current_size: 38 | self.chunks.append(value) 39 | else: 40 | self.chunks[index] = value 41 | 42 | def __len__(self): 43 | return len(self.chunks) 44 | 45 | 46 | class Buffer(Enum): 47 | input = "i" 48 | output = "o" 49 | scratch = "s" 50 | 51 | def __str__(self): 52 | return self.value 53 | 54 | def __lt__(self, other): 55 | return self.value < other.value 56 | 57 | def __gt__(self, other): 58 | return self.value < other.value 59 | -------------------------------------------------------------------------------- /python/mscclpp/language/chunk.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class Chunk: 10 | origin_rank: int # Rank the chunk initially started at 11 | origin_index: int # Index the chunk initially started at 12 | dst_rank: int = -1 13 | dst_index: int = -1 14 | 15 | def reduce(self, dst, chunk): 16 | if type(chunk) is ReduceChunk: 17 | return chunk.reduce(dst, self) 18 | elif type(chunk) is Chunk: 19 | chunks = [self, chunk] 20 | return ReduceChunk(dst, chunks) 21 | else: 22 | raise ValueError("Trying to reduce with chunk of None") 23 | 24 | def __hash__(self): 25 | return hash((self.origin_rank, self.origin_index)) 26 | 27 | def __eq__(self, other): 28 | return ( 29 | type(other) is Chunk and self.origin_rank == other.origin_rank and self.origin_index == other.origin_index 30 | ) 31 | 32 | def __lt__(self, other): 33 | return self.origin_rank < other.origin_rank or ( 34 | self.origin_rank == other.origin_rank and self.origin_index < other.origin_index 35 | ) 36 | 37 | 38 | @dataclass 39 | class ReduceChunk: 40 | creation_rank: int # Rank the Reduce Chunk is created. 
Necessary since the same ReduceChunk can be created on multiple ranks independently 41 | chunks: list # List of chunks reduced 42 | 43 | def reduce(self, dst, chunk): 44 | if type(chunk) is ReduceChunk: 45 | chunks = self.chunks + chunk.chunks 46 | elif type(chunk) is Chunk: 47 | chunks = self.chunks + [chunk] 48 | else: 49 | raise ValueError("Trying to reduce with chunk of None") 50 | return ReduceChunk(self.creation_rank, chunks) 51 | 52 | def sort(self): 53 | self.chunks.sort() 54 | 55 | def __hash__(self): 56 | self.sort() 57 | return hash((self.creation_rank,) + tuple(self.chunks)) 58 | 59 | # Two reduce chunks are equal if they contain the same list of 60 | # chunks being reduced 61 | def __eq__(self, other): 62 | self.sort() 63 | other.sort() 64 | return self.chunks == other.chunks 65 | -------------------------------------------------------------------------------- /python/mscclpp/language/dag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from mscclpp.language.dag.instruction_dag import InstructionDAG 5 | from mscclpp.language.dag.lower import DagLower 6 | from mscclpp.language.dag.optimizer import DagOptimizer 7 | -------------------------------------------------------------------------------- /python/mscclpp/language/rank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from dataclasses import dataclass, field 5 | from typing import Dict 6 | 7 | 8 | class BarrierInfo: 9 | def __init__(self, tb_list): 10 | self.tb_list = tb_list 11 | 12 | def __eq__(self, other): 13 | return self.tb_list == other.tb_list 14 | 15 | def __hash__(self): 16 | return hash(tuple(self.tb_list)) 17 | 18 | 19 | @dataclass 20 | class Rank: 21 | rank_id: int 22 | current_max_barrier_id: int = 0 23 | current_barriers: Dict[BarrierInfo, int] = field(default_factory=dict) 24 | 25 | def get_barrier_id(self, tb_list): 26 | barrier_info = BarrierInfo(tb_list) 27 | if barrier_info in self.current_barriers: 28 | return self.current_barriers[barrier_info] 29 | else: 30 | self.current_barriers[barrier_info] = self.current_max_barrier_id 31 | barrier_id = self.current_max_barrier_id 32 | self.current_max_barrier_id += 1 33 | return barrier_id 34 | -------------------------------------------------------------------------------- /python/mscclpp/language/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from mscclpp.language.types import Op 5 | 6 | 7 | def remove_op(op: Op): 8 | for p in op.prev: 9 | p.next.remove(op) 10 | p.next += op.next 11 | p.next = list(set(p.next)) 12 | 13 | for n in op.next: 14 | n.prev.remove(op) 15 | n.prev = op.prev.union(n.prev) 16 | 17 | op.next = [] 18 | op.prev = [] 19 | 20 | 21 | def merge_op(op: Op, other_op: Op): 22 | if other_op in op.next: 23 | op.next.remove(other_op) 24 | other_op.prev.remove(op) 25 | for p in other_op.prev: 26 | p.next.remove(other_op) 27 | p.next.append(op) 28 | 29 | for n in other_op.next: 30 | n.prev.remove(other_op) 31 | n.prev.add(op) 32 | 33 | op.prev = op.prev.union(other_op.prev) 34 | op.next = list(set(op.next + other_op.next)) 35 | 36 | 37 | def circular_dep_after_merge(op: Op, other_op: Op): 38 | root = set([op, other_op]) 39 | frontier = set(op.next) 40 | visited = set() 41 | if other_op in frontier: 42 | frontier.remove(other_op) 43 | frontier = list(frontier.union(other_op.next)) 44 | while len(frontier) > 0: 45 | current = frontier[0] 46 | for n in current.next: 47 | # The root node will be visited again if there is a circular dependency 48 | if n in root: 49 | return True 50 | if n not in visited: 51 | frontier.append(n) 52 | visited.add(n) 53 | frontier = frontier[1:] 54 | 55 | 56 | def all_prevs_visited_after_merge(op: Op, other_op: Op): 57 | """ 58 | For case: op2.prev = [op1, op3]. op1.next = [op2]. op3.next = [op2]. And op1 and op2 are satisfied to merge. 59 | We only apply the merge if all previous ops of op2 are visited. (op1 is the last previous op of op2). 60 | """ 61 | step = op.step 62 | for prev in other_op.prev: 63 | if prev.step > step: 64 | return False 65 | return True 66 | 67 | 68 | def same_tb(op1: Op, op2: Op): 69 | return op1.tb == op2.tb and op1.channel == op2.channel 70 | 71 | 72 | def same_count(op1: Op, op2: Op): 73 | return op1.cnt() == op2.cnt() 74 | 75 | 76 | def same_buf_dst(op1: Op, op2: Op): 77 | return op1.dst.buffer == op2.dst.buffer and op1.dst.index == op2.dst.index 78 | 79 | 80 | def same_src_dst_buffer_type(op1: Op, op2: Op): 81 | return op1.src.buffer == op2.src.buffer and op1.dst.buffer == op2.dst.buffer 82 | 83 | 84 | def buf_dst_src_match(op1: Op, op2: Op): 85 | return op1.dst.buffer == op2.src.buffer and op1.dst.index == op2.src.index 86 | 87 | 88 | def same_buf_src(op1: Op, op2: Op): 89 | return op1.src.buffer == op2.src.buffer and op1.src.index == op2.src.index 90 | 91 | 92 | def same_chan_type(op1: Op, op2: Op): 93 | return op1.channel_type == op2.channel_type 94 | 95 | 96 | def same_tb(op1: Op, op2: Op): 97 | return op1.tb == op2.tb 98 | -------------------------------------------------------------------------------- /python/mscclpp/memory_channel_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace nb = nanobind; 11 | using namespace mscclpp; 12 | 13 | void register_memory_channel(nb::module_& m) { 14 | nb::class_ baseMemoryChannel(m, "BaseMemoryChannel"); 15 | baseMemoryChannel 16 | .def("__init__", 17 | [](BaseMemoryChannel* baseMemoryChannel, std::shared_ptr semaphore) { 18 | new (baseMemoryChannel) BaseMemoryChannel(semaphore); 19 | }) 20 | .def("device_handle", &BaseMemoryChannel::deviceHandle); 21 | 22 | nb::class_(m, "BaseMemoryChannelDeviceHandle") 23 | .def(nb::init<>()) 24 | .def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_) 25 | .def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes { 26 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 27 | }); 28 | 29 | nb::class_ memoryChannel(m, "MemoryChannel"); 30 | memoryChannel 31 | .def("__init__", 32 | [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, 33 | RegisteredMemory dst, 34 | uintptr_t src) { new (memoryChannel) MemoryChannel(semaphore, dst, reinterpret_cast(src)); }) 35 | .def("__init__", 36 | [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, 37 | RegisteredMemory dst, uintptr_t src, uintptr_t packet_buffer) { 38 | new (memoryChannel) 39 | MemoryChannel(semaphore, dst, reinterpret_cast(src), reinterpret_cast(packet_buffer)); 40 | }) 41 | .def("device_handle", &MemoryChannel::deviceHandle); 42 | 43 | nb::class_(m, "MemoryChannelDeviceHandle") 44 | .def(nb::init<>()) 45 | .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) 46 | .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) 47 | .def_rw("src_", &MemoryChannel::DeviceHandle::src_) 48 | .def_rw("packetBuffer_", &MemoryChannel::DeviceHandle::packetBuffer_) 49 | .def_prop_ro("raw", [](const MemoryChannel::DeviceHandle& self) -> nb::bytes { 50 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 51 | }); 52 | }; 53 | -------------------------------------------------------------------------------- /python/mscclpp/npkit_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | 11 | void register_npkit(nb::module_ &m) { 12 | nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions"); 13 | sub_m.def("init", &NpKit::Init); 14 | sub_m.def("dump", &NpKit::Dump); 15 | sub_m.def("shutdown", &NpKit::Shutdown); 16 | } 17 | -------------------------------------------------------------------------------- /python/mscclpp/numa_py.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | namespace nb = nanobind; 3 | 4 | namespace mscclpp { 5 | int getDeviceNumaNode(int cudaDev); 6 | void numaBind(int node); 7 | }; // namespace mscclpp 8 | 9 | void register_numa(nb::module_ &m) { 10 | nb::module_ sub_m = m.def_submodule("numa", "numa functions"); 11 | sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); 12 | sub_m.def("numa_bind", &mscclpp::numaBind); 13 | } 14 | -------------------------------------------------------------------------------- /python/mscclpp/nvls_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | namespace nb = nanobind; 15 | using namespace mscclpp; 16 | 17 | void register_nvls(nb::module_& m) { 18 | nb::class_(m, "DeviceMulticastPointer") 19 | .def("get_device_ptr", 20 | [](NvlsConnection::DeviceMulticastPointer* self) { return (uintptr_t)self->getDevicePtr(); }) 21 | .def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); 22 | 23 | nb::class_(m, "DeviceHandle") 24 | .def(nb::init<>()) 25 | .def_rw("devicePtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::devicePtr) 26 | .def_rw("mcPtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::mcPtr) 27 | .def_rw("size", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::bufferSize) 28 | .def_prop_ro("raw", [](const NvlsConnection::DeviceMulticastPointer::DeviceHandle& self) -> nb::bytes { 29 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 30 | }); 31 | 32 | nb::class_(m, "NvlsConnection") 33 | .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("devicePtr"), nb::arg("size")) 34 | .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity); 35 | 36 | m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"), 37 | nb::arg("bufferSize")); 38 | } 39 | -------------------------------------------------------------------------------- /python/mscclpp/port_channel_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace nb = nanobind; 11 | using namespace mscclpp; 12 | 13 | void register_port_channel(nb::module_& m) { 14 | nb::class_(m, "BaseProxyService") 15 | .def("start_proxy", &BaseProxyService::startProxy) 16 | .def("stop_proxy", &BaseProxyService::stopProxy); 17 | 18 | nb::class_(m, "ProxyService") 19 | .def(nb::init(), nb::arg("fifoSize") = DEFAULT_FIFO_SIZE) 20 | .def("start_proxy", &ProxyService::startProxy) 21 | .def("stop_proxy", &ProxyService::stopProxy) 22 | .def("build_and_add_semaphore", &ProxyService::buildAndAddSemaphore, nb::arg("comm"), nb::arg("connection")) 23 | .def("add_semaphore", &ProxyService::addSemaphore, nb::arg("semaphore")) 24 | .def("add_memory", &ProxyService::addMemory, nb::arg("memory")) 25 | .def("semaphore", &ProxyService::semaphore, nb::arg("id")) 26 | .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) 27 | .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); 28 | 29 | nb::class_(m, "BasePortChannel") 30 | .def(nb::init, std::shared_ptr>(), 31 | nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy")) 32 | .def("device_handle", &BasePortChannel::deviceHandle); 33 | 34 | nb::class_(m, "BasePortChannelDeviceHandle") 35 | .def(nb::init<>()) 36 | .def_rw("semaphoreId_", &BasePortChannel::DeviceHandle::semaphoreId_) 37 | .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) 38 | .def_rw("fifo_", &BasePortChannel::DeviceHandle::fifo_) 39 | .def_prop_ro("raw", [](const BasePortChannel::DeviceHandle& self) -> nb::bytes { 40 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 41 | }); 42 | 43 | nb::class_(m, "PortChannel") 44 | .def(nb::init, std::shared_ptr, MemoryId, MemoryId>(), 45 | nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), 
nb::arg("src")) 46 | .def("device_handle", &PortChannel::deviceHandle); 47 | 48 | nb::class_(m, "PortChannelDeviceHandle") 49 | .def(nb::init<>()) 50 | .def_rw("semaphoreId_", &PortChannel::DeviceHandle::semaphoreId_) 51 | .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) 52 | .def_rw("fifo_", &PortChannel::DeviceHandle::fifo_) 53 | .def_rw("src_", &PortChannel::DeviceHandle::src_) 54 | .def_rw("dst_", &PortChannel::DeviceHandle::dst_) 55 | .def_prop_ro("raw", [](const PortChannel::DeviceHandle& self) -> nb::bytes { 56 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 57 | }); 58 | }; 59 | -------------------------------------------------------------------------------- /python/mscclpp/semaphore_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | using namespace mscclpp; 11 | 12 | void register_semaphore(nb::module_& m) { 13 | nb::class_ host2DeviceSemaphore(m, "Host2DeviceSemaphore"); 14 | host2DeviceSemaphore 15 | .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) 16 | .def("connection", &Host2DeviceSemaphore::connection) 17 | .def("signal", &Host2DeviceSemaphore::signal) 18 | .def("device_handle", &Host2DeviceSemaphore::deviceHandle); 19 | 20 | nb::class_(host2DeviceSemaphore, "DeviceHandle") 21 | .def(nb::init<>()) 22 | .def_rw("inbound_semaphore_id", &Host2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) 23 | .def_rw("expected_inbound_semaphore_id", &Host2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) 24 | .def_prop_ro("raw", [](const Host2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { 25 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 26 | }); 27 | 28 | nb::class_(m, "Host2HostSemaphore") 29 | .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) 30 | .def("connection", &Host2HostSemaphore::connection) 31 | .def("signal", &Host2HostSemaphore::signal) 32 | .def("poll", &Host2HostSemaphore::poll) 33 | .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), 34 | nb::arg("max_spin_count") = 10000000); 35 | 36 | nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); 37 | memoryDevice2DeviceSemaphore 38 | .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) 39 | .def("device_handle", &MemoryDevice2DeviceSemaphore::deviceHandle); 40 | 41 | nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") 42 | .def(nb::init<>()) 43 | .def_rw("inboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) 44 | .def_rw("outboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundSemaphoreId) 45 | .def_rw("remoteInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundSemaphoreId) 46 | .def_rw("expectedInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) 47 | .def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { 48 | return nb::bytes(reinterpret_cast(&self), sizeof(self)); 49 | }); 50 | } 51 | -------------------------------------------------------------------------------- /python/mscclpp/utils_py.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace nb = nanobind; 10 | using namespace mscclpp; 11 | 12 | void register_utils(nb::module_& m) { 13 | nb::class_(m, "Timer") 14 | .def(nb::init(), nb::arg("timeout") = -1) 15 | .def("elapsed", &Timer::elapsed) 16 | .def("set", &Timer::set, nb::arg("timeout")) 17 | .def("reset", &Timer::reset) 18 | .def("print", &Timer::print, nb::arg("name")); 19 | 20 | nb::class_(m, "ScopedTimer").def(nb::init(), nb::arg("name")); 21 | 22 | m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim")); 23 | } 24 | -------------------------------------------------------------------------------- /python/mscclpp_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5 2 | -------------------------------------------------------------------------------- /python/mscclpp_benchmark/nccl_op.py: -------------------------------------------------------------------------------- 1 | import cupy.cuda.nccl as nccl 2 | from mpi4py import MPI 3 | import cupy as cp 4 | 5 | 6 | class NcclAllReduce: 7 | def __init__(self, nccl_comm: nccl.NcclCommunicator, memory: cp.ndarray): 8 | self.nccl_comm = nccl_comm 9 | self.memory = memory 10 | if memory.dtype == cp.float32: 11 | self.nccl_dtype = nccl.NCCL_FLOAT32 12 | elif memory.dtype == cp.float16: 13 | self.nccl_dtype = nccl.NCCL_FLOAT16 14 | elif memory.dtype == cp.int32: 15 | self.nccl_dtype = nccl.NCCL_INT32 16 | else: 17 | raise RuntimeError("Make sure that the data type is mapped to the correct NCCL data type") 18 | 19 | def __call__(self, stream): 20 | stream_ptr = stream.ptr if stream else 0 21 | self.nccl_comm.allReduce( 22 | self.memory.data.ptr, self.memory.data.ptr, self.memory.size, self.nccl_dtype, nccl.NCCL_SUM, stream_ptr 23 | ) 24 | return self.memory 25 | -------------------------------------------------------------------------------- /python/requirements_cuda11.txt: -------------------------------------------------------------------------------- 1 | mpi4py 2 | cupy-cuda11x 3 | prettytable 4 | netifaces 5 | pytest 6 | numpy 7 | matplotlib 8 | -------------------------------------------------------------------------------- /python/requirements_cuda12.txt: -------------------------------------------------------------------------------- 1 | mpi4py 2 | cupy-cuda12x 3 | prettytable 4 | netifaces 5 | pytest 6 | numpy 7 | matplotlib 8 | -------------------------------------------------------------------------------- /python/requirements_rocm6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/python/requirements_rocm6.txt -------------------------------------------------------------------------------- /python/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) 5 | include(FetchContent) 6 | FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) 7 | FetchContent_MakeAvailable(nanobind) 8 | 9 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) 10 | nanobind_add_module(mscclpp_py_test ${SOURCES}) 11 | set_target_properties(mscclpp_py_test PROPERTIES OUTPUT_NAME _ext) 12 | target_link_libraries(mscclpp_py_test PRIVATE mscclpp_static ${GPU_LIBRARIES}) 13 | target_include_directories(mscclpp_py_test SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) 14 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/python/test/__init__.py -------------------------------------------------------------------------------- /python/test/_cpp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/mscclpp/c184485808aeaaec5625bc97905c819db1514184/python/test/_cpp/__init__.py -------------------------------------------------------------------------------- /python/test/configs/mscclpp_lang_test_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "filename": "allgather_barrier.py", 4 | "args": ["8", "8"] 5 | }, 6 | { 7 | "filename": "allreduce_allpairs_packet.py", 8 | "args": ["8", "8"] 9 | }, 10 | { 11 | "filename": "allreduce_allpairs_get.py", 12 | "args": ["8", "8"] 13 | }, 14 | { 15 | "filename": "allreduce_allpairs.py", 16 | "args": ["8", "8"] 17 | }, 18 | { 19 | "filename": "allreduce_ring.py", 20 | "args": ["8", "8"] 21 | }, 22 | { 23 | "filename": "send_recv_packet.py", 24 | "args": ["2"] 25 | }, 26 | { 27 | "filename": "send_recv_proxy.py", 28 | "args": ["2"] 29 | }, 30 | { 31 | "filename": "allreduce_nvls.py", 32 | "args": ["8", "2"] 33 | }, 34 | { 35 | "filename": "allgather_allpairs_multinodes_packets.py", 36 | "args": ["16", "8", "1"] 37 | } 38 | ] 39 | -------------------------------------------------------------------------------- /python/test/d2d_semaphore_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | // be careful about using semaphore[my_rank] as it is an invalid semaphore and it is there just for simplicity of 7 | // indexing 8 | extern "C" __global__ void __launch_bounds__(1024, 1) 9 | d2d_semaphore(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { 10 | int tid = threadIdx.x; 11 | if (tid < nranks && tid != my_rank) { 12 | semaphores[tid].signal(); 13 | semaphores[tid].wait(); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /python/test/fifo_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | 6 | extern "C" __global__ void __launch_bounds__(1024, 1) fifo(mscclpp::FifoDeviceHandle fifo) { 7 | mscclpp::ProxyTrigger trigger; 8 | trigger.fst = 123; 9 | fifo.push(trigger); 10 | } 11 | -------------------------------------------------------------------------------- /python/test/h2d_semaphore_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | // be careful about using semaphore[my_rank] as it is an invalid semaphore and it is there just for simplicity of 7 | // indexing 8 | extern "C" __global__ void __launch_bounds__(1024, 1) 9 | h2d_semaphore(mscclpp::Host2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { 10 | int tid = threadIdx.x; 11 | if (tid < nranks && tid != my_rank) semaphores[tid].wait(); 12 | } 13 | -------------------------------------------------------------------------------- /python/test/memory_channel_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | // be careful about using channels[my_rank] as it is inavlie and it is there just for simplicity of indexing 7 | extern "C" __global__ void __launch_bounds__(1024, 1) 8 | memory_channel(mscclpp::MemoryChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements, 9 | int use_packet) { 10 | int tid = threadIdx.x; 11 | int bid = blockIdx.x; 12 | uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; 13 | uint64_t my_offset = size_per_rank * my_rank; 14 | uint64_t my_nghr_offset = size_per_rank * bid; 15 | int flag = 123; 16 | if (bid < nranks && bid != my_rank) { 17 | if (use_packet) { 18 | channels[bid].putPackets(2 * my_offset, my_offset, size_per_rank, tid, blockDim.x, flag); 19 | channels[bid].unpackPackets(2 * my_nghr_offset, my_nghr_offset, size_per_rank, tid, blockDim.x, flag); 20 | } else { 21 | channels[bid].put(my_offset, my_offset, size_per_rank, tid, blockDim.x); 22 | __syncthreads(); 23 | if (!use_packet && tid == 0) { 24 | channels[bid].signal(); 25 | channels[bid].wait(); 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /python/test/mscclpp_mpi.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | import atexit 5 | import logging 6 | 7 | import cupy as cp 8 | import mpi4py 9 | 10 | mpi4py.rc.initialize = False 11 | mpi4py.rc.finalize = False 12 | 13 | from mpi4py import MPI 14 | import pytest 15 | 16 | N_GPUS_PER_NODE = 8 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | def init_mpi(): 22 | if not MPI.Is_initialized(): 23 | MPI.Init() 24 | shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) 25 | N_GPUS_PER_NODE = shm_comm.size 26 | shm_comm.Free() 27 | cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() 28 | 29 | 30 | # Define a function to finalize MPI 31 | def finalize_mpi(): 32 | if MPI.Is_initialized(): 33 | MPI.Finalize() 34 | 35 | 36 | # Register the function to be called on exit 37 | atexit.register(finalize_mpi) 38 | 39 | 40 | class MpiGroup: 41 | def __init__(self, ranks: list = []): 42 | world_group = MPI.COMM_WORLD.group 43 | if len(ranks) == 0: 44 | self.comm = MPI.COMM_WORLD 45 | else: 46 | group = world_group.Incl(ranks) 47 | self.comm = MPI.COMM_WORLD.Create(group) 48 | 49 | 50 | @pytest.fixture 51 | def mpi_group(request: pytest.FixtureRequest): 52 | MPI.COMM_WORLD.barrier() 53 | if request.param is None: 54 | pytest.skip(f"Skip for rank {MPI.COMM_WORLD.rank}") 55 | yield request.param 56 | 57 | 58 | def parametrize_mpi_groups(*tuples: tuple): 59 | def decorator(func): 60 | mpi_groups = [] 61 | for group_size in list(tuples): 62 | if MPI.COMM_WORLD.size < group_size: 63 | logging.warning(f"MPI.COMM_WORLD.size < {group_size}, skip") 64 | continue 65 | mpi_group = MpiGroup(list(range(group_size))) 66 | if mpi_group.comm == MPI.COMM_NULL: 67 | mpi_groups.append(None) 68 | else: 69 | mpi_groups.append(mpi_group) 70 | return pytest.mark.parametrize("mpi_group", mpi_groups, indirect=True)(func) 71 | 72 | return decorator 73 | 74 | 75 | init_mpi() 76 | -------------------------------------------------------------------------------- /python/test/nvls_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | __device__ mscclpp::DeviceSyncer deviceSyncer; 10 | 11 | extern "C" __global__ void __launch_bounds__(1024, 1) 12 | nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, 13 | mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { 14 | int nelem = nbytes / sizeof(float); 15 | float* dev_ptr = (float*)nvlsPtrs.devicePtr; 16 | float* mc_ptr = (float*)nvlsPtrs.mcPtr; 17 | int tid = threadIdx.x; 18 | int bid = blockIdx.x; 19 | 20 | for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { 21 | dev_ptr[idx] = my_rank; 22 | } 23 | deviceSyncer.sync(gridDim.x); 24 | if (tid == 0 && bid == 0) { 25 | __threadfence_system(); 26 | } 27 | 28 | if (bid == 0) { 29 | if (tid < nranks && tid != my_rank) { 30 | semaphores[tid].signal(); 31 | semaphores[tid].wait(); 32 | } 33 | } 34 | deviceSyncer.sync(gridDim.x); 35 | 36 | int my_st = ((int64_t)nelem * (int64_t)my_rank) / (int64_t)nranks; 37 | int my_en = ((int64_t)nelem * (int64_t)(my_rank + 1)) / (int64_t)nranks; 38 | 39 | int my_offset = (tid + bid * blockDim.x) * 4; 40 | int my_step = blockDim.x * gridDim.x * 4; 41 | 42 | for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { 43 | uint4 val; 44 | mscclpp::DeviceMulticastPointerDeviceHandle::multimemLoadReduce(val, mc_ptr + idx); 45 | mscclpp::DeviceMulticastPointerDeviceHandle::multimemStore(val, mc_ptr + idx); 46 | } 47 | 48 | deviceSyncer.sync(gridDim.x); 49 | if (tid == 0 && bid == 0) { 50 | __threadfence_system(); 51 | } 52 | 53 | if (bid == 0) { 54 | if (tid < nranks && tid != my_rank) { 55 | semaphores[tid].signal(); 56 | semaphores[tid].wait(); 57 | } 58 | } 59 | deviceSyncer.sync(gridDim.x); 60 | 61 | for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { 62 | MSCCLPP_ASSERT_DEVICE(dev_ptr[idx] == ((nranks * (nranks - 1)) / 2), "dev_ptr[idx] != nranks"); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /python/test/port_channel_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | // be careful about using channels[my_rank] as it is inavlie and it is there just for simplicity of indexing 8 | extern "C" __global__ void __launch_bounds__(1024, 1) 9 | port_channel(mscclpp::PortChannelDeviceHandle* channels, int my_rank, int nranks, int* data, int* scratch, 10 | int num_elements, int use_packet) { 11 | int tid = threadIdx.x; 12 | int nthreads = blockDim.x; 13 | uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; 14 | uint64_t my_offset = size_per_rank * my_rank; 15 | int nthreads_per_rank = nthreads / nranks; 16 | int my_nghr = tid / nthreads_per_rank; 17 | uint64_t my_nghr_offset = size_per_rank * my_nghr; 18 | __syncthreads(); 19 | int flag = 123; 20 | if (use_packet) { 21 | mscclpp::copyToPackets((char*)scratch + 2 * my_offset, (char*)data + my_offset, size_per_rank, tid, nthreads, flag); 22 | __syncthreads(); 23 | if (tid < nranks && tid != my_rank) { 24 | channels[tid].put(2 * my_offset, 2 * my_offset, 2 * size_per_rank); 25 | } 26 | if (my_nghr != my_rank && my_nghr < nranks) 27 | mscclpp::copyFromPackets((char*)data + my_nghr_offset, (char*)scratch + 2 * my_nghr_offset, size_per_rank, 28 | tid % nthreads_per_rank, nthreads_per_rank, flag); 29 | } else { 30 | if (tid < nranks && tid != my_rank) { 31 | channels[tid].putWithSignalAndFlush(my_offset, my_offset, size_per_rank); 32 | channels[tid].wait(); 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /python/test/proxy_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | extern "C" __global__ void __launch_bounds__(1024, 1) proxy(int my_rank, int nranks, mscclpp::FifoDeviceHandle fifo, 8 | mscclpp::Host2DeviceSemaphoreDeviceHandle* semaphores) { 9 | int tid = threadIdx.x; 10 | if (tid == 0) { 11 | mscclpp::ProxyTrigger trigger; 12 | trigger.fst = 123; 13 | trigger.snd = 0; 14 | uint64_t currentFifoHead = fifo.push(trigger); 15 | // wait for the work to be done in cpu side 16 | fifo.sync(currentFifoHead); 17 | } 18 | __syncthreads(); 19 | if (tid < nranks && tid != my_rank) { 20 | semaphores[tid].wait(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /python/test/test_generate_mscclpp_lang_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import argparse 5 | import json 6 | from pathlib import Path 7 | import subprocess 8 | 9 | 10 | def run_examples(input_folder, configs, output_folder): 11 | for config in configs: 12 | file_name = config["filename"] 13 | args = config["args"] 14 | 15 | input_file_path = Path(input_folder) / file_name 16 | # Strip the ".py" from the filename and add ".output" 17 | base_file_name = file_name[:-3] if file_name.endswith(".py") else file_name 18 | base_file_name = base_file_name.replace("/", "_") 19 | output_file_path = Path(output_folder) / f"{base_file_name}.output" 20 | 21 | # Construct the command to run the Python script 22 | command = ["python3", str(input_file_path)] + args 23 | 24 | # Run the command and capture output 25 | with open(output_file_path, "w") as output_file: 26 | result = subprocess.run(command, stdout=output_file, stderr=subprocess.STDOUT, text=True) 27 | 28 | # Optional: Check the return code to handle errors 29 | if result.returncode != 0: 30 | print(f"Error running {file_name}. See {output_file_path} for details.") 31 | 32 | 33 | def main(input_folder, config_path, output_folder): 34 | with open(config_path, "r") as f: 35 | config = json.load(f) 36 | 37 | Path(output_folder).mkdir(parents=True, exist_ok=True) 38 | run_examples(input_folder, config, output_folder) 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser(description="Process files according to a configuration and save the results.") 43 | parser.add_argument("input_folder", type=str, help="Path to the folder containing the input files.") 44 | parser.add_argument("config", type=str, help="Path to the configuration file.") 45 | parser.add_argument("output_folder", type=str, help="Path to the folder where the processed files will be saved.") 46 | args = parser.parse_args() 47 | main(args.input_folder, args.config, args.output_folder) 48 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | # Python in-place installs move the .so files into the source directories. 2 | *.so 3 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu) 5 | target_sources(mscclpp_obj PRIVATE ${SOURCES}) 6 | target_include_directories(mscclpp_obj PRIVATE include) 7 | -------------------------------------------------------------------------------- /src/c_style_remnants.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 4 | #include "api.h" 5 | #include "debug.h" 6 | 7 | MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) { mscclppDebugDefaultLogHandler(msg); } 8 | 9 | MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) { 10 | return mscclppDebugSetLogHandler(handler); 11 | } 12 | 13 | MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) { 14 | switch (code) { 15 | case mscclppSuccess: 16 | return "no error"; 17 | case mscclppUnhandledCudaError: 18 | return "unhandled cuda error"; 19 | case mscclppSystemError: 20 | return "unhandled system error"; 21 | case mscclppInternalError: 22 | return "internal error"; 23 | case mscclppInvalidArgument: 24 | return "invalid argument"; 25 | case mscclppInvalidUsage: 26 | return "invalid usage"; 27 | case mscclppRemoteError: 28 | return "remote process exited or there was a network error"; 29 | case mscclppInProgress: 30 | return "MSCCL++ operation in progress"; 31 | default: 32 | return "unknown result code"; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/context.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include "context.hpp" 5 | 6 | #include "api.h" 7 | #include "connection.hpp" 8 | #include "debug.h" 9 | #include "endpoint.hpp" 10 | #include "registered_memory.hpp" 11 | 12 | namespace mscclpp { 13 | 14 | Context::Impl::Impl() : ipcStream_(std::make_shared()) {} 15 | 16 | IbCtx* Context::Impl::getIbContext(Transport ibTransport) { 17 | // Find IB context or create it 18 | auto it = ibContexts_.find(ibTransport); 19 | if (it == ibContexts_.end()) { 20 | auto ibDev = getIBDeviceName(ibTransport); 21 | ibContexts_[ibTransport] = std::make_unique(ibDev); 22 | return ibContexts_[ibTransport].get(); 23 | } else { 24 | return it->second.get(); 25 | } 26 | } 27 | 28 | MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique()) {} 29 | 30 | MSCCLPP_API_CPP Context::~Context() = default; 31 | 32 | MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) { 33 | return RegisteredMemory(std::make_shared(ptr, size, transports, *pimpl_)); 34 | } 35 | 36 | MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { 37 | return Endpoint(std::make_shared(config, *pimpl_)); 38 | } 39 | 40 | MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpoint, Endpoint remoteEndpoint) { 41 | std::shared_ptr conn; 42 | if (localEndpoint.transport() == Transport::CudaIpc) { 43 | if (remoteEndpoint.transport() != Transport::CudaIpc) { 44 | throw mscclpp::Error("Local transport is CudaIpc but remote is not", ErrorCode::InvalidUsage); 45 | } 46 | conn = std::make_shared(localEndpoint, remoteEndpoint, pimpl_->ipcStream_); 47 | } else if (AllIBTransports.has(localEndpoint.transport())) { 48 | if (!AllIBTransports.has(remoteEndpoint.transport())) { 49 | throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); 50 | } 51 | conn = std::make_shared(localEndpoint, remoteEndpoint, *this); 52 | } else if (localEndpoint.transport() == Transport::Ethernet) { 53 | if (remoteEndpoint.transport() != Transport::Ethernet) { 54 | throw mscclpp::Error("Local transport is Ethernet but remote is not", ErrorCode::InvalidUsage); 55 | } 56 | conn = std::make_shared(localEndpoint, remoteEndpoint); 57 | } else { 58 | throw mscclpp::Error("Unsupported transport", 
ErrorCode::InternalError); 59 | } 60 | 61 | pimpl_->connections_.push_back(conn); 62 | return conn; 63 | } 64 | 65 | } // namespace mscclpp 66 | -------------------------------------------------------------------------------- /src/errors.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "api.h" 9 | 10 | namespace mscclpp { 11 | 12 | std::string errorToString(enum ErrorCode error) { 13 | switch (error) { 14 | case ErrorCode::SystemError: 15 | return "SystemError"; 16 | case ErrorCode::InternalError: 17 | return "InternalError"; 18 | case ErrorCode::InvalidUsage: 19 | return "InvalidUsage"; 20 | case ErrorCode::Timeout: 21 | return "Timeout"; 22 | case ErrorCode::Aborted: 23 | return "Aborted"; 24 | case ErrorCode::ExecutorError: 25 | return "ExecutorError"; 26 | default: 27 | return "UnknownError"; 28 | } 29 | } 30 | 31 | BaseError::BaseError(const std::string& message, int errorCode) 32 | : std::runtime_error(""), message_(message), errorCode_(errorCode) {} 33 | 34 | BaseError::BaseError(int errorCode) : std::runtime_error(""), errorCode_(errorCode) {} 35 | 36 | int BaseError::getErrorCode() const { return errorCode_; } 37 | 38 | const char* BaseError::what() const noexcept { return message_.c_str(); } 39 | 40 | MSCCLPP_API_CPP Error::Error(const std::string& message, ErrorCode errorCode) : BaseError(static_cast(errorCode)) { 41 | message_ = message + " (Mscclpp failure: " + errorToString(errorCode) + ")"; 42 | } 43 | 44 | MSCCLPP_API_CPP ErrorCode Error::getErrorCode() const { return static_cast(errorCode_); } 45 | 46 | MSCCLPP_API_CPP SysError::SysError(const std::string& message, int errorCode) : BaseError(errorCode) { 47 | message_ = message + " (System failure: " + std::strerror(errorCode) + ")"; 48 | } 49 | 50 | MSCCLPP_API_CPP CudaError::CudaError(const std::string& message, int errorCode) : BaseError(errorCode) { 51 | message_ = message + " (Cuda failure: " + cudaGetErrorString(static_cast(errorCode)) + ")"; 52 | } 53 | 54 | MSCCLPP_API_CPP CuError::CuError(const std::string& message, int errorCode) : BaseError(errorCode) { 55 | const char* errStr; 56 | if (cuGetErrorString(static_cast(errorCode), &errStr) != CUDA_SUCCESS) { 57 | errStr = "failed to get error string"; 58 | } 59 | message_ = message + " (Cu failure: " + errStr + ")"; 60 | } 61 | 62 | MSCCLPP_API_CPP IbError::IbError(const std::string& message, int errorCode) : BaseError(errorCode) { 63 | message_ = message + " (Ib failure: " + std::strerror(errorCode) + ")"; 64 | } 65 | 66 | }; // namespace mscclpp 67 | -------------------------------------------------------------------------------- /src/fifo.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include "api.h" 8 | #include "atomic.hpp" 9 | 10 | namespace mscclpp { 11 | 12 | struct Fifo::Impl { 13 | detail::UniqueGpuHostPtr triggers; 14 | detail::UniqueGpuPtr head; 15 | detail::UniqueGpuPtr tailReplica; 16 | const int size; 17 | 18 | // allocated on the host. Only accessed by the host. This is a copy of the 19 | // value pointed to by fifoTailDev and the invariant is that 20 | // *fifoTailDev <= hostTail. 
Meaning that the host's copy of the tail is 21 | // always ahead of the device's copy, and the host updates the device's copy 22 | // only when it is needed. Therefore, hostTail is the "true" tail 23 | // and fifoTailDev is a "stale" tail. See proxy.cc to understand how 24 | // these updates are pushed to the device. 25 | uint64_t hostTail; 26 | 27 | // for transferring fifo tail 28 | CudaStreamWithFlags stream; 29 | 30 | Impl(int size) 31 | : triggers(detail::gpuCallocHostUnique(size)), 32 | head(detail::gpuCallocUnique()), 33 | tailReplica(detail::gpuCallocUnique()), 34 | size(size), 35 | hostTail(0), 36 | stream(cudaStreamNonBlocking) {} 37 | }; 38 | 39 | MSCCLPP_API_CPP Fifo::Fifo(int size) : pimpl(std::make_unique(size)) {} 40 | MSCCLPP_API_CPP Fifo::~Fifo() = default; 41 | 42 | MSCCLPP_API_CPP ProxyTrigger Fifo::poll() { 43 | ProxyTrigger trigger; 44 | ProxyTrigger* ptr = &pimpl->triggers.get()[pimpl->hostTail % pimpl->size]; 45 | // We load fst first; if fst is non-zero then snd is also valid. 46 | trigger.fst = atomicLoad(&(ptr->fst), memoryOrderAcquire); 47 | trigger.snd = ptr->snd; 48 | return trigger; 49 | } 50 | 51 | MSCCLPP_API_CPP void Fifo::pop() { 52 | atomicStore(&(pimpl->triggers.get()[pimpl->hostTail % pimpl->size].fst), uint64_t{0}, memoryOrderRelease); 53 | (pimpl->hostTail)++; 54 | } 55 | 56 | MSCCLPP_API_CPP void Fifo::flushTail(bool sync) { 57 | // Flush the tail to device memory. The proxy triggers this every ProxyFlushPeriod so that the fifo can 58 | // make progress even without an explicit flush request; with sync=true the copy is awaited before returning. 59 | AvoidCudaGraphCaptureGuard cgcGuard; 60 | MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t), 61 | cudaMemcpyHostToDevice, pimpl->stream)); 62 | if (sync) { 63 | MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl->stream)); 64 | } 65 | } 66 | 67 | MSCCLPP_API_CPP int Fifo::size() const { return pimpl->size; } 68 | 69 | MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() { 70 | FifoDeviceHandle deviceHandle; 71 | deviceHandle.triggers = pimpl->triggers.get(); 72 | deviceHandle.head = pimpl->head.get(); 73 | deviceHandle.tailReplica = pimpl->tailReplica.get(); 74 | deviceHandle.size = pimpl->size; 75 | return deviceHandle; 76 | } 77 | 78 | } // namespace mscclpp 79 | -------------------------------------------------------------------------------- /src/include/api.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | #ifndef MSCCLPP_API_H_ 5 | #define MSCCLPP_API_H_ 6 | 7 | #define MSCCLPP_API extern "C" __attribute__((visibility("default"))) 8 | #define MSCCLPP_API_CPP __attribute__((visibility("default"))) 9 | 10 | #endif // MSCCLPP_API_H_ 11 | -------------------------------------------------------------------------------- /src/include/atomic.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license.
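// Sketch of the host proxy loop built on the Fifo API in src/fifo.cc above (simplified;
// the real loop lives in proxy.cc and throttles flushTail with ProxyFlushPeriod):
//   while (running) {
//     ProxyTrigger t = fifo.poll();
//     if (t.fst == 0) continue;  // empty slot: nothing pushed yet
//     handleTrigger(t);          // hypothetical handler for the requested transfer
//     fifo.pop();                // clears fst and advances hostTail
//     fifo.flushTail();          // pushed to the device only periodically in practice
//   }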
-------------------------------------------------------------------------------- /src/include/api.h: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_API_H_
#define MSCCLPP_API_H_

#define MSCCLPP_API extern "C" __attribute__((visibility("default")))
#define MSCCLPP_API_CPP __attribute__((visibility("default")))

#endif  // MSCCLPP_API_H_
-------------------------------------------------------------------------------- /src/include/atomic.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_ATOMIC_HPP_
#define MSCCLPP_ATOMIC_HPP_

#if defined(MSCCLPP_USE_CUDA)
#define MSCCLPP_DEVICE_CUDA
#include <mscclpp/atomic_device.hpp>
#undef MSCCLPP_DEVICE_CUDA
#else  // !defined(MSCCLPP_USE_CUDA)
#define MSCCLPP_DEVICE_HIP
#include <mscclpp/atomic_device.hpp>
#undef MSCCLPP_DEVICE_HIP
#endif  // !defined(MSCCLPP_USE_CUDA)

#endif  // MSCCLPP_ATOMIC_HPP_
-------------------------------------------------------------------------------- /src/include/communicator.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_COMMUNICATOR_HPP_
#define MSCCLPP_COMMUNICATOR_HPP_

#include <future>
#include <memory>
#include <mscclpp/core.hpp>
#include <unordered_map>

#include "utils_internal.hpp"

namespace mscclpp {

class BaseRecvItem {
 public:
  virtual ~BaseRecvItem() = default;
  virtual void wait() = 0;
  virtual bool isReady() const = 0;
};

template <typename T>
class RecvItem : public BaseRecvItem {
 public:
  RecvItem(std::shared_future<T> future) : future_(future) {}

  void wait() { future_.wait(); }

  bool isReady() const { return future_.wait_for(std::chrono::seconds(0)) == std::future_status::ready; }

 private:
  std::shared_future<T> future_;
};

struct ConnectionInfo {
  int remoteRank;
  int tag;
};

struct Communicator::Impl {
  std::shared_ptr<Bootstrap> bootstrap_;
  std::shared_ptr<Context> context_;
  std::unordered_map<const Connection*, ConnectionInfo> connectionInfos_;
  std::shared_ptr<BaseRecvItem> lastRecvItem_;

  // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
  // If the RecvItem gets ready, it will be removed at the next call to getLastRecvItem.
  std::unordered_map<std::pair<int, int>, std::shared_ptr<BaseRecvItem>, PairHash> lastRecvItems_;

  Impl(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context);

  // Set the last RecvItem for a {remoteRank, tag} pair.
  // This is used to store the corresponding RecvItem of a future returned by recvMemory() or connect().
  void setLastRecvItem(int remoteRank, int tag, std::shared_ptr<BaseRecvItem> item);

  // Return the last RecvItem that is not ready.
  // If the item is ready, it will be removed from the map and nullptr will be returned.
  std::shared_ptr<BaseRecvItem> getLastRecvItem(int remoteRank, int tag);

  struct Connector;
};

}  // namespace mscclpp

#endif  // MSCCLPP_COMMUNICATOR_HPP_
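RecvItem type-erases a std::shared_future<T> behind BaseRecvItem so the Impl can track pending receives of different payload types in one map. A hedged sketch of how the pieces compose — the helper functions and their names are illustrative, not library API:

```cpp
#include <future>
#include <memory>

#include "communicator.hpp"

// Record a pending future as the "last receive" for {remoteRank, tag}; a later
// operation on the same pair can wait on it through the type-erased
// BaseRecvItem interface without knowing T.
template <typename T>
void trackPending(mscclpp::Communicator::Impl& impl, int remoteRank, int tag,
                  std::shared_future<T> future) {
  impl.setLastRecvItem(remoteRank, tag, std::make_shared<mscclpp::RecvItem<T>>(future));
}

// Before reusing the same {remoteRank, tag}, drain any still-pending item
// (getLastRecvItem returns nullptr once the item is ready).
void drainPending(mscclpp::Communicator::Impl& impl, int remoteRank, int tag) {
  if (auto pending = impl.getLastRecvItem(remoteRank, tag)) pending->wait();
}
```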
-------------------------------------------------------------------------------- /src/include/connection.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_CONNECTION_HPP_
#define MSCCLPP_CONNECTION_HPP_

#include <mscclpp/core.hpp>
#include <thread>

#include "communicator.hpp"
#include "context.hpp"
#include "ib.hpp"
#include "registered_memory.hpp"
#include "socket.h"

namespace mscclpp {

class CudaIpcConnection : public Connection {
  std::shared_ptr<CudaStreamWithFlags> stream_;

 public:
  CudaIpcConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, std::shared_ptr<CudaStreamWithFlags> stream);

  Transport transport() override;

  Transport remoteTransport() override;

  void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
             uint64_t size) override;
  void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;

  void flush(int64_t timeoutUsec) override;
};

class IBConnection : public Connection {
  Transport transport_;
  Transport remoteTransport_;
  IbQp* qp;
  std::unique_ptr<uint64_t> dummyAtomicSource_;  // not used anywhere but IB needs a source
  RegisteredMemory dummyAtomicSourceMem_;
  mscclpp::TransportInfo dstTransportInfo_;

 public:
  IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context);

  Transport transport() override;

  Transport remoteTransport() override;

  void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
             uint64_t size) override;
  void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;

  void flush(int64_t timeoutUsec) override;
};

class EthernetConnection : public Connection {
  std::unique_ptr<Socket> sendSocket_;
  std::unique_ptr<Socket> recvSocket_;
  std::thread threadRecvMessages_;
  volatile uint32_t* abortFlag_;
  const uint64_t sendBufferSize_;
  const uint64_t recvBufferSize_;
  std::vector<char> sendBuffer_;
  std::vector<char> recvBuffer_;

 public:
  EthernetConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, uint64_t sendBufferSize = 256 * 1024 * 1024,
                     uint64_t recvBufferSize = 256 * 1024 * 1024);

  ~EthernetConnection();

  Transport transport() override;

  Transport remoteTransport() override;

  void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
             uint64_t size) override;
  void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;

  void flush(int64_t timeoutUsec) override;

 private:
  void recvMessages();

  void sendMessage();
};

}  // namespace mscclpp

#endif  // MSCCLPP_CONNECTION_HPP_
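All three transports implement the same Connection interface, so callers can stay transport-agnostic. A hedged sketch of a one-sided put followed by a flush — the RegisteredMemory handles are assumed to be already registered and exchanged, and the timeout value is arbitrary:

```cpp
#include <mscclpp/core.hpp>

void putAndFlush(mscclpp::Connection& conn, mscclpp::RegisteredMemory dst,
                 mscclpp::RegisteredMemory src, uint64_t bytes) {
  // Asynchronously copy `bytes` from local src to remote dst; the mechanism is
  // a GPU-IPC copy, an IB RDMA write, or a socket send depending on the
  // concrete Connection subtype.
  conn.write(dst, /*dstOffset=*/0, src, /*srcOffset=*/0, bytes);
  // Block until previously issued writes on this connection are delivered.
  conn.flush(/*timeoutUsec=*/10 * 1000 * 1000);
}
```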
-------------------------------------------------------------------------------- /src/include/context.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_CONTEXT_HPP_
#define MSCCLPP_CONTEXT_HPP_

#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <unordered_map>

#include "ib.hpp"

namespace mscclpp {

struct Context::Impl {
  std::vector<std::shared_ptr<Connection>> connections_;
  std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
  std::shared_ptr<CudaStreamWithFlags> ipcStream_;
  CUmemGenericAllocationHandle mcHandle_;

  Impl();

  IbCtx* getIbContext(Transport ibTransport);
};

}  // namespace mscclpp

#endif  // MSCCLPP_CONTEXT_HPP_
-------------------------------------------------------------------------------- /src/include/endpoint.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_ENDPOINT_HPP_
#define MSCCLPP_ENDPOINT_HPP_

#include <mscclpp/core.hpp>
#include <vector>

#include "ib.hpp"
#include "socket.h"

#define MAX_IF_NAME_SIZE 16

namespace mscclpp {

struct Endpoint::Impl {
  Impl(EndpointConfig config, Context::Impl& contextImpl);
  Impl(const std::vector<char>& serialization);

  Transport transport_;
  uint64_t hostHash_;
  int maxWriteQueueSize_;

  // The following are only used for IB and are undefined for other transports.
  bool ibLocal_;
  IbQp* ibQp_;
  IbQpInfo ibQpInfo_;

  // The following are only used for Ethernet and are undefined for other transports.
  std::unique_ptr<Socket> socket_;
  SocketAddress socketAddress_;
  volatile uint32_t* abortFlag_;
  char netIfName_[MAX_IF_NAME_SIZE + 1];
};

}  // namespace mscclpp

#endif  // MSCCLPP_ENDPOINT_HPP_
-------------------------------------------------------------------------------- /src/include/registered_memory.hpp: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_REGISTERED_MEMORY_HPP_
#define MSCCLPP_REGISTERED_MEMORY_HPP_

#include <mscclpp/core.hpp>
#include <mscclpp/gpu.hpp>
#include <vector>

#include "communicator.hpp"
#include "ib.hpp"

namespace mscclpp {

struct TransportInfo {
  Transport transport;

  // TODO: rewrite this using std::variant or something
  bool ibLocal;
  union {
    struct {
      cudaIpcMemHandle_t cudaIpcBaseHandle;
      size_t cudaIpcOffsetFromBase;
    };
    struct {
      const IbMr* ibMr;
      IbMrInfo ibMrInfo;
    };
    struct {
      union {
        char shareableHandle[64];
        struct {
          // These are only defined for multicast (NVLS) capability
          pid_t rootPid;
          int fileDesc;
        };
      };
      size_t offsetFromBase;
    };
  };
};
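The TODO above points at the usual weakness of an untagged union: nothing stops a caller from reading the IB fields of a CUDA-IPC entry. A hedged sketch of what a std::variant-based rewrite could look like — the type names here are illustrative, not the library's, and the surrounding header's includes are assumed:

```cpp
#include <variant>

// One alternative per transport family; `transport` no longer has to act as a
// hand-maintained discriminator.
struct CudaIpcInfo {
  cudaIpcMemHandle_t baseHandle;
  size_t offsetFromBase;
};
struct IbInfo {
  bool isLocal;
  const IbMr* mr;
  IbMrInfo mrInfo;
};
struct NvlsInfo {
  pid_t rootPid;
  int fileDesc;
  size_t offsetFromBase;
};

using TransportDetail = std::variant<CudaIpcInfo, IbInfo, NvlsInfo>;
// std::holds_alternative<IbInfo>(detail) / std::get<IbInfo>(detail) would then
// replace the manual transport checks, and a mismatched access throws instead
// of silently reading garbage.
```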
struct RegisteredMemory::Impl {
  // This is the data pointer returned by RegisteredMemory::data(), which may be different from the original data
  // pointer for deserialized remote memory.
  void* data;
  // This is the original data pointer the RegisteredMemory was created with.
  void* originalDataPtr;
  size_t size;
  // This is the size returned by cuMemGetAddressRange of data
  size_t baseDataSize;
  uint64_t hostHash;
  uint64_t pidHash;
  bool isCuMemMapAlloc;
  TransportFlags transports;
  std::vector<TransportInfo> transportInfos;

  // For sharing memory handle via file descriptor
  int fileDesc = -1;

  Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl);
  /// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote
  /// memory.
  Impl(const std::vector<char>& data);
  ~Impl();

  const TransportInfo& getTransportInfo(Transport transport) const;
};

}  // namespace mscclpp

#endif  // MSCCLPP_REGISTERED_MEMORY_HPP_
-------------------------------------------------------------------------------- /src/include/utils_internal.hpp: --------------------------------------------------------------------------------
// Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
// Modifications Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#ifndef MSCCLPP_UTILS_INTERNAL_HPP_
#define MSCCLPP_UTILS_INTERNAL_HPP_

#include <cstdint>
#include <functional>
#include <string>
#include <utility>

namespace mscclpp {

// PCI Bus ID <-> int64 conversion functions
std::string int64ToBusId(int64_t id);
int64_t busIdToInt64(const std::string busId);

uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
void getRandomData(void* buffer, size_t bytes);

struct netIf {
  char prefix[64];
  int port;
};

int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

template <typename T>
inline void hashCombine(std::size_t& hash, const T& v) {
  std::hash<T> hasher;
  hash ^= hasher(v) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
}

struct PairHash {
 public:
  template <class T1, class T2>
  std::size_t operator()(const std::pair<T1, T2>& x) const {
    std::size_t hash = 0;
    hashCombine(hash, x.first);
    hashCombine(hash, x.second);
    return hash;
  }
};

}  // namespace mscclpp

#endif
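hashCombine folds each value into the running seed with the usual golden-ratio constant (0x9e3779b9), which is what lets PairHash serve as the hasher for pair-keyed maps such as Communicator::Impl::lastRecvItems_ above. A small usage sketch, assuming only what this header declares:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

#include "utils_internal.hpp"

// std::hash has no specialization for std::pair, so a pair-keyed map must
// supply a hasher explicitly; without PairHash this would not compile.
void pairHashDemo() {
  std::unordered_map<std::pair<int, int>, std::string, mscclpp::PairHash> byRankTag;
  byRankTag[{/*remoteRank=*/1, /*tag=*/0}] = "pending";
}
```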
-------------------------------------------------------------------------------- /src/memory_channel.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mscclpp/memory_channel.hpp>

#include "api.h"
#include "debug.h"

namespace mscclpp {

MSCCLPP_API_CPP BaseMemoryChannel::BaseMemoryChannel(std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore)
    : semaphore_(semaphore) {}

MSCCLPP_API_CPP MemoryChannel::MemoryChannel(std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore,
                                             RegisteredMemory dst, void* src, void* packetBuffer)
    : BaseMemoryChannel(semaphore), dst_(dst), src_(src), packetBuffer_(packetBuffer) {
  if (!dst.transports().has(Transport::CudaIpc)) {
    throw Error("MemoryChannel: dst must be registered with CudaIpc", ErrorCode::InvalidUsage);
  }
}

MSCCLPP_API_CPP BaseMemoryChannel::DeviceHandle BaseMemoryChannel::deviceHandle() const {
  return BaseMemoryChannel::DeviceHandle(semaphore_->deviceHandle());
}

MSCCLPP_API_CPP MemoryChannel::DeviceHandle MemoryChannel::deviceHandle() const {
  return MemoryChannel::DeviceHandle(semaphore_->deviceHandle(), dst_.data(), src_, packetBuffer_);
}

}  // namespace mscclpp
-------------------------------------------------------------------------------- /src/numa.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <numa.h>

#include <fstream>
#include <mscclpp/gpu_utils.hpp>

#include "api.h"

// Get the PCI bus ID string of a logical cudaDev index.
static const std::string getBusId(int cudaDev) {
  // On most systems, the PCI bus ID comes back in the 0000:00:00.0
  // format. Still need to allocate proper space in case the PCI domain goes
  // higher.
  char busIdChar[] = "00000000:00:00.0";
  MSCCLPP_CUDATHROW(cudaDeviceGetPCIBusId(busIdChar, sizeof(busIdChar), cudaDev));
  // we need the hex in lower case format
  for (size_t i = 0; i < sizeof(busIdChar); i++) {
    busIdChar[i] = std::tolower(busIdChar[i]);
  }
  return std::string(busIdChar);
}

namespace mscclpp {

MSCCLPP_API_CPP int getDeviceNumaNode(int cudaDev) {
  std::string busId = getBusId(cudaDev);
  std::string file_str = "/sys/bus/pci/devices/" + busId + "/numa_node";
  std::ifstream file(file_str);
  int numaNode;
  if (file.is_open()) {
    if (!(file >> numaNode)) {
      throw Error("Failed to read NUMA node from file: " + file_str, ErrorCode::SystemError);
    }
  } else {
    throw Error("Failed to open file: " + file_str, ErrorCode::SystemError);
  }
  return numaNode;
}

MSCCLPP_API_CPP void numaBind(int node) {
  int totalNumNumaNodes = numa_num_configured_nodes();
  if (node < 0 || node >= totalNumNumaNodes) {
    throw Error(
        "Invalid NUMA node " + std::to_string(node) + ", must be between 0 and " + std::to_string(totalNumNumaNodes),
        ErrorCode::InvalidUsage);
  }
  nodemask_t mask;
  nodemask_zero(&mask);
  nodemask_set_compat(&mask, node);
  numa_bind_compat(&mask);
}

}  // namespace mscclpp
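getDeviceNumaNode() resolves a GPU to its NUMA node through sysfs, and numaBind() then pins the calling thread's memory policy to that node. The typical pairing, as also exercised by numa_tests.cc and fifo_tests.cu below — header paths are assumptions:

```cpp
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>

// Bind the calling thread to the NUMA node closest to the current GPU before
// allocating host-side buffers that the GPU will poll across the bus.
void bindToGpuNumaNode() {
  int cudaDev;
  MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDev));
  mscclpp::numaBind(mscclpp::getDeviceNumaNode(cudaDev));
}
```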
-------------------------------------------------------------------------------- /src/utils.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <signal.h>
#include <unistd.h>

#include <chrono>
#include <iostream>
#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>
#include <string>

// Throw upon SIGALRM.
static void sigalrmTimeoutHandler(int) {
  signal(SIGALRM, SIG_IGN);
  throw mscclpp::Error("Timer timed out", mscclpp::ErrorCode::Timeout);
}

namespace mscclpp {

Timer::Timer(int timeout) { set(timeout); }

Timer::~Timer() {
  if (timeout_ > 0) {
    alarm(0);
    signal(SIGALRM, SIG_DFL);
  }
}

int64_t Timer::elapsed() const {
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - start_).count();
}

void Timer::set(int timeout) {
  timeout_ = timeout;
  if (timeout > 0) {
    signal(SIGALRM, sigalrmTimeoutHandler);
    alarm(timeout);
  }
  start_ = std::chrono::steady_clock::now();
}

void Timer::reset() { set(timeout_); }

void Timer::print(const std::string& name) {
  auto us = elapsed();
  std::stringstream ss;
  ss << name << ": " << us << " us\n";
  std::cout << ss.str();
}

ScopedTimer::ScopedTimer(const std::string& name) : name_(name) {}

ScopedTimer::~ScopedTimer() { print(name_); }

std::string getHostName(int maxlen, const char delim) {
  std::string hostname(maxlen + 1, '\0');
  if (gethostname(const_cast<char*>(hostname.data()), maxlen) != 0) {
    throw Error("gethostname failed", ErrorCode::SystemError);
  }
  int i = 0;
  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1)) i++;
  hostname[i] = '\0';
  return hostname.substr(0, i);
}

}  // namespace mscclpp
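The Timer timeout works by arming a process-wide SIGALRM whose handler throws mscclpp::Error with ErrorCode::Timeout, so it can bound any blocking host-side call; utils_tests.cc below relies on exactly this. A usage sketch — `blockingCall` is a placeholder, and since SIGALRM is process-wide, only one armed Timer should be live at a time:

```cpp
#include <mscclpp/utils.hpp>

void blockingCall();  // placeholder, e.g. a bootstrap handshake

void boundedWait() {
  mscclpp::Timer timer(/*timeout=*/5);  // arms alarm(5); the unit is seconds
  blockingCall();  // throws ErrorCode::Timeout if it exceeds 5 seconds
  // Timer's destructor disarms the alarm and restores the default handler.
}
```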
-------------------------------------------------------------------------------- /test/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

find_package(MPI)

set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads)
if(IBVERBS_FOUND)
  list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/include)

if(MSCCLPP_USE_ROCM)
  file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
  set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
endif()

function(add_test_executable name sources)
  add_executable(${name} ${sources})
  target_link_libraries(${name} ${TEST_LIBS_COMMON} MPI::MPI_CXX)
  if(IBVERBS_FOUND)
    target_compile_definitions(${name} PRIVATE USE_IBVERBS)
  endif()
  target_include_directories(${name} ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
  target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS)
  add_test(NAME ${name} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh ${name} 2)
endfunction()

add_test_executable(allgather_test_cpp allgather_test_cpp.cu)
add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu)
add_test_executable(nvls_test nvls_test.cu)
add_test_executable(executor_test executor_test.cc)

configure_file(run_mpi_test.sh.in run_mpi_test.sh)

include(CTest)
include(FetchContent)
FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip)
option(INSTALL_GTEST OFF)
FetchContent_MakeAvailable(googletest)
include(GoogleTest)

# Unit tests
add_executable(unit_tests)
target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST})
target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(unit)
gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)

# Multi-process unit tests
add_executable(mp_unit_tests)
target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX)
target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(mp_unit)
gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST)

# mscclpp-test
add_subdirectory(mscclpp-test)
-------------------------------------------------------------------------------- /test/deploy/config: --------------------------------------------------------------------------------
Host mscclit-000000
    Port 22345
    IdentityFile /root/mscclpp/sshkey
    StrictHostKeyChecking no
Host mscclit-000001
    Port 22345
    IdentityFile /root/mscclpp/sshkey
    StrictHostKeyChecking no
-------------------------------------------------------------------------------- /test/deploy/deploy.sh: --------------------------------------------------------------------------------
set -e

# get parameter from $1
TEST_NAME=$1

KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
  ROOT_DIR="${ROOT_DIR}/mscclpp"
  SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
fi
DST_DIR="/tmp/mscclpp"
if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
  HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
else
  HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
fi
SSH_OPTION="StrictHostKeyChecking=no"

chmod 400 ${KeyFilePath}
ssh-keygen -t rsa -f sshkey -P ""

while true; do
  set +e
  parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "hostname"
  if [ $? -eq 0 ]; then
    break
  fi
  echo "Waiting for sshd to start..."
  sleep 5
done

set -e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}

# force a pull of the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
  "sudo docker pull ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
  "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
  -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
  --entrypoint /bin/bash ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"
-------------------------------------------------------------------------------- /test/deploy/hostfile: --------------------------------------------------------------------------------
azureuser@mscclit-000000
azureuser@mscclit-000001
-------------------------------------------------------------------------------- /test/deploy/hostfile_ci: --------------------------------------------------------------------------------
azureuser@10.0.0.4
-------------------------------------------------------------------------------- /test/deploy/hostfile_mpi: --------------------------------------------------------------------------------
mscclit-000000
mscclit-000001
-------------------------------------------------------------------------------- /test/deploy/perf_ndmv4.jsonl: --------------------------------------------------------------------------------
{"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":291.52, "busBw":255.08, "size":1073741824, "time":3683.13, "target":"throughput"}
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":244.61, "busBw":229.33, "size":3221225472, "time":13168.31,"target":"throughput"}
{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1112, "busBw":0.0973, "size":8192, "time":73.63, "target":"latency"}
{"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.41, "busBw":243.96, "size":1073741824, "time":7701.98, "target":"throughput"}
{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":1.25, "busBw":2.19, "size":8192, "time":6.51, "target":"latency"}
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.44, "busBw":0.83, "size":8192, "time":18.42, "target":"latency"}
{"name":"allreduce", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":139.08, "busBw":243.40, "size":1073741824, "time":7719.85, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":106.98, "busBw":187.22, "size":16777216, "time":156.81, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":116.24, "busBw":203.42, "size":33554432, "time":288.65, "target":"throughput"}
"ranksPerNode":8, "algBw":116.24, "busBw":203.42, "size":33554432, "time":288.65, "target":"throughput"} 10 | {"name":"allreduce", "kernel":5, "ranks":8, "ranksPerNode":8, "algBw":126.52,"busBw":221.418,"size":50331648, "time":397.79, "target":"throughput"} 11 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"} 12 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"} 13 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"} 14 | {"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":80.84, "busBw":151.58, "size":25165824, "time":311.28, "target":"throughput"} 15 | {"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":97.27, "busBw":182.38, "size":50331648, "time":517.43, "target":"throughput"} 16 | {"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":125.99, "busBw":236.24, "size":3221225472, "time":25565.46,"target":"throughput"} 17 | {"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"} 18 | {"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"} 19 | {"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"} 20 | -------------------------------------------------------------------------------- /test/deploy/perf_ndmv5.jsonl: -------------------------------------------------------------------------------- 1 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"} 2 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"} 3 | {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"} -------------------------------------------------------------------------------- /test/deploy/pytest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [[ $OMPI_COMM_WORLD_RANK == 0 ]] 5 | then 6 | pytest /root/mscclpp/python/test/test_mscclpp.py -x -v 7 | else 8 | pytest /root/mscclpp/python/test/test_mscclpp.py -x 2>&1 >/dev/null 9 | fi 10 | -------------------------------------------------------------------------------- /test/deploy/setup.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | mkdir -p /root/.ssh 4 | mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys 5 | chown root:root /root/.ssh/authorized_keys 6 | mv /root/mscclpp/test/deploy/config /root/.ssh/config 7 | chown root:root /root/.ssh/config 8 | chmod 400 /root/mscclpp/sshkey 9 | chown root:root /root/mscclpp/sshkey 10 | 11 | nvidia-smi -pm 1 12 | for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do 13 | nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i 14 | done 15 | 16 | if [[ "${CUDA_VERSION}" == *"11."* ]]; then 17 | pip3 install -r /root/mscclpp/python/requirements_cuda11.txt 18 | else 19 | pip3 install -r 
fi

cd /root/mscclpp && pip3 install .

mkdir -p /var/run/sshd
/usr/sbin/sshd -p 22345
-------------------------------------------------------------------------------- /test/mp_unit/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

target_sources(mp_unit_tests PRIVATE
  mp_unit_tests.cc
  bootstrap_tests.cc
  ib_tests.cu
  communicator_tests.cu
  port_channel_tests.cu
  memory_channel_tests.cu
  executor_tests.cc
)
-------------------------------------------------------------------------------- /test/mp_unit/executor_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <unistd.h>

#include <filesystem>
#include <mscclpp/env.hpp>
#include <mscclpp/npkit/npkit.hpp>

#include "mp_unit_tests.hpp"

namespace {
std::string getExecutablePath() {
  char result[PATH_MAX];
  ssize_t count = readlink("/proc/self/exe", result, PATH_MAX);
  if (count == -1) {
    throw std::runtime_error("Failed to get executable path");
  }
  return std::string(result, count);
}
}  // namespace

void ExecutorTest::SetUp() {
  MultiProcessTest::SetUp();

  MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank)));
  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
  mscclpp::UniqueId id;
  bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
  if (gEnv->rank == 0) id = bootstrap->createUniqueId();
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
  bootstrap->initialize(id);
  std::shared_ptr<mscclpp::Communicator> communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
  executor = std::make_shared<mscclpp::Executor>(communicator);
  npkitDumpDir = mscclpp::env()->npkitDumpDir;
  if (npkitDumpDir != "") {
    NpKit::Init(gEnv->rank);
  }
}

void ExecutorTest::TearDown() {
  if (npkitDumpDir != "") {
    NpKit::Dump(npkitDumpDir);
    NpKit::Shutdown();
  }
  executor.reset();
  MultiProcessTest::TearDown();
}

TEST_F(ExecutorTest, TwoNodesAllreduce) {
  if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
    GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
    return;
  }
  std::string executablePath = getExecutablePath();
  std::filesystem::path path = executablePath;
  std::filesystem::path executionFilesPath =
      path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
  mscclpp::ExecutionPlan plan(executionFilesPath.string());
  const int bufferSize = 1024 * 1024;
  std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
  executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16,
                    plan, stream);
  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
-------------------------------------------------------------------------------- /test/mscclpp-test/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
FetchContent_MakeAvailable(json)

function(add_mscclpp_test_executable name sources)
  if(MSCCLPP_USE_ROCM)
    set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
  endif()
  add_executable(${name} ${sources} common.cc)
  target_link_libraries(${name} ${TEST_LIBS_COMMON} MPI::MPI_CXX nlohmann_json::nlohmann_json)
  target_include_directories(${name} ${TEST_INC_COMMON})
endfunction()

add_mscclpp_test_executable(sendrecv_test_perf sendrecv_test.cu)
add_mscclpp_test_executable(allgather_test_perf allgather_test.cu)
add_mscclpp_test_executable(allreduce_test_perf allreduce_test.cu)
add_mscclpp_test_executable(alltoall_test_perf alltoall_test.cu)
-------------------------------------------------------------------------------- /test/mscclpp-test/check_perf_result.py: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging


def load_perf_file(perf_file: str) -> dict:
    res = {}
    with open(perf_file, "r") as f:
        for line in f:
            data = json.loads(line)
            res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])] = {
                "algBw": data["algBw"],
                "busBw": data["busBw"],
                "time": data["time"],
            }
            if "target" in data:
                res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[
                    "target"
                ]
    return res


def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float, bandwidth_threshold: float) -> bool:
    res = True
    threshold = None
    for key, value in perf_result.items():
        if key not in baseline:
            continue
        if baseline[key]["target"] == "latency":
            threshold = time_threshold
        else:
            threshold = bandwidth_threshold
        if abs(value["time"] - baseline[key]["time"]) / baseline[key]["time"] > threshold:
            logging.error(
                "%s: time %f does not match baseline %f within threshold %f",
                str(key),
                value["time"],
                baseline[key]["time"],
                threshold,
            )
            res = False
    return res


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--perf-file", type=str, required=True)
    parser.add_argument("--baseline-file", type=str, required=True)
    # We use different thresholds for latency and bandwidth: latency runs use
    # small data sizes, which introduces more variance, while bandwidth runs are more stable.
    parser.add_argument("--time-threshold", type=float, default=0.15)
    parser.add_argument("--bandwidth-threshold", type=float, default=0.05)
    args = parser.parse_args()

    perf_result = load_perf_file(args.perf_file)
    baseline = load_perf_file(args.baseline_file)
    if check_perf_result(perf_result, baseline, args.time_threshold, args.bandwidth_threshold):
        print("PASS")
    else:
        print("FAIL")
        exit(1)
-------------------------------------------------------------------------------- /test/run_mpi_test.sh.in: --------------------------------------------------------------------------------
#!/bin/bash

if [ $# -lt 2 ]; then
  echo "Usage: $0 <test_name> <np> [test_args]"
  exit 1
fi
test_name=$1
np=$2
shift 2  # Pass the rest of the arguments to the test

mpirun --bind-to numa --tag-output -x MSCCLPP_DEBUG=INFO -x LD_LIBRARY_PATH=@CMAKE_BINARY_DIR@:$LD_LIBRARY_PATH -np $np @CMAKE_CURRENT_BINARY_DIR@/$test_name $@
-------------------------------------------------------------------------------- /test/unit/CMakeLists.txt: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

target_sources(unit_tests PRIVATE
  core_tests.cc
  cuda_utils_tests.cc
  errors_tests.cc
  fifo_tests.cu
  numa_tests.cc
  socket_tests.cc
  utils_tests.cc
  utils_internal_tests.cc
  compile_tests.cu
)
-------------------------------------------------------------------------------- /test/unit/compile_tests.cu: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <cassert>

// Force-enable assertions for this translation unit before pulling in the
// device-side assert header.
#undef NDEBUG
#ifndef DEBUG_BUILD
#define DEBUG_BUILD
#endif  // DEBUG_BUILD
#include <mscclpp/assert_device.hpp>

#include <gtest/gtest.h>

TEST(CompileTest, Assert) { assert(true); }
-------------------------------------------------------------------------------- /test/unit/core_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>
#include <mscclpp/core.hpp>

#include <memory>

class LocalCommunicatorTest : public ::testing::Test {
 protected:
  void SetUp() override {
    bootstrap = std::make_shared<mscclpp::TcpBootstrap>(0, 1);
    bootstrap->initialize(bootstrap->createUniqueId());
    comm = std::make_shared<mscclpp::Communicator>(bootstrap);
  }

  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
  std::shared_ptr<mscclpp::Communicator> comm;
};

TEST_F(LocalCommunicatorTest, RegisterMemory) {
  int dummy[42];
  auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
  EXPECT_EQ(memory.data(), &dummy);
  EXPECT_EQ(memory.size(), sizeof(dummy));
  EXPECT_EQ(memory.transports(), mscclpp::NoTransports);
}

TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
  int dummy[42];
  auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
  comm->sendMemory(memory, 0, 0);
  auto memoryFuture = comm->recvMemory(0, 0);
  auto sameMemory = memoryFuture.get();
  EXPECT_EQ(sameMemory.data(), memory.data());
  EXPECT_EQ(sameMemory.size(), memory.size());
  EXPECT_EQ(sameMemory.transports(), memory.transports());
}
-------------------------------------------------------------------------------- /test/unit/cuda_utils_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mscclpp/gpu_utils.hpp>

#include <gtest/gtest.h>

TEST(CudaUtilsTest, AllocShared) {
  auto p1 = mscclpp::detail::gpuCallocShared<int>();
  auto p2 = mscclpp::detail::gpuCallocShared<int>(5);
}

TEST(CudaUtilsTest, AllocUnique) {
  auto p1 = mscclpp::detail::gpuCallocUnique<int>();
  auto p2 = mscclpp::detail::gpuCallocUnique<int>(5);
}

TEST(CudaUtilsTest, MakeSharedHost) {
  auto p1 = mscclpp::detail::gpuCallocHostShared<int>();
  auto p2 = mscclpp::detail::gpuCallocHostShared<int>(5);
}

TEST(CudaUtilsTest, MakeUniqueHost) {
  auto p1 = mscclpp::detail::gpuCallocHostUnique<int>();
  auto p2 = mscclpp::detail::gpuCallocHostUnique<int>(5);
}

TEST(CudaUtilsTest, Memcpy) {
  const int nElem = 1024;
  std::vector<int> hostBuff(nElem);
  for (int i = 0; i < nElem; ++i) {
    hostBuff[i] = i + 1;
  }
  std::vector<int> hostBuffTmp(nElem, 0);
  auto devBuff = mscclpp::detail::gpuCallocShared<int>(nElem);
  mscclpp::gpuMemcpy(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice);
  mscclpp::gpuMemcpy(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost);

  for (int i = 0; i < nElem; ++i) {
    EXPECT_EQ(hostBuff[i], hostBuffTmp[i]);
  }
}
-------------------------------------------------------------------------------- /test/unit/errors_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mscclpp/errors.hpp>

#include <gtest/gtest.h>

TEST(ErrorsTest, SystemError) {
  mscclpp::Error error("test", mscclpp::ErrorCode::SystemError);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: SystemError)"));
}

TEST(ErrorsTest, InternalError) {
  mscclpp::Error error("test", mscclpp::ErrorCode::InternalError);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: InternalError)"));
}

TEST(ErrorsTest, InvalidUsage) {
  mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: InvalidUsage)"));
}

TEST(ErrorsTest, Timeout) {
  mscclpp::Error error("test", mscclpp::ErrorCode::Timeout);
  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout);
  EXPECT_EQ(error.what(), std::string("test (Mscclpp failure: Timeout)"));
}
-------------------------------------------------------------------------------- /test/unit/fifo_tests.cu: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>

#include <iostream>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>

#define ITER 10000  // should be larger than the FIFO size for proper testing

__constant__ mscclpp::FifoDeviceHandle gFifoTestFifoDeviceHandle;
__global__ void kernelFifoTest() {
  if (threadIdx.x + blockIdx.x * blockDim.x != 0) return;

  mscclpp::FifoDeviceHandle& fifo = gFifoTestFifoDeviceHandle;
  mscclpp::ProxyTrigger trigger;
  for (uint64_t i = 1; i < ITER + 1; ++i) {
    trigger.fst = i;
    trigger.snd = i;
    uint64_t curFifoHead = fifo.push(trigger);
    if (i % fifo.size == 0) {
      fifo.sync(curFifoHead);
    }
  }
}

TEST(FifoTest, Fifo) {
  int cudaNum;
  MSCCLPP_CUDATHROW(cudaGetDevice(&cudaNum));
  int numaNode = mscclpp::getDeviceNumaNode(cudaNum);
  mscclpp::numaBind(numaNode);

  mscclpp::Fifo hostFifo;
  if (hostFifo.size() >= ITER) {
    FAIL() << "ITER is too small for proper testing.";
  }

  mscclpp::FifoDeviceHandle devFifo = hostFifo.deviceHandle();
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gFifoTestFifoDeviceHandle, &devFifo, sizeof(devFifo)));

  kernelFifoTest<<<1, 1>>>();
  MSCCLPP_CUDATHROW(cudaGetLastError());

  mscclpp::ProxyTrigger trigger;
  trigger.fst = 0;
  trigger.snd = 0;

  uint64_t spin = 0;
  uint64_t flushCnt = 0;
  mscclpp::Timer timer(3);
  for (uint64_t i = 0; i < ITER; ++i) {
    trigger = hostFifo.poll();
    while (trigger.fst == 0 || trigger.snd == 0) {
      trigger = hostFifo.poll();

      if (spin++ > 1000000) {
        FAIL() << "Polling is stuck.";
      }
    }
    // see `src/proxy.cc` for the reason of this line
    trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
    ASSERT_TRUE(trigger.fst == (i + 1));
    ASSERT_TRUE(trigger.snd == (i + 1));
    hostFifo.pop();
    if ((++flushCnt % hostFifo.size()) == 0) {
      hostFifo.flushTail();
    }
    spin = 0;
  }
  hostFifo.flushTail(true);

  std::stringstream ss;
  ss << "FifoTest.Fifo: " << (float)timer.elapsed() / ITER << " us/iter\n";
  std::cout << ss.str();
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
}
-------------------------------------------------------------------------------- /test/unit/numa_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>

#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>

TEST(NumaTest, Basic) {
  int num;
  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num));
  if (num == 0) {
    return;
  }
  for (int i = 0; i < num; i++) {
    int numaNode = mscclpp::getDeviceNumaNode(i);
    EXPECT_GE(numaNode, 0);
    mscclpp::numaBind(numaNode);
  }
}
-------------------------------------------------------------------------------- /test/unit/socket_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>

#include <mscclpp/utils.hpp>
#include <thread>

#include "socket.h"

TEST(Socket, ListenAndConnect) {
  mscclpp::Timer timeout(3);

  std::string ipPortPair = "127.0.0.1:51512";
  mscclpp::SocketAddress listenAddr;

  ASSERT_NO_THROW(mscclpp::SocketGetAddrFromString(&listenAddr, ipPortPair.c_str()));

  mscclpp::Socket listenSock(&listenAddr);
  listenSock.bindAndListen();

  std::thread clientThread([&listenAddr]() {
    mscclpp::Socket sock(&listenAddr);
    sock.connect();
  });

  mscclpp::Socket sock;
  sock.accept(&listenSock);

  clientThread.join();
}
-------------------------------------------------------------------------------- /test/unit/utils_internal_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <gtest/gtest.h>

#include <thread>

#include "utils_internal.hpp"

TEST(UtilsInternalTest, getHostHash) {
  uint64_t hash1 = mscclpp::getHostHash();
  uint64_t hash2;

  std::thread th([&hash2]() { hash2 = mscclpp::getHostHash(); });

  ASSERT_TRUE(th.joinable());
  th.join();

  EXPECT_EQ(hash1, hash2);
}
-------------------------------------------------------------------------------- /test/unit/utils_tests.cc: --------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <gtest/gtest.h>
#include <unistd.h>

#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <thread>

TEST(UtilsTest, Timer) {
  mscclpp::Timer timer;
  sleep(1);
  int64_t elapsed = timer.elapsed();
  EXPECT_GE(elapsed, 1000000);

  timer.reset();
  sleep(1);
  elapsed = timer.elapsed();
  EXPECT_GE(elapsed, 1000000);
  EXPECT_LT(elapsed, 1100000);
}

TEST(UtilsTest, TimerTimeout) {
  mscclpp::Timer timer(1);
  ASSERT_THROW(sleep(2), mscclpp::Error);
}

TEST(UtilsTest, TimerTimeoutReset) {
  mscclpp::Timer timer(3);
  sleep(2);
  // Resetting the timer should prevent the timeout.
  timer.reset();
  ASSERT_NO_THROW(sleep(2));

  // Elapsed time should be slightly larger than 2 seconds.
  EXPECT_GT(timer.elapsed(), 2000000);
  EXPECT_LT(timer.elapsed(), 2100000);
}

TEST(UtilsTest, ScopedTimer) {
  mscclpp::ScopedTimer timerA("UtilsTest.ScopedTimer.A");
  mscclpp::ScopedTimer timerB("UtilsTest.ScopedTimer.B");
  sleep(1);
  int64_t elapsedA = timerA.elapsed();
  int64_t elapsedB = timerB.elapsed();
  EXPECT_GE(elapsedA, 1000000);
  EXPECT_GE(elapsedB, 1000000);
}

TEST(UtilsTest, getHostName) {
  std::string hostname1 = mscclpp::getHostName(1024, '.');
  EXPECT_FALSE(hostname1.empty());
  EXPECT_LE(hostname1.size(), 1024);

  EXPECT_EQ(mscclpp::getHostName(1024, hostname1[0]).size(), 0);

  std::string hostname2;

  std::thread th([&hostname2]() { hostname2 = mscclpp::getHostName(1024, '.'); });

  ASSERT_TRUE(th.joinable());
  th.join();

  EXPECT_EQ(hostname1, hostname2);
}
-------------------------------------------------------------------------------- /tools/npkit/build_and_run_npkit.sh: --------------------------------------------------------------------------------
set -e

MSCCLPP_SRC_DIR="/mnt/mscclpp"
NPKIT_RUN_DIR="/mnt/npkit_run"
MPI_HOME="/usr/local/mpi"
HOSTFILE="hostfile"
LEADER_IP_PORT="10.6.0.4:50000"

cd ${MSCCLPP_SRC_DIR}
make clean
MPI_HOME=${MPI_HOME} make -j NPKIT=1

parallel-ssh -h ${HOSTFILE} "rm -rf ${NPKIT_RUN_DIR}"
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}"
parallel-scp -r -h ${HOSTFILE} ${MSCCLPP_SRC_DIR} ${NPKIT_RUN_DIR}
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}/npkit_dump"
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}/npkit_trace"

# --bind-to numa is required because hardware timers from different cores (or core groups) can be unsynchronized.
mpirun --allow-run-as-root -hostfile ${HOSTFILE} -map-by ppr:8:node --bind-to numa -x LD_PRELOAD=${NPKIT_RUN_DIR}/mscclpp/build/lib/libmscclpp.so -x MSCCLPP_DEBUG=WARN -x MSCCLPP_NPKIT_DUMP_DIR=${NPKIT_RUN_DIR}/npkit_dump ${NPKIT_RUN_DIR}/mscclpp/build/bin/tests/allgather_test -ip_port ${LEADER_IP_PORT} -kernel 0

parallel-ssh -h ${HOSTFILE} "cd ${NPKIT_RUN_DIR}/mscclpp/tools/npkit && python npkit_trace_generator.py --npkit_dump_dir ${NPKIT_RUN_DIR}/npkit_dump --npkit_event_header_path ${NPKIT_RUN_DIR}/mscclpp/src/include/npkit/npkit_event.h --output_dir ${NPKIT_RUN_DIR}/npkit_trace"
--------------------------------------------------------------------------------