├── .github └── workflows │ ├── codeql.yml │ └── tests.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── SYNTHESIS.md ├── dockerfiles └── Dockerfile ├── examples ├── dgx1_allgather.ipynb ├── mscclang │ ├── allgather_a100_pcie.py │ ├── allgather_allpairs.py │ ├── allgather_recursive_doubling.py │ ├── allgather_ring.py │ ├── allreduce_1step.py │ ├── allreduce_a100_allpairs.py │ ├── allreduce_a100_allpairs_v2.py │ ├── allreduce_a100_multinode_allpairs.py │ ├── allreduce_a100_ncv4.py │ ├── allreduce_a100_ncv4_v2.py │ ├── allreduce_a100_pcie_hierarchical.py │ ├── allreduce_a100_recursive_doubling_halving.py │ ├── allreduce_a100_ring.py │ ├── allreduce_binomial_tree.py │ ├── allreduce_dgx1.py │ ├── allreduce_ndv2.py │ ├── allreduce_recursive_doubling_halving.py │ ├── alltoall_a100_three_step.py │ ├── alltoall_a100_two_step.py │ ├── alltoall_allpairs.py │ ├── alltonext_backward.py │ ├── alltonext_forward.py │ ├── hierarchical_allreduce.py │ ├── pipeline_a100_allpairs.py │ ├── pipeline_a100_ring.py │ ├── reducegather.py │ └── simple │ │ ├── allgather_ring.py │ │ ├── allreduce_ring.py │ │ └── custom_collective.py ├── requirements_sccl_init.txt ├── sccl_init.py ├── send.py └── unpermute_dgx1.py ├── msccl ├── __init__.py ├── __main__.py ├── algorithm.py ├── autosynth │ ├── __init__.py │ ├── msccl_ndv2_launcher.sh │ ├── ndv2_plans.py │ ├── ndv4_plans.py │ └── registry.py ├── cli │ ├── __init__.py │ ├── analyze.py │ ├── common.py │ ├── compose.py │ ├── distribute.py │ ├── known_collectives.py │ ├── known_distributed_topologies.py │ ├── known_topologies.py │ ├── known_transformers.py │ ├── ncclize.py │ ├── plans.py │ └── solve.py ├── collectives.py ├── composers.py ├── distributors │ ├── __init__.py │ ├── alltoall_subproblem.py │ ├── gather_scatter_alltoall.py │ └── greedy_alltoall.py ├── instance.py ├── isomorphisms.py ├── language │ ├── __init__.py │ ├── buffer.py │ ├── chunk.py │ ├── collectives.py │ ├── ir.py │ ├── passes.py │ ├── rank_dag.py │ ├── routines.py │ ├── tb_assignment.py │ └── visualize.py ├── ncclize.py ├── ncd_reduction.py ├── path_encoding.py ├── programs │ ├── __init__.py │ ├── allreduce_a100_ring.py │ ├── allreduce_allpairs.py │ ├── alltoall_a100_8kp1.py │ └── alltoall_a100_yifan.py ├── rounds_bound.py ├── serialization.py ├── steps_bound.py ├── strategies.py └── topologies │ ├── __init__.py │ ├── amd.py │ ├── distributed.py │ ├── generic.py │ ├── nvidia.py │ ├── topology.py │ └── transformers.py ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── common.py ├── test_algorithm.py ├── test_analyses.py ├── test_autosynth.py ├── test_cli.py ├── test_distributors.py ├── test_language.py ├── test_path_encoding.py ├── test_programs.py ├── test_serialization.py └── test_topologies.py /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '16 14 * * 2' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | 24 | - name: Initialize CodeQL 25 | uses: github/codeql-action/init@v1 26 | with: 27 | languages: python 28 | 29 | - name: Perform CodeQL Analysis 30 | uses: github/codeql-action/analyze@v1 31 | 
-------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: [ main ] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | matrix: 14 | python-version: [3.6, 3.7, 3.8, 3.9] 15 | 16 | name: Test with Python ${{ matrix.python-version }} 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install msccl and dependencies 25 | run: | 26 | pip install --upgrade pip 27 | pip install -r requirements.txt 28 | - name: Run tests and check at least 90% coverage 29 | run: | 30 | pytest 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # MSCCL specific 2 | *.msccl.json 3 | *.msccl.xml 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /SYNTHESIS.md: -------------------------------------------------------------------------------- 1 | ## Synthesizing Algorithms 2 | 3 | MSCCL can synthesize algorithms for a given *topology* that implement a given *collective* within given constraints on the number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. 4 | 5 | MSCCL groups its solver strategies under the `msccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: 6 | ``` 7 | $ msccl solve instance DGX1 Allgather --steps 4 8 | Solving instance steps=4... synthesized! (0.7s) 9 | Wrote to Allgather.n8-DGX1-steps4.msccl.json 10 | ``` 11 | The instance is satisfiable and `msccl` saves it to a file. 12 | 13 | Four steps is not necessarily the least number of steps required. To find the least number of steps: 14 | ``` 15 | $ msccl solve least-steps DGX1 Allgather 16 | Algorithms need at least 2 steps. 17 | Solving instance steps=2... synthesized! (0.2s) 18 | Wrote to Allgather.n8-DGX1-steps2.msccl.json 19 | ``` 20 | The `least-steps` strategy statically determines that any Allgather in a DGX-1 requires at least 2 steps and, starting from that bound, finds the smallest satisfiable number of steps. 21 | 22 | While this two-step algorithm is latency-optimal, there may be other algorithms that achieve higher bandwidth. The `pareto-optimal` strategy searches through different latency-bandwidth tradeoffs: 23 | ``` 24 | $ msccl solve pareto-optimal DGX1 Allgather 25 | Algorithms need at least 2 steps. 26 | Algorithms need at least 7/6 rounds per chunk. 27 | Solving instance steps=2... synthesized! (0.5s) 28 | Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.9s) 29 | Solving instance steps=2,rounds=4,chunks=3... unsatisfiable.
(1.1s) 30 | Assuming 2 step algorithms need at least 4/3 rounds per chunk. 31 | Solving instance steps=3,rounds=4,chunks=3... synthesized! (2.9s) 32 | Solving instance steps=3,rounds=5,chunks=4... synthesized! (6.5s) 33 | Solving instance steps=3,rounds=6,chunks=5... synthesized! (44.0s) 34 | Solving instance steps=3,rounds=7,chunks=6... synthesized! (56.1s) 35 | Bandwidth optimal algorithm found! 36 | Found 2 Pareto optimal algorithms. Pruned 4 non-optimal algorithms. 37 | Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.msccl.json 38 | Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.msccl.json 39 | ``` 40 | 41 | ## Collectives 42 | 43 | MSCCL includes a number of built-in common collectives. 44 | 45 | | Collective | Arguments | Description | Kind | 46 | | - | - | - | - | 47 | | Broadcast | `--root N` | Send data from root to all nodes. | NC | 48 | | Reduce | `--root N` | Combine data from all nodes to root. | CR | 49 | | Scatter | `--root N` | Send slices of data from root to all nodes. | NC | 50 | | Gather | `--root N` | Send slices of data from all nodes to root. | NC | 51 | | Allgather | | Send slices of data from all nodes to all nodes. | NC | 52 | | Allreduce | | Combine data from all nodes to all nodes. | CNR | 53 | | Alltoall | | Transpose data between all nodes. | NC | 54 | | ReduceScatter | | Combine slices of data to all nodes. | CR | 55 | | Scan | | Combine partial prefixes of data to all nodes in sequence. | CNR | 56 | | MultirootBroadcast | `--roots N [N ...]` | Like Broadcast, but a set of nodes has slices of the input. | NC | 57 | | MultirootScatter | `--roots N [N ...]` | Like Scatter, but a set of nodes has slices of the input. | NC | 58 | | MultirootGather | `--roots N [N ...]` | Like Gather, but output is sent in slices to a set of nodes. | NC | 59 | | custom | `--collective-file` | Arbitrary collective serialized by the user. | ? | 60 | 61 | Custom collectives may be defined by instantiating the `Collective` class, which is most easily done through the `build_collective` function. For example, a send from rank 2 to rank 7 in an 8-node topology can be defined and saved with: 62 | ``` 63 | from msccl.collectives import build_collective 64 | from msccl.serialization import save_msccl_object 65 | 66 | precondition = lambda r, c: r == 2 67 | postcondition = lambda r, c: r == 7 68 | coll = build_collective('Send', 8, 1, precondition, postcondition) 69 | save_msccl_object(coll, 'send.json') 70 | ``` 71 | 72 | The *kind* of the collective determines support for some features of MSCCL: 73 | - **NC** are non-combining collectives, and are always supported. 74 | - **CR** are combining collectives that have a non-combining dual collective, and are supported through a reduction. 75 | - **CNR** are combining collectives with no dual, which may not always be supported. 76 | 77 | Currently the rounds per chunk analysis described below cannot support CNR collectives. 78 | 79 | ## Steps and Rounds 80 | 81 | MSCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* counts how many sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. However, a step may consist of multiple *rounds*, which act as a bandwidth multiplier for all links in the topology during that step.
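For example, in the `pareto-optimal` search above, the `steps=2,rounds=3,chunks=2` Allgather splits each rank's data into 2 chunks and pays for 3 rounds of bandwidth, i.e. 3/2 rounds per chunk, while the `steps=3,rounds=7,chunks=6` algorithm uses only 7/6 rounds per chunk. Since 7/6 matches the lower bound printed by the solver, the latter is bandwidth optimal, at the cost of one extra step of latency.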
82 | 83 | How much data a single round corresponds to depends on the actual size of a chunk at runtime, and the number of chunks a collective uses can change (e.g. you can control this directly in the `instance` strategy by setting `--chunks N`). Thus, for each collective, the total data usage of the different algorithms implementing it can be compared by their *rounds per chunk*. 84 | 85 | MSCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance. For example, to find the least rounds per chunk for a Gather in a DGX-1: 86 | ``` 87 | $ msccl analyze rounds DGX1 Gather 88 | Gather(n=8,root=0) algorithms need at least 7/6 rounds in DGX1 topology. 89 | ``` 90 | In this case the bound happens to be tight, and the `pareto-optimal` strategy would use it to detect that it has found a bandwidth-optimal algorithm. 91 | 92 | ## Distributed Algorithms 93 | 94 | MSCCL provides routines to synthesize algorithms for distributed topologies under the `msccl distribute` subcommand. These work by taking algorithms for a local collective and stitching instances of them together to create a distributed one. 95 | 96 | **Alltoall from Gather and Scatter:** `alltoall-gather-scatter` combines a Gather and a Scatter algorithm with a transpose step in the middle to form a distributed Alltoall algorithm. For example, an Alltoall algorithm for a cluster of 4 DGX-1 machines can be created with: 97 | ``` 98 | msccl solve least-steps DGX1 Gather -o gather.json 99 | msccl solve least-steps DGX1 Scatter -o scatter.json --root 1 100 | msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json 101 | ``` 102 | This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. MSCCL also provides multi-root versions of Gather and Scatter that can be substituted here.
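As a sketch, the multi-root variants could be substituted like this (a hypothetical invocation: the `--roots` argument follows the collectives table above, and the root choices here are illustrative):
```
msccl solve least-steps DGX1 MultirootGather --roots 0 1 -o gather.json
msccl solve least-steps DGX1 MultirootScatter --roots 0 1 -o scatter.json
msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json
```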
103 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel 2 | 3 | ############################################################################## 4 | # Temporary Installation Directory 5 | ############################################################################## 6 | ENV STAGE_DIR=/tmp 7 | RUN mkdir -p ${STAGE_DIR} 8 | 9 | 10 | ############################################################################## 11 | # Installation/Basic Utilities 12 | ############################################################################## 13 | RUN apt-get update && \ 14 | apt-get install -y --allow-change-held-packages --no-install-recommends \ 15 | software-properties-common \ 16 | build-essential autotools-dev cmake g++ gcc \ 17 | openssh-client openssh-server \ 18 | nfs-common pdsh curl sudo net-tools \ 19 | vim iputils-ping wget perl unzip 20 | 21 | ############################################################################## 22 | # Installation Latest Git 23 | ############################################################################## 24 | RUN add-apt-repository ppa:git-core/ppa -y && \ 25 | apt-get update && \ 26 | apt-get install -y git && \ 27 | git --version 28 | 29 | ############################################################################## 30 | # Pip 31 | ############################################################################## 32 | # pip version <= 20.1.1 is needed for the ruamel.yaml installation conflict 33 | # between conda and pip. ruamel.yaml is needed by azureml. 34 | # https://github.com/Azure/MachineLearningNotebooks/issues/1110 for more info. 35 | ENV PIP_VERSION=20.1.1 36 | RUN conda install -y pip=${PIP_VERSION} && \ 37 | # Print python and pip version 38 | python -V && pip -V 39 | 40 | ############################################################################## 41 | # MPI 42 | ############################################################################## 43 | RUN cd ${STAGE_DIR} && mkdir openmpi/ && cd openmpi && wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.1.tar.gz && \ 44 | tar zxf openmpi-4.0.1.tar.gz && \ 45 | cd openmpi-4.0.1 && \ 46 | ./configure --enable-orterun-prefix-by-default && \ 47 | make -j $(nproc) all && \ 48 | make install && \ 49 | ldconfig && \ 50 | rm -rf ${STAGE_DIR}/openmpi/ 51 | 52 | ############################################################################## 53 | # MSCCL 54 | ############################################################################## 55 | 56 | # update NCCL in pytorch, install MSCCL interpreter 57 | RUN pip uninstall torch -y 58 | 59 | RUN pip install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses 60 | 61 | RUN conda install -c pytorch magma-cuda111 -y 62 | 63 | ENV CMAKE_PREFIX_PATH=/opt/conda 64 | 65 | # Change NCCL to MSCCL Runtime 66 | RUN cd ${STAGE_DIR} && \ 67 | git clone https://github.com/pytorch/pytorch.git && \ 68 | cd pytorch && \ 69 | git checkout tags/v1.9.0 -b v1.9.0_msccl && \ 70 | perl -p -i -e 's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules && \ 71 | git submodule sync third_party/nccl && \ 72 | git submodule update --init --recursive && \ 73 | git submodule update --init --recursive --remote third_party/nccl && \ 74 | cd third_party/nccl/nccl/ && \ 75 | git checkout master &&
\ 76 | cd ../../../ && \ 77 | git apply third_party/nccl/nccl/patches/nccl.cpp.patch && \ 78 | python setup.py install && \ 79 | cd ${STAGE_DIR} && \ 80 | rm -rf ${STAGE_DIR}/pytorch 81 | 82 | # Install MSCCL 83 | RUN cd ${STAGE_DIR}/ && \ 84 | git clone https://github.com/microsoft/msccl.git && \ 85 | cd msccl/ && python setup.py install && \ 86 | cd ${STAGE_DIR} && \ 87 | rm -rf ${STAGE_DIR}/msccl/ 88 | 89 | ############################################################################## 90 | # inspector-topo 91 | ############################################################################## 92 | 93 | RUN apt-get install libibverbs-dev libnuma-dev -y 94 | RUN cd ${STAGE_DIR}/ && git clone https://github.com/microsoft/inspector-topo.git && \ 95 | cd inspector-topo/ && make && make install 96 | 97 | -------------------------------------------------------------------------------- /examples/mscclang/allgather_a100_pcie.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # Hierarchical allgather for A100 10 | def allgather_hier(gpus, instances, protocol): 11 | size = gpus 12 | chunksperloop = 1 13 | topology = fully_connected(gpus) 14 | collective = AllGather(size, chunksperloop, True) 15 | 16 | with MSCCLProgram("allgather_hierarchical", topology, collective, instances, protocol=protocol, 17 | interleaved_replication=True, dependence_nop=True): 18 | for chnk in range(2): 19 | for r in range(size): 20 | if ((r % 2) == chnk): 21 | c = chunk(r, Buffer.input, 0) 22 | c.copy(r + 1 - 2 * chnk, Buffer.output, r) 23 | for r in range(size): 24 | if ((r % 2) == chnk): 25 | c = chunk(r, Buffer.input, 0) 26 | c.copy((r+2) % size, Buffer.output, r) 27 | for r in range(size): 28 | if ((r % 2) == chnk): 29 | c = chunk(r, Buffer.output, (r+2) % size) 30 | c.copy(r + 1 - 2 * chnk, Buffer.output, (r+2) % size) 31 | 32 | XML() 33 | Check() 34 | 35 | 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 38 | parser.add_argument('instances', type=int, help='number of instances') 39 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 40 | args = parser.parse_args() 41 | 42 | allgather_hier(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allgather_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
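# The schedule below completes in a single communication step: every rank pushes its chunk directly to every other rank (n*(n-1) parallel copies), instead of the size-1 sequential steps a ring takes.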
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # Allpairs allgather for A100 10 | def allgather_allpairs(gpus, instances, protocol): 11 | size = gpus 12 | topology = fully_connected(gpus) 13 | collective = AllGather(size, 1, True) 14 | 15 | with MSCCLProgram(f"allgather_allpairs", topology, collective, instances, 16 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 17 | 18 | # Each rank sends its chunk to all other gpus 19 | for r1 in range(gpus): 20 | for r2 in range(gpus): 21 | if r1 != r2: 22 | index = 0 23 | c = chunk(r1, Buffer.input, index, 1) 24 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 25 | XML() 26 | Check() 27 | 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 31 | parser.add_argument('instances', type=int, help='number of instances') 32 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 33 | args = parser.parse_args() 34 | 35 | allgather_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allgather_recursive_doubling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # https://web.cels.anl.gov/~thakur/papers/mpi-coll.pdf 10 | def allgather_recursive_doubling(size, instances, protocol): 11 | topology = fully_connected(size) 12 | collective = AllGather(size, 1, True) 13 | with MSCCLProgram("allgather_recursive_doubling", topology, collective, instances, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 14 | count = 1 15 | while count < size: 16 | # Every rank exchanges count chunks with neighbor count away 17 | for rank in range(size): 18 | peer = rank ^ count 19 | index = (rank // count) * count 20 | chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank) 21 | count *= 2 22 | 23 | XML() 24 | Check() 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 28 | parser.add_argument('instances', type=int, help ='number of instances') 29 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 30 | args = parser.parse_args() 31 | 32 | allgather_recursive_doubling(args.num_gpus, args.instances, args.protocol) 33 | -------------------------------------------------------------------------------- /examples/mscclang/allgather_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # Ring allgather for A100s 10 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs.
11 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel 12 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data 13 | def allgather_ring(size, channels, instances, protocol): 14 | topology = fully_connected(size) 15 | collective = AllGather(size, 1, True) 16 | with MSCCLProgram(f"allgather_ring_{channels}channelsperring", topology, collective, instances, 17 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 18 | for step in range(0, size-1): 19 | for index in range(0, size): 20 | rank = (index + step) % size 21 | c = chunk(rank, Buffer.output, index) 22 | next_rank = (index + step + 1) % size 23 | channel = index%channels 24 | c = c.copy(next_rank, Buffer.output, index, sendtb=channel, recvtb=channel, ch=channel) 25 | XML() 26 | Check() 27 | 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 31 | parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') 32 | parser.add_argument('instances', type=int, help='number of instances') 33 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 34 | args = parser.parse_args() 35 | 36 | allgather_ring(args.num_gpus, args.channels, args.instances, args.protocol) 37 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_1step.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | 17 | # Each rank sends its whole input to every other rank's scratch space 18 | for rank in range(size): 19 | tb = 0 20 | for nghr in range(size): 21 | if rank != nghr: 22 | c = chunk(rank, Buffer.input, index=0, size=size) 23 | c.copy(nghr, 'scratch', sendtb=nghr, recvtb=rank) 24 | tb += 1 25 | 26 | # Each rank performs a local reduction of all received chunks 27 | # Utilize one threadblock per chunk for this reduction for better parallelism 28 | for rank in range(size): 29 | index = 0 30 | tb = 0 31 | for nghr in range(size): 32 | if rank != nghr: 33 | for s in range(size): 34 | c = chunk(rank, Buffer.input, s) 35 | c.reduce(chunk(rank, 'scratch', index), sendtb=s) 36 | index += 1 37 | tb += 1 38 | 39 | XML() 40 | Check() 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 44 | parser.add_argument('instances', type=int, help='number of instances') 45 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 46 | 47 | args = parser.parse_args() 48 | 49 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_allpairs.py: -------------------------------------------------------------------------------- 1 | #
Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus * gpus 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | 17 | # Each rank sends the nth chunk to the nth rank into scratch space 18 | for r1 in range(size): 19 | for r2 in range(size): 20 | if r1 != r2: 21 | index = r2 * size 22 | c = chunk(r1, Buffer.input, index, size=size) 23 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 24 | 25 | # Each rank performs a local reduction on the nth chunk 26 | # Utilize 8 threadblocks for this reduction for better parallelism 27 | for r in range(size): 28 | for index in range(0, size * (size-1)): 29 | c = chunk(r, Buffer.input, r*size + (index % size)) 30 | c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 31 | 32 | # Each rank sends the fully reduced nth chunk to all other gpus 33 | for r1 in range(size): 34 | for r2 in range(size): 35 | if r1 != r2: 36 | index = r1 * size 37 | c = chunk(r1, Buffer.input, index, size) 38 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 39 | 40 | XML() 41 | Check() 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 45 | parser.add_argument('instances', type=int, help='number of instances') 46 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 47 | 48 | args = parser.parse_args() 49 | 50 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_allpairs_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
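# Variant of allreduce_a100_allpairs where the local reduction of the received scratch chunks is performed as a log2(size)-level binary tree (the loop over k below) instead of a flat loop over all size*(size-1) chunks.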
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | import math 9 | 10 | def allreduce_allpairs(gpus, instances, protocol): 11 | size = gpus 12 | chunksperloop = gpus 13 | topology = fully_connected(size) 14 | collective = AllReduce(size, chunksperloop, True) 15 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 16 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=False): 17 | 18 | # Each rank sends the nth chunk to the nth rank into scratch space 19 | for r1 in range(size): 20 | for r2 in range(size): 21 | if r1 != r2: 22 | index = r2 23 | c = chunk(r1, Buffer.input, index) 24 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 25 | 26 | # Each rank performs a local reduction on the nth chunk 27 | # Utilize 8 threadblocks for this reduction for better parallelism 28 | for r in range(size): 29 | for k in range(1,int(math.log2(size)+1)): 30 | level = 2**k 31 | for index in range(0, size//level): 32 | if index == 0: 33 | c = chunk(r, Buffer.input, r) 34 | else: 35 | c = chunk(r, 'scratch', (index-1)) 36 | c.reduce(chunk(r, 'scratch', (index+size//level-1)), sendtb=index) 37 | #c = chunk(r, Buffer.input, r*size + (index % size)) 38 | #c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 39 | 40 | # Each rank sends the fully reduced nth chunk to all other gpus 41 | for r1 in range(size): 42 | for r2 in range(size): 43 | if r1 != r2: 44 | index = r1 45 | c = chunk(r1, Buffer.input, index) 46 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 47 | 48 | XML() 49 | Check() 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 53 | parser.add_argument('instances', type=int, help='number of instances') 54 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 55 | 56 | args = parser.parse_args() 57 | 58 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) 59 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_multinode_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
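# Two-node extension of the allpairs allreduce: each node of `size` gpus runs the allpairs reduce-scatter locally, the node-level partial results are then exchanged and reduced across nodes through 'scratch2', and finally the allpairs allgather runs within each node.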
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus * gpus 12 | topology = fully_connected(2*size) 13 | collective = AllReduce(2*size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | # Each rank sends the nth chunk to the nth rank into scratch space 17 | for r1 in range(size): 18 | for r2 in range(size): 19 | if r1 != r2: 20 | index = r2 * size 21 | c = chunk(r1, Buffer.input, index, size=size) 22 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 23 | 24 | c2 = chunk(r1+size, Buffer.input, index, size=size) 25 | c2.copy(r2+size, 'scratch', sendtb=r2, recvtb=r1) 26 | 27 | # Each rank performs a local reduction on the nth chunk 28 | # Utilize 8 threadblocks for this reduction for better parallelism 29 | for r in range(size): 30 | for index in range(0, size * (size-1)): 31 | c = chunk(r, Buffer.input, r*size + (index % size)) 32 | c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 33 | 34 | c2 = chunk(r+size, Buffer.input, r*size + (index % size)) 35 | c2.reduce(chunk(r+size, 'scratch', index), sendtb=(index % size)) 36 | 37 | 38 | for r in range(size): 39 | index = r*size 40 | c = chunk(r, Buffer.input, index, size) 41 | c = c.copy(r+size, 'scratch2', index=0, sendtb=size, recvtb=size+1, ch=r%2) 42 | 43 | c2 = chunk(r+size, Buffer.input, index, size) 44 | c2 = c2.copy(r, 'scratch2', index=0, sendtb=size+2, recvtb=size+3, ch=r%2) 45 | 46 | chunk(r, Buffer.input, index, size).reduce(c2, sendtb=size+3, recvtb=size+4, ch=r%2) 47 | chunk(r+size, Buffer.input, index, size).reduce(c, sendtb=size+1, recvtb=size+1, ch=r%2) 48 | 49 | 50 | # Each rank sends the fully reduced nth chunk to all other gpus 51 | for r1 in range(size): 52 | for r2 in range(size): 53 | if r1 != r2: 54 | index = r1 * size 55 | c = chunk(r1, Buffer.input, index, size) 56 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 57 | 58 | c2 = chunk(r1+size, Buffer.input, index, size) 59 | c2.copy(r2+size, Buffer.input, index, sendtb=r2, recvtb=r1) 60 | 61 | XML() 62 | Check() 63 | 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 66 | parser.add_argument('instances', type=int, help='number of instances') 67 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 68 | 69 | args = parser.parse_args() 70 | 71 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) 72 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_ncv4.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
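# Allreduce intended for a 4-gpu NCv4-style machine using 2 chunks: ranks first reduce pairwise with their neighbor, exchange the partial results with the opposite pair through scratch and reduce again, and finally copy the fully reduced chunk back to the pair neighbor.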
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = 2 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | for chnk in range(chunksperloop): 17 | for r in range(size): 18 | if ((r % 2) == chnk): 19 | c = chunk(r, Buffer.input, chnk) 20 | c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk), sendtb=0, recvtb=0, ch=0) 21 | 22 | for r in range(size): 23 | if ((r % 2) == chnk): 24 | c = chunk(r, Buffer.input, chnk) 25 | c.copy((r+2) % size, 'scratch', chnk, sendtb=1, recvtb=1, ch=0) 26 | 27 | for r in range(size): 28 | if ((r % 2) == chnk): 29 | c = chunk(r, Buffer.input, chnk) 30 | c.reduce(chunk(r, 'scratch', chnk), sendtb=1, recvtb=1, ch=0) 31 | 32 | for r in range(size): 33 | if ((r % 2) == chnk): 34 | c = chunk(r, Buffer.input, chnk) 35 | c.copy(r + 1 - 2 * chnk, Buffer.input, chnk, sendtb=2, recvtb=2, ch=1) 36 | 37 | XML() 38 | Check() 39 | 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 42 | parser.add_argument('instances', type=int, help='number of instances') 43 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 44 | 45 | args = parser.parse_args() 46 | 47 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_ncv4_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
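# Chain-based variant for NCv4: each of the two chunks is reduced along a fixed 4-gpu chain (tree_algo) and then copied back down the chain in reverse, with the two chains oriented differently to spread load across links.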
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def tree_algo(tree, chnk, size): 10 | for i in range(size-1): 11 | nextNghr = tree[i+1] 12 | curNode = tree[i] 13 | c = chunk(nextNghr, Buffer.input, chnk) 14 | c.reduce(chunk(curNode, Buffer.input, chnk), sendtb=2*chnk, recvtb=2*chnk, ch=chnk) 15 | for i in range(size-1): 16 | curNode = tree[size-1-i] 17 | nextNghr = tree[size-1-i-1] 18 | c = chunk(curNode, Buffer.input, chnk) 19 | c.copy(nextNghr, Buffer.input, chnk, sendtb=2*chnk+1, recvtb=2*chnk+1, ch=chnk) 20 | 21 | def allreduce_allpairs(gpus, instances, protocol): 22 | size = gpus 23 | chunksperloop = 2 24 | topology = fully_connected(size) 25 | collective = AllReduce(size, chunksperloop, True) 26 | with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol, 27 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 28 | tree_algo([3,2,1,0], 0, size) 29 | tree_algo([2,3,0,1], 1, size) 30 | 31 | XML() 32 | Check() 33 | 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 36 | parser.add_argument('instances', type=int, help='number of instances') 37 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 38 | 39 | args = parser.parse_args() 40 | 41 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_pcie_hierarchical.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from msccl.language import * 3 | from msccl.topologies import * 4 | from msccl.language.collectives import AllReduce 5 | 6 | def allpairs_reduce_scatter(gpuIds, size, offset): 7 | ngpus = len(gpuIds) 8 | 9 | # Each rank sends the nth chunk to the nth rank into scratch space 10 | for r1 in range(ngpus): 11 | for r2 in range(ngpus): 12 | if gpuIds[r1] != gpuIds[r2]: 13 | index = offset + r2 * size 14 | c = chunk(gpuIds[r1], Buffer.input, index, size=size) 15 | c.copy(gpuIds[r2], 'scratch', sendtb=gpuIds[r2], recvtb=gpuIds[r1]) 16 | 17 | # Each rank performs a local reduction on the nth chunk 18 | # Utilize 8 threadblocks for this reduction for better parallelism 19 | for r in range(ngpus): 20 | for index in range(0, size * (ngpus-1)): 21 | c = chunk(gpuIds[r], Buffer.input, offset + r*size + (index % size)) 22 | c.reduce(chunk(gpuIds[r], 'scratch', index), sendtb=(index % size)) 23 | 24 | 25 | def allpairs_all_gather(gpuIds, size, offset): 26 | ngpus = len(gpuIds) 27 | 28 | # Each rank sends its nth chunk to all other gpus 29 | for r1 in range(ngpus): 30 | for r2 in range(ngpus): 31 | if r1 != r2: 32 | index = offset + r1 * size 33 | c = chunk(gpuIds[r1], Buffer.input, index, size) 34 | c.copy(gpuIds[r2], Buffer.input, index, sendtb=gpuIds[r2], recvtb=gpuIds[r1]) 35 | 36 | # Performs two levels of allReduce 37 | def hierarchical_allreduce(gpus, instances, protocol): 38 | ncols = 2 39 | nrows = gpus // ncols 40 | chunkperloop = gpus * gpus 41 | topology = fully_connected(gpus) 42 | collective = AllReduce(gpus, chunkperloop, True) 43 | 44 | with MSCCLProgram("hierarchical_allreduce", topology, collective, instances, protocol=protocol, 45 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 46 | 47 | # A 4 x 3 GPU 
arrangement: 4 local GPUs, 3 instances, GPU Ids are numbered as such 48 | # 0 4 8 49 | # 1 5 9 50 | # 2 6 10 51 | # 3 7 11 52 | # Reduce-Scatter on each column first, assumption being GPUs in a column have faster connectivity - NVLINK 53 | # Each GPU exchanges (nrows - 1) * 1/rows of data with other GPUs in the same column 54 | # After this step, the first GPU in each column will have the 1st 1/nrows of the data reduced, the 2nd GPU the 2nd 1/nrows, and so on 55 | size = chunkperloop // nrows 56 | offset = 0 57 | for n in range(ncols): 58 | gpuIds = [] 59 | for m in range(nrows): # collect all GPU Ids in a column 60 | gpuIds.append( n * nrows + m) 61 | 62 | allpairs_reduce_scatter(gpuIds, size, 0) 63 | 64 | # Reduce-Scatter across rows, assumption being GPUs in a row have slower connectivity - PCIe, IP NW 65 | # Each GPU exchanges (1 / rows * cols) * (cols - 1) of data with other GPUs in the same row - less data is exchanged 66 | # After this step, the first GPU in each row will have the 1st 1/(nrows * ncols) of the data, the 2nd GPU the 2nd 1/(nrows * ncols), and so on 67 | offset = size 68 | size = chunkperloop // (nrows * ncols) 69 | for n in range(nrows): 70 | gpuIds = [] 71 | for m in range(ncols): 72 | gpuIds.append(n + m * nrows) 73 | 74 | allpairs_reduce_scatter(gpuIds, size, offset * n) 75 | 76 | # AllGather: AllGather phase goes in reverse order, first gather across rows of GPUs 77 | # After this step, each GPU in a row has 1/ncols of the data 78 | for n in range(nrows): 79 | gpuIds = [] 80 | for m in range(ncols): 81 | gpuIds.append(n + m * nrows) 82 | 83 | allpairs_all_gather(gpuIds, size, offset * n) 84 | 85 | # AllGather: AllGather phase goes in reverse order, 2nd AllGather across columns of GPUs 86 | # After this step, each GPU in the system will have the completely reduced data 87 | size = chunkperloop // nrows 88 | offset = 0 89 | for n in range(ncols): 90 | gpuIds = [] 91 | for m in range(nrows): 92 | gpuIds.append( n * nrows + m) 93 | 94 | allpairs_all_gather(gpuIds, size, 0) 95 | 96 | XML() 97 | Check() 98 | 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 101 | parser.add_argument('instances', type=int, help='number of instances') 102 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 103 | 104 | args = parser.parse_args() 105 | 106 | hierarchical_allreduce(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_recursive_doubling_halving.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
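# Recursive doubling performs the reduce-scatter phase by exchanging chunks between partners 1, 2 and 4 ranks apart while halving the per-exchange count; recursive halving then runs the same exchanges in reverse as the allgather phase. The `ways` argument runs up to three such schedules in parallel on disjoint logical chunks.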
3 | 4 | # Halving-doubling implementation of allreduce 5 | 6 | import argparse 7 | 8 | from msccl.language import * 9 | from msccl.topologies import * 10 | from msccl.language.collectives import AllReduce 11 | 12 | 13 | def allreduce(ways, instances, protocol): 14 | topology = fully_connected(8) 15 | size = topology.num_nodes() # Number of gpus 16 | logical_chunk = 8 * ways 17 | collective = AllReduce(size, logical_chunk, True) 18 | with MSCCLProgram("allreduce_a100_recursive_doubling_halving", topology, collective, instances, protocol, interleaved_replication=False): 19 | # One reduction between pairs of gpus that are `pairs` apart, over `count` chunks 20 | def recursive_doubling(pairs, count, next_index, lc, sendtb, recvtb): 21 | current_index = next_index.copy() 22 | for r in range(size): 23 | next = r ^ pairs 24 | offset = (count if r <= next else 0) 25 | next_index[next] += offset 26 | # Split the reduce into two separate reduces to enable fused instructions 27 | block = 2 ** pairs 28 | for x in range(count): 29 | index = current_index[r] + offset + lc*8 + x 30 | c1 = chunk(r, Buffer.input, index) 31 | c = chunk(next, Buffer.input, index) 32 | c.reduce(c1, sendtb=sendtb, recvtb=recvtb) 33 | 34 | 35 | # Propagates reduced chunks in reverse order 36 | def recursive_halving(pairs, count, next_index, lc, sendtb, recvtb): 37 | current_index = next_index.copy() 38 | for r in range(size): 39 | next = r ^ pairs 40 | offset = (count if r > next else 0) 41 | next_index[r] -= offset 42 | index = current_index[r] + lc*8 43 | c = chunk(r, Buffer.input, index, count) 44 | c.copy(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) 45 | 46 | next_index = [0] * 8 47 | recursive_doubling(1, 4, next_index, 0, 0, 1) 48 | recursive_doubling(2, 2, next_index, 0, 1, 2) 49 | recursive_doubling(4, 1, next_index, 0, 2, 3) 50 | 51 | recursive_halving(4, 1, next_index, 0, 2, 3) 52 | recursive_halving(2, 2, next_index, 0, 1, 2) 53 | recursive_halving(1, 4, next_index, 0, 0, 1) 54 | 55 | if ways > 1: 56 | next_index = [0] * 8 57 | lc = 1 58 | recursive_doubling(4, 4, next_index, lc, 8, 9) 59 | recursive_doubling(2, 2, next_index, lc, 9, 10) 60 | recursive_doubling(1, 1, next_index, lc, 10, 11) 61 | 62 | recursive_halving(1, 1, next_index, lc, 10, 11) 63 | recursive_halving(2, 2, next_index, lc, 9, 10) 64 | recursive_halving(4, 4, next_index, lc, 8, 9) 65 | 66 | if ways > 2: 67 | next_index = [0] * 8 68 | lc = 2 69 | recursive_doubling(2, 4, next_index, lc, 4, 5) 70 | recursive_doubling(1, 2, next_index, lc, 5, 6) 71 | recursive_doubling(4, 1, next_index, lc, 6, 7) 72 | 73 | 74 | recursive_halving(4, 1, next_index, lc, 6, 7) 75 | recursive_halving(1, 2, next_index, lc, 5, 6) 76 | recursive_halving(2, 4, next_index, lc, 4, 5) 77 | 78 | 79 | XML() 80 | Check() 81 | 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument('ways', type=int, help='number of parallel trees to perform reduction min:1 max:3') 84 | parser.add_argument('instances', type=int, help='number of instances') 85 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL') 86 | args = parser.parse_args() 87 | assert args.ways >= 1 and args.ways <= 3 88 | allreduce(args.ways, args.instances, args.protocol) 89 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | # Ring all reduce for A100s 10 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 11 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel 12 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data 13 | def allreduce_ring(size, instances, channels, protocol): 14 | topology = fully_connected(size) 15 | collective = AllReduce(size, size, True) 16 | with MSCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, 17 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 18 | # Reduce ring 19 | for step in range(0, size-1): 20 | for index in range(0, size): 21 | rank = (index + step) % size 22 | next_rank = (index + step + 1) % size 23 | channel = index%channels 24 | c = chunk(next_rank, Buffer.input, index) 25 | c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel) 26 | # Propagate ring 27 | for step in range(-1, size-2): 28 | for index in range(0, size): 29 | rank = (index + step) % size 30 | c = chunk(rank, Buffer.input, index) 31 | next_rank = (index + step + 1) % size 32 | channel = index%channels 33 | c = c.copy(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) 34 | 35 | XML() 36 | Check() 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 40 | parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') 41 | parser.add_argument('instances', type=int, help='number of instances') 42 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 43 | args = parser.parse_args() 44 | 45 | allreduce_ring(args.num_gpus, args.instances, args.channels, args.protocol) 46 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_binomial_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
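# Each tree is a reduce to the root followed by a broadcast back down; with trees=2 a mirrored tree rooted at rank N-1 handles the second chunk, balancing load across links (see the mirrored-tree reference above).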
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | # Binomial tree and mirrored binomial tree 10 | # Mirrored trees adopted from: http://algo2.iti.kit.edu/documents/2tree.pdf 11 | def allreduce_binomial_tree(size, instances, trees, protocol): 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, trees, True) 14 | with MSCCLProgram("allreduce_binomial_tree", topology, collective, instances, protocol=protocol): 15 | distance = 1 16 | # Reduce tree - reducing onto Rank 0 17 | while distance <= size // 2: 18 | # Reduce onto the left neighbor that is distance away 19 | for rank in range(0, size, distance*2): 20 | peer = rank + distance 21 | c1 = chunk(peer, Buffer.input, 0) 22 | chunk(rank, Buffer.input, 0).reduce(c1) 23 | distance *= 2 24 | # Broadcast tree - root is Rank 0 25 | distance = distance // 2 26 | while distance >= 1: 27 | # Copy to the right neighbor that is distance away 28 | for rank in range(0, size, distance*2): 29 | peer = rank + distance 30 | chunk(rank, Buffer.input, 0).copy(peer, Buffer.input, 0) 31 | distance = distance // 2 32 | 33 | # Mirrored version of the tree 34 | # Reduce tree - reducing onto Rank N-1 35 | if trees == 2: 36 | distance = 1 37 | while distance <= size // 2: 38 | # Reduce onto the right neighbor that is distance away 39 | for rank in range(size-1, 0, -distance*2): 40 | peer = rank - distance 41 | c1 = chunk(peer, Buffer.input, 1) 42 | chunk(rank, Buffer.input, 1).reduce(c1) 43 | distance *= 2 44 | # Broadcast tree - root is Rank N-1 45 | distance = distance // 2 46 | while distance >= 1: 47 | # Copy to the left neighbor that is distance away 48 | for rank in range(size-1, 0, -distance*2): 49 | peer = rank - distance 50 | chunk(rank, Buffer.input, 1).copy(peer, Buffer.input, 1) 51 | distance = distance // 2 52 | 53 | XML() 54 | Check() 55 | 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 58 | parser.add_argument('trees', type=int, choices=[1, 2], help ='number of trees') 59 | parser.add_argument('instances', type=int, help ='number of instances') 60 | 61 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 62 | args = parser.parse_args() 63 | allreduce_binomial_tree(args.num_gpus, args.instances, args.trees, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_dgx1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
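# [Editor's note -- not part of the original file.] The program below threads a ring
# through each DGX-1 node using the NVLink-friendly local order [1, 3, 2, 6, 7, 5, 4, 0],
# so rank(n, g) = local_ring_order[g] + n * 8. For example rank(0, 0) == 1 and
# rank(1, 7) == 8: each node's ring starts at local gpu 1 and ends at local gpu 0.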
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies.distributed import * 8 | from msccl.topologies.nvidia import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | def allreduce(num_nodes, instances): 12 | local_topology = dgx1() 13 | num_local_gpus = 8 14 | remote_bw = 1 15 | topology = distributed_fully_connected(local_topology, num_nodes, remote_bw) 16 | size = topology.num_nodes() 17 | collective = AllReduce(size, 1, True) 18 | local_ring_order = [1,3,2,6,7,5,4,0] 19 | 20 | def rank(n, g): 21 | return local_ring_order[g] + n * num_local_gpus 22 | 23 | with MSCCLProgram("allreduce_ring_dgx1", topology, collective, instances): 24 | 25 | # Chunks travel around the local rings being reduced (num_local_gpus-1 hops), starting at local gpu 1 26 | # At the end the most reduced chunk ends up on local gpu 0 of each node 27 | for n in range(num_nodes): 28 | r = rank(n, 0) # Start at local gpu 1 (index 0 in local_ring_order) 29 | c = chunk(r, Buffer.input, 0) 30 | for g in range(1, 8): 31 | c = c.reduce(rank(n,g), Buffer.input, 0) 32 | 33 | # At this point gpu0 and gpu8 have the two most reduced chunks 34 | # 1 IB send to fully reduce chunk + 1 IB send to update other node 35 | 36 | chunk(0, Buffer.input, 0).send(9, Buffer.input, 0) 37 | chunk(8, Buffer.input, 0).send(1, Buffer.input, 0).reduce(0, Buffer.input, 0) 38 | chunk(9, Buffer.input, 0).reduce(8, Buffer.input, 0) 39 | 40 | # Propagate the fully reduced chunks going backwards around the ring 41 | for n in range(num_nodes): 42 | r = rank(n, 7) 43 | c = chunk(r, Buffer.input, 0) 44 | for g in range(6, -1, -1): 45 | next = rank(n, g) 46 | c = c.send(next, Buffer.input, 0) 47 | 48 | XML() 49 | Check() 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('num_nodes', type=int, help='number of nodes') 53 | parser.add_argument('instances', type=int, help='number of instances') 54 | args = parser.parse_args() 55 | 56 | assert args.num_nodes == 2, "Only works for 2 nodes right now" 57 | 58 | allreduce(args.num_nodes, args.instances) 59 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_ndv2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
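# [Editor's note -- not part of the original file.] The schedule below is
# socket-hierarchical for an 8-GPU NDv2 machine: a reduce-scatter within each
# 4-GPU socket, a cross-socket reduce followed by a copy back (placed on
# channel 1 so it lands in a different threadblock than the reduce), and
# finally an all-gather within each socket.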
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies.distributed import * 7 | from msccl.topologies.nvidia import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | def allreduce(instances): 12 | size = 8 13 | num_local_gpus = size 14 | topology = fully_connected(size) 15 | # size = topology.num_nodes() # Number of gpus 16 | logical_chunk = size 17 | collective = AllReduce(size, logical_chunk, True) 18 | with MSCCLProgram("allreduce_ndv2", topology, collective, instances, interleaved_replication=False): 19 | 20 | # local reduce_scatter 21 | for lc in range(num_local_gpus//2): 22 | for r in range(num_local_gpus): 23 | if lc == (r % (num_local_gpus//2)): 24 | continue 25 | within_socket_nghr = lc + (4 if (r >= num_local_gpus//2) else 0) 26 | index = lc * 2 27 | c = chunk(r, Buffer.input, index, 2) 28 | c.reduce(within_socket_nghr, buffer=Buffer.input, index=index) 29 | # cross-socket reduce_scatter 30 | for r in range(num_local_gpus): 31 | index = (r % (num_local_gpus//2)) * 2 32 | if r >= num_local_gpus // 2: 33 | index += 1 # Handle the odd chunk 34 | lc = chunk(r, Buffer.input, index) 35 | lc = lc.reduce((r+num_local_gpus//2) % num_local_gpus, buffer=Buffer.input, index=index) 36 | lc.send(r, Buffer.input, index, ch=1) # Reduce and send should be on different tbs 37 | # local all_gather 38 | for r in range(num_local_gpus): 39 | index = (r % (num_local_gpus//2)) * 2 40 | lc = chunk(r, Buffer.input, index, 2) 41 | for t in range(num_local_gpus//2): 42 | local_nghr = t + (num_local_gpus//2 if (r >= num_local_gpus//2) else 0) 43 | if local_nghr == r: 44 | continue 45 | lc.send(local_nghr, buffer=Buffer.input, index=index) 46 | XML() 47 | Check() 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument('instances', type=int, help='number of instances') 51 | args = parser.parse_args() 52 | allreduce(args.instances) 53 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_recursive_doubling_halving.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
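# [Editor's sketch -- not part of the original file.] In the vector-halving /
# distance-doubling reduce-scatter below, each rank pairs with rank ^ count and
# reduces the half of the vector owned by its peer. This standalone helper traces
# the (rank, peer, index, count) tuples for a hypothetical power-of-two `size`.
def reduce_scatter_trace(size):
    steps = []
    count = size // 2
    while count >= 1:
        for rank in range(size):
            peer = rank ^ count
            index = (peer // count) * count
            steps.append((rank, peer, index, count))
        count //= 2
    return steps

# e.g. for size=4 the first round pairs ranks (0,2), (1,3), (2,0), (3,1), each
# reducing a 2-chunk half of the buffer; the next round works on single chunks.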
3 | 4 | 5 | import argparse 6 | 7 | from msccl.language import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | def reduce_scatter_vector_halving_distance_doubling(size): 12 | count = size // 2 13 | while count >= 1: 14 | for rank in range(size): 15 | peer = rank ^ count 16 | index = ((peer // count) * count) 17 | c1 = chunk(rank, Buffer.input, index, size=count) 18 | chunk(peer, Buffer.output, index, size=count).reduce(c1, sendtb=peer, recvtb=rank, ch=0) 19 | count //= 2 20 | 21 | def allgather_recursive_vector_doubling_distance_halving(size): 22 | count = 1 23 | while count < size: 24 | for rank in range(size): 25 | peer = rank ^ count 26 | index = ((rank // count) * count) 27 | chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank, ch=0) 28 | count *= 2 29 | 30 | def allreduce(size, instances, protocol): 31 | topology = fully_connected(size) 32 | logical_chunk = size 33 | collective = AllReduce(size, logical_chunk, True) 34 | with MSCCLProgram("allreduce_recursive_doubling_halving", topology, collective, instances, protocol, 35 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual): 36 | reduce_scatter_vector_halving_distance_doubling(size) 37 | allgather_recursive_vector_doubling_distance_halving(size) 38 | XML() 39 | Check() 40 | 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 43 | parser.add_argument('instances', type=int, help='number of instances') 44 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL') 45 | args = parser.parse_args() 46 | allreduce(args.num_gpus, args.instances, args.protocol) 47 | -------------------------------------------------------------------------------- /examples/mscclang/alltoall_a100_two_step.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from msccl.language import * 4 | from msccl.topologies import * 5 | from msccl.language.collectives import AllToAll 6 | 7 | 8 | def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): 9 | num_ranks = num_nodes * gpus_per_node 10 | topology = fully_connected(num_ranks) 11 | collective = AllToAll(num_ranks, 1, inplace=False) 12 | 13 | 14 | with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol): 15 | for n1 in range(num_nodes): 16 | for r in range(1,num_nodes): 17 | n2 = (n1 + r) % num_nodes 18 | 19 | # Gather all local chunks for the node neighbor 20 | for g1 in range(gpus_per_node): 21 | rank1 = n1 * gpus_per_node + g1 22 | 23 | for g2 in range(gpus_per_node): 24 | rank2 = n1 * gpus_per_node + g2 25 | # chunk to copy: g2 on n2 26 | index = n2 * gpus_per_node + g2 27 | c = chunk(rank1, Buffer.input, index) 28 | c = c.copy(rank2, f'copy_{n2}') 29 | 30 | for r in range(1,num_nodes): 31 | n2 = (n1 + r) % num_nodes 32 | # IB copy 33 | for g1 in range(gpus_per_node): 34 | rank = n1 * gpus_per_node + g1 35 | ib_peer = n2 * gpus_per_node + g1 36 | c = chunk(rank, f'copy_{n2}', 0, gpus_per_node) 37 | c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % gpus_per_node)*2+(rank%2)+2) 38 | 39 | 40 | # Handle local chunks within a node 41 | for rank in range(num_ranks): 42 | for g in range(gpus_per_node): 43 | index = (rank // gpus_per_node) * gpus_per_node + g 44 | c = chunk(rank, Buffer.input, index) 45 | c.copy(c.get_dst_rank(), Buffer.output, 
c.get_dst_index()) 46 | 47 | XML() # Prints the XML 48 | Check() 49 | 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('num_nodes', type=int, help ='number of nodes') 53 | parser.add_argument('gpus_per_node', type=int, help ='gpus per node') 54 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 55 | args = parser.parse_args() 56 | 57 | 58 | alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.protocol) 59 | -------------------------------------------------------------------------------- /examples/mscclang/alltoall_allpairs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from msccl.language import * 4 | from msccl.topologies import * 5 | from msccl.language.collectives import AllToAll 6 | 7 | # One-step AllToAll program 8 | # Each gpu sends a chunk to, and receives a chunk from, every other gpu 9 | 10 | def alltoall(num_ranks, instances, protocol): 11 | topology = fully_connected(num_ranks) 12 | collective = AllToAll(num_ranks, 1, inplace=False) 13 | 14 | with MSCCLProgram("alltoall_allpairs", topology, collective, instances=instances, protocol=protocol): 15 | for r in range(num_ranks): 16 | for index in range(num_ranks): 17 | chunk(r, Buffer.input, index).copy(index, Buffer.output, r) 18 | XML() 19 | Check() 20 | 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 24 | parser.add_argument('instances', type=int, help ='number of instances') 25 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 26 | args = parser.parse_args() 27 | 28 | alltoall(args.num_gpus, args.instances, args.protocol) 29 | -------------------------------------------------------------------------------- /examples/mscclang/alltonext_backward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
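# [Editor's note -- not part of the original file.] This "all-to-next" collective
# shifts every rank's buffer to rank-1; with `size` ranks and `chunks` chunks per
# rank its contract is roughly:
#   before: input[r][c] holds chunk (r, c)     for r in 1..size-1
#   after:  output[r][c] holds chunk (r+1, c)  for r in 0..size-2
# Cross-node hops are split into scatter / IB-copy / gather steps so the single
# inter-node link is shared by all local GPUs.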
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies.distributed import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import Collective 10 | 11 | class Pipeline(Collective): 12 | def init_buffers(self): 13 | chunks_per_node = self.chunk_factor 14 | rank_buffers = [] 15 | for r in range(self.num_ranks): 16 | input_buffer = [None] * chunks_per_node 17 | output_buffer = [None] * chunks_per_node 18 | if r != 0: 19 | for c in range(chunks_per_node): 20 | input_buffer[c] = Chunk(r, c, r-1, c) 21 | buffers = {Buffer.input : input_buffer, 22 | Buffer.output : output_buffer} 23 | rank_buffers.append(buffers) 24 | return rank_buffers 25 | 26 | 27 | # Final state chunks on rank(i) end up on rank(i-1) 28 | def check(self, prog): 29 | correct = True 30 | for r in range(0, self.num_ranks-1): 31 | output = prog.buffers[r][Buffer.output] 32 | for c in range(self.chunk_factor): 33 | chunk = output[c] 34 | if chunk is None or chunk.origin_rank != r+1 or chunk.origin_index != c: 35 | print(f'Rank {r} chunk {c} is incorrect should be ({r+1}, {c}) given {chunk}') 36 | correct = False 37 | return correct 38 | 39 | 40 | def pipeline(num_nodes, instances): 41 | num_local_gpus = 8 42 | chunks = num_local_gpus 43 | chunk_factor = chunks 44 | remote_bw = 1 45 | size = num_local_gpus * num_nodes 46 | topology = fully_connected(size) 47 | collective = Pipeline(size, chunk_factor, False) 48 | 49 | def rank(node, local_rank): 50 | return node * num_local_gpus + local_rank 51 | 52 | with MSCCLProgram("alltonext-backwards", topology, collective, instances): 53 | 54 | for n in range(num_nodes): 55 | for g in range(num_local_gpus): 56 | r = rank(n, g) 57 | 58 | # Do nothing for first gpu - end of pipeline 59 | if r == 0: 60 | continue 61 | 62 | # Cross node copy - cooperative 63 | if g == 0: 64 | for ch in range(chunks): 65 | c = chunk(r, Buffer.input, ch) 66 | if ch == 0: 67 | # 2 steps: IB copy to (node-1, g) then gather onto (node-1, num_local_gpus-1) 68 | c = c.copy(rank(n-1, ch), f's{n}->{n-1}', 0, ch=ch%2) 69 | elif ch == num_local_gpus-1: 70 | # 2 steps: Scatter - copy to (node, num_local_gpus-1), IB copy to (node-1, num_local_gpus-1) 71 | c = c.copy(rank(n, ch), f's{n}->{n-1}', 0, ch=ch%2) 72 | else: 73 | # 3 steps: Scatter - copy to (node, g), IB copy to (node-1, g), gather onto (node-1, num_local_gpus-1) 74 | c = c.copy(rank(n, ch), f's{n}->{n-1}', 0, ch=ch%2) 75 | c = c.copy(rank(n-1, ch), f's{n}->{n-1}', 0, ch=ch%2) 76 | c.copy(r-1, Buffer.output, c.get_dst_index(), ch=ch%2) 77 | 78 | # Normal copy - directly 79 | else: 80 | c = chunk(r, Buffer.input, 0, chunks) 81 | c.copy(r-1, Buffer.output, 0, ch=g%2) 82 | 83 | Check() 84 | XML() 85 | 86 | if __name__ == '__main__': 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('num_nodes', type=int, help ='number of nodes') 89 | parser.add_argument('instances', type=int, help ='number of instances') 90 | 91 | args = parser.parse_args() 92 | 93 | pipeline(args.num_nodes, args.instances) -------------------------------------------------------------------------------- /examples/mscclang/alltonext_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies.distributed import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import Collective 10 | 11 | class Pipeline(Collective): 12 | def init_buffers(self): 13 | chunks_per_node = self.chunk_factor 14 | rank_buffers = [] 15 | for r in range(self.num_ranks): 16 | input_buffer = [None] * chunks_per_node 17 | output_buffer = [None] * chunks_per_node 18 | if r != self.num_ranks -1: 19 | for c in range(chunks_per_node): 20 | # Chunk(starting rank, starting index, ending rank, ending index) 21 | input_buffer[c] = Chunk(r, c, r+1, c) 22 | buffers = {Buffer.input : input_buffer, 23 | Buffer.output : output_buffer} 24 | rank_buffers.append(buffers) 25 | return rank_buffers 26 | 27 | 28 | # Final state chunks on rank(i) end up on rank(i+1) 29 | def check(self, prog): 30 | correct = True 31 | for r in range(1, self.num_ranks): 32 | output = prog.buffers[r][Buffer.output] 33 | for c in range(self.chunk_factor): 34 | chunk = output[c] 35 | # Check we got the previous rank's chunks 36 | if chunk is None or chunk.origin_rank != r-1 or chunk.origin_index != c: 37 | print(f'Rank {r} chunk {c} is incorrect should be ({r-1}, {c}) given {chunk}') 38 | correct = False 39 | return correct 40 | 41 | 42 | def pipeline(num_nodes, instances): 43 | num_local_gpus = 8 44 | chunks = num_local_gpus 45 | total_chunks_per_loop = chunks 46 | remote_bw = 1 47 | size = num_local_gpus * num_nodes 48 | topology = fully_connected(size) 49 | collective = Pipeline(size, total_chunks_per_loop, True) 50 | 51 | def rank(node, local_rank): 52 | return node * num_local_gpus + local_rank 53 | 54 | with MSCCLProgram("alltonext-forward", topology, collective, instances): 55 | 56 | for n in range(num_nodes): 57 | for g in range(num_local_gpus): 58 | r = rank(n, g) 59 | 60 | # Do nothing for last gpu - end of pipeline 61 | if r == size - 1: 62 | continue 63 | 64 | # Cross node copy - cooperative 65 | if g == num_local_gpus -1: 66 | for ch in range(chunks): 67 | c = chunk(r, Buffer.input, ch) 68 | if ch == 0: # 2 steps: Scatter - copy to (node, 0), IB copy to (node+1, 0) 69 | c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) 70 | 71 | elif ch == num_local_gpus-1: 72 | # 2 steps: IB copy to (node+1, g) then gather onto (node+1, 0) 73 | c = c.copy(rank(n+1, ch), f's{n}->{n+1}', 0, ch=ch%2) 74 | else: 75 | # 3 steps: Scatter - copy to (node, g), IB copy to (node+1, g), gather onto (node+1, 0) 76 | c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) 77 | c = c.copy(rank(n+1, ch), f's{n}->{n+1}', 0, ch=ch%2) 78 | 79 | c.copy(r+1, Buffer.output, c.get_dst_index(), ch=ch%2) 80 | 81 | # Normal copy - directly 82 | else: 83 | c = chunk(r, Buffer.input, 0, chunks) 84 | c.copy(r+1, Buffer.output, 0, ch=g%2) 85 | 86 | Check() 87 | XML() 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('num_nodes', type=int, help ='number of nodes') 92 | parser.add_argument('instances', type=int, help ='number of instances') 93 | 94 | args = parser.parse_args() 95 | 96 | pipeline(args.num_nodes, args.instances) -------------------------------------------------------------------------------- /examples/mscclang/hierarchical_allreduce.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
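# [Editor's note -- not part of the original file.] The schedule below is the
# BlueConnect decomposition: four ring phases chained together.
#   1. reduce-scatter inside each node  (rings of num_local_gpus ranks)
#   2. reduce-scatter across nodes      (rings of num_nodes ranks, one per local gpu)
#   3. all-gather across nodes          (reverse of 2)
#   4. all-gather inside each node      (reverse of 1)
# With schedule='manual' each phase is pinned to explicit channels; with 'auto'
# the channel assignment is left to MSCCLang.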
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.language.collectives import AllReduce 9 | 10 | # Blue Connect style AllReduce https://proceedings.mlsys.org/paper/2019/file/9b8619251a19057cff70779273e95aa6-Paper.pdf 11 | # Assumes only two-level switches 12 | 13 | def ring_reduce_scatter(size, rank_offset=0, rank_step=1, local_chunk_size=1, chunk_offset=0, chunk_stride=1, chan=-1): 14 | for ch in range(0, size): 15 | index = ch * chunk_stride * local_chunk_size + chunk_offset 16 | for step in range(0, size-1): 17 | other = chunk(((step+1+ch) % size)*rank_step +rank_offset, Buffer.input, index, local_chunk_size) 18 | c = chunk(((step+2+ch) % size)*rank_step+rank_offset, Buffer.input, index, local_chunk_size) 19 | c.reduce(other, ch=chan) 20 | 21 | def ring_all_gather(size, rank_offset=0, rank_step=1, local_chunk_size=1, chunk_offset=0, chunk_stride=1, chan=-1): 22 | for ch in range(0, size): 23 | index = ch * chunk_stride * local_chunk_size + chunk_offset 24 | for step in range(0, size-1): 25 | c = chunk(((step+ch) % size)*rank_step + rank_offset, Buffer.input, index, local_chunk_size) 26 | c.copy(((step+1+ch) % size)*rank_step + rank_offset, Buffer.input, index, ch=chan) 27 | 28 | def hierarchical_allreduce(num_local_gpus, num_nodes, instances, protocol, schedule): 29 | num_gpus = num_local_gpus * num_nodes 30 | topology = fully_connected(num_gpus) 31 | collective = AllReduce(num_gpus, num_gpus, True) 32 | 33 | with MSCCLProgram("hierarchical_allreduce", topology, collective, instances, protocol=protocol, 34 | interleaved_replication=False): 35 | 36 | local_chunk_size = num_nodes 37 | if schedule == 'auto': 38 | for n in range(num_nodes): 39 | for offset in range(num_nodes): 40 | ring_reduce_scatter(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes) 41 | 42 | # Cross node Reduce-Scatter 43 | for g in range(num_local_gpus): 44 | ring_reduce_scatter(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes) 45 | 46 | # Cross node All-gather 47 | for g in range(num_local_gpus): 48 | ring_all_gather(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes) 49 | 50 | 51 | # All gather within each node 52 | for n in range(num_nodes): 53 | for offset in range(num_nodes): 54 | ring_all_gather(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes) 55 | 56 | else: 57 | # Reduce Scatter within each node 58 | for n in range(num_nodes): 59 | for offset in range(num_nodes): 60 | ring_reduce_scatter(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes, chan=offset) 61 | 62 | # Cross node Reduce-Scatter 63 | for g in range(num_local_gpus): 64 | ring_reduce_scatter(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes, chan=g%2+num_nodes*2) 65 | 66 | # Cross node All-gather 67 | for g in range(num_local_gpus): 68 | ring_all_gather(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes, chan=g%2+num_nodes*2) 69 | 70 | 71 | # All gather within each node 72 | for n in range(num_nodes): 73 | for offset in range(num_nodes): 74 | ring_all_gather(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes, chan=offset+num_nodes) 75 | 76 | XML() 77 | Check() 78 | 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('num_gpus', type=int, help='number of gpus per node') 81 | parser.add_argument('num_nodes', 
type=int, help='number of nodes') 82 | parser.add_argument('instances', type=int, help='number of instances') 83 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL128', 'LL'], help='Protocol') 84 | parser.add_argument('--schedule', type=str, default='auto', choices=['auto', 'manual'], help='Scheduling') 85 | 86 | args = parser.parse_args() 87 | 88 | hierarchical_allreduce(args.num_gpus, args.num_nodes, args.instances, args.protocol, args.schedule) 89 | 90 | -------------------------------------------------------------------------------- /examples/mscclang/pipeline_a100_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus * gpus 12 | topology = fully_connected(2*size) 13 | collective = AllReduce(2*size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | 17 | # Each rank sends the nth chunk to the nth rank into scratch space 18 | for r1 in range(size): 19 | for r2 in range(size): 20 | if r1 != r2: 21 | index = r2 * size 22 | c = chunk(r1, Buffer.input, index, size=size) 23 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 24 | 25 | # Each rank performs a local reduction on the nth chunk 26 | # Utilize 8 threadblocks for this reduction for better parallelism 27 | for r in range(size): 28 | for index in range(0, size * (size-1)): 29 | c = chunk(r, Buffer.input, r*size + (index % size)) 30 | c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 31 | c = chunk(r, Buffer.input, r*size, size=size) 32 | c.copy(r+size, Buffer.input, r*size, ch=r%2) 33 | 34 | # Each rank sends the fully reduced nth chunk to all other gpus 35 | for r1 in range(size): 36 | for r2 in range(size): 37 | if r1 != r2: 38 | index = r1 * size 39 | c = chunk(r1+size, Buffer.input, index, size) 40 | c.copy(r2+size, Buffer.input, index, sendtb=r2, recvtb=r1) 41 | 42 | XML() 43 | #Check() 44 | 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 47 | parser.add_argument('instances', type=int, help='number of instances') 48 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 49 | 50 | args = parser.parse_args() 51 | 52 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/pipeline_a100_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | # Ring all reduce for A100s 10 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
11 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel 12 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data 13 | def allreduce_ring(size, instances, channels, protocol): 14 | topology = fully_connected(2*size) 15 | collective = AllReduce(2*size, size, True) 16 | with MSCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, 17 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 18 | # Reduce ring 19 | for step in range(0, size-1): 20 | for index in range(0, size): 21 | rank = (index + step) % size 22 | next_rank = (index + step + 1) % size 23 | channel = index%channels 24 | c = chunk(next_rank, Buffer.input, index) 25 | c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel) 26 | # Propagate ring 27 | for index in range(0, size): 28 | rank = (index - 1) % size 29 | c = chunk(rank, Buffer.input, index) 30 | c.copy(rank+size, Buffer.input, index, ch=rank%2) 31 | for step in range(-1, size-2): 32 | for index in range(0, size): 33 | rank = (index + step) % size 34 | c = chunk(rank+size, Buffer.input, index) 35 | next_rank = (index + step + 1) % size 36 | channel = index%channels 37 | c = c.copy(next_rank+size, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) 38 | 39 | XML() 40 | # Check() 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 44 | parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') 45 | parser.add_argument('instances', type=int, help='number of instances') 46 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 47 | args = parser.parse_args() 48 | 49 | allreduce_ring(args.num_gpus, args.instances, args.channels, args.protocol) 50 | -------------------------------------------------------------------------------- /examples/mscclang/reducegather.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
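# [Editor's note -- not part of the original file.] ReduceGather is a custom
# collective: ranks are partitioned into `groups`, each group reduces its inputs
# into one chunk per member (a per-group reduce-scatter), and a ring all-gather
# then leaves every rank holding all num_ranks reduced chunks in its output buffer.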
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.language.collectives import Collective 9 | 10 | class ReduceGather(Collective): 11 | def __init__(self, num_ranks, chunk_factor, inplace, groups): 12 | Collective.__init__(self, num_ranks, chunk_factor, inplace) 13 | self.groups = groups 14 | self.gpus_per_group = num_ranks // groups 15 | assert chunk_factor == 1, "Only supports chunk_factor == 1" 16 | 17 | def init_buffers(self): 18 | assert self.chunk_factor == 1 19 | rank_buffers = [] 20 | chunks_per_node = self.num_ranks 21 | for r in range(self.num_ranks): 22 | input_buffer = [None] * self.gpus_per_group 23 | output_buffer = [None] * chunks_per_node 24 | for c in range(self.groups): 25 | input_buffer[c] = Chunk(r, c, -1, c) 26 | buffers = {Buffer.input : input_buffer, 27 | Buffer.output : output_buffer} 28 | rank_buffers.append(buffers) 29 | return rank_buffers 30 | 31 | 32 | def check(self, prog): 33 | expected_chunks = [] 34 | for r in range(self.num_ranks): 35 | chunk = ReduceChunk([]) 36 | for x in range(self.groups): 37 | y = r // self.groups 38 | next = y * self.groups + x 39 | chunk = chunk.reduce(Chunk(next, r % self.gpus_per_group)) 40 | expected_chunks.append(chunk) 41 | 42 | correct = True 43 | for r in range(self.num_ranks): 44 | output = prog.buffers[r][Buffer.output] 45 | for c in range(self.num_ranks): 46 | chunk = output[c] 47 | if chunk is None or chunk != expected_chunks[c]: 48 | print(f'Rank {r} chunk {c} is incorrect should be {expected_chunks[c]} given {chunk}') 49 | correct = False 50 | return correct 51 | 52 | 53 | def program(num_ranks, groups, instances, protocol): 54 | gpus_per_group = num_ranks // groups 55 | topology = fully_connected(num_ranks) 56 | chunk_factor = 1 57 | inplace = False 58 | collective = ReduceGather(num_ranks, chunk_factor, inplace, groups) 59 | 60 | with MSCCLProgram("reduce-gather", topology, collective, instances, protocol, threadblock_policy=ThreadblockPolicy.manual): 61 | 62 | # Per group reduce scatter 63 | for y in range(groups): 64 | for x in range(gpus_per_group): 65 | output_index = y * groups + x 66 | input_index = x 67 | gpu = y * groups + (x+1) % gpus_per_group 68 | c = chunk(gpu, Buffer.input, input_index) 69 | # Use the input buffer to perform the reduction within each group 70 | for x_ in range(1, gpus_per_group): 71 | c = c.reduce(y * groups + (x + 1 + x_) % gpus_per_group, Buffer.input, input_index, sendtb=0, recvtb=0, ch=0) 72 | # Copy reduced chunk into the output buffer 73 | c = c.send(c.rank, Buffer.output, output_index, sendtb=0, recvtb=0, ch=0) 74 | 75 | 76 | # Ring Allgather 77 | for r in range(num_ranks): 78 | c = chunk(r, Buffer.output, r) 79 | next = (r + 1) % num_ranks 80 | while next != r: 81 | c = c.send(next, Buffer.output, r, sendtb=1, recvtb=1, ch=1) 82 | next = (next + 1) % num_ranks 83 | 84 | Check() 85 | XML() 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('num_ranks', type=int, help ='number of ranks') 90 | parser.add_argument('groups', type=int, help='number of reduction groups') 91 | parser.add_argument('--instances', type=int, default=1, help='number of instances') 92 | parser.add_argument('--protocol', type=str, default='Simple', 93 | choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol') 94 | args = parser.parse_args() 95 | 96 | assert args.num_ranks % args.groups == 0 97 | 98 | program(args.num_ranks, args.groups, args.instances, args.protocol) 99 | 
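# Example invocation (editor's note, not part of the original file; arguments as
# defined by the argparse above, using a square configuration where
# groups == num_ranks // groups):
#   python reducegather.py 4 2 --instances 1 --protocol Simple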
-------------------------------------------------------------------------------- /examples/mscclang/simple/allgather_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.language import * 5 | from msccl.topologies import * 6 | from msccl.language.collectives import AllGather 7 | 8 | def allgather_ring(size): 9 | topology = fully_connected(size) 10 | collective = AllGather(size, 1, False) 11 | with MSCCLProgram("allgather_ring", topology, collective, 1): 12 | # Loop over each chunk's root 13 | for r in range(size): 14 | # Get the chunk at rank r, input[r] 15 | c = chunk(r, Buffer.input, 0) 16 | # Copy chunk to the output buffer 17 | c = c.copy(r, buffer=Buffer.output, index=r, sendtb=0) 18 | 19 | next = (r + 1) % size 20 | while next != r: 21 | # For each rank in the ring, send the chunk to the next rank 22 | # Setting the sender's and receiver's tbs to the same value would let send/receives on the 23 | # same rank be merged into a receive-copy-send; here the assignment is left to MSCCLang 24 | c = c.copy(next, buffer=Buffer.output, index=r) 25 | next = (next + 1) % size 26 | XML() 27 | Check() 28 | 29 | def allgather_ring_inplace(size): 30 | topology = fully_connected(size) 31 | collective = AllGather(size, 1, True) 32 | with MSCCLProgram("allgather_ring", topology, collective, 1): 33 | # Loop over each chunk's root 34 | for r in range(size): 35 | # Get the chunk at rank r, input[r] 36 | c = chunk(r, Buffer.input, 0) 37 | 38 | next = (r + 1) % size 39 | while next != r: 40 | # For each rank in the ring, send the chunk to the next rank 41 | # Setting the sender's and receiver's tbs to the same value would let send/receives on the 42 | # same rank be merged into a receive-copy-send; here the assignment is left to MSCCLang 43 | c = c.copy(next, buffer=Buffer.output, index=r) 44 | next = (next + 1) % size 45 | XML() 46 | Check() 47 | 48 | allgather_ring(4) 49 | # allgather_ring_inplace(4) -------------------------------------------------------------------------------- /examples/mscclang/simple/allreduce_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
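# [Editor's sketch -- not part of the original file.] The in-place ring allreduce
# below reduces chunk r once around the ring and then copies the result around it
# again. This pure-Python helper traces which rank holds chunk r's partial sum
# after each reduce hop, for a hypothetical `size`.
def ring_reduce_path(size, r):
    path = [r]
    nxt = (r + 1) % size
    while nxt != r:
        path.append(nxt)
        nxt = (nxt + 1) % size
    return path  # chunk r is fully reduced once it reaches path[-1]

# ring_reduce_path(4, 1) == [1, 2, 3, 0]: chunk 1 is fully reduced on rank 0,
# which then forwards it around the ring to ranks 1, 2 and 3.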
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.collectives import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | 12 | def allreduce_ring(size, instances): 13 | # Logical topology 14 | topology = fully_connected(size) 15 | collective = AllReduce(size, size, inplace=True) 16 | 17 | with MSCCLProgram("allreduce_ring_inplace", topology, collective, instances): 18 | for r in range(size): 19 | index = r 20 | # (rank, buffer, index) 21 | c = chunk(r, Buffer.input, index) 22 | next = (r + 1) % size 23 | # Chunk travels around the ring being reduced 24 | while next != r: 25 | c1 = chunk(next, buffer=Buffer.input, index=r) 26 | # c1 += c 27 | c = c1.reduce(c) 28 | next = (next + 1) % size 29 | 30 | # Send the fully reduced chunk around the ring 31 | while next != (r - 1) % size: 32 | c = c.copy(next, buffer=Buffer.input, index=r) 33 | next = (next + 1) % size 34 | 35 | Check() 36 | XML() 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 40 | parser.add_argument('instances', type=int, help='number of instances') 41 | 42 | args = parser.parse_args() 43 | 44 | allreduce_ring(args.num_gpus, args.instances) 45 | -------------------------------------------------------------------------------- /examples/mscclang/simple/custom_collective.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # Example of a simple custom collective where Rank 0 sends a chunk to Ranks 1 and 2 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.language.collectives import Collective 9 | 10 | # For custom collectives you need to define a new collective class 11 | # this is used by mscclang to initialize buffers with chunks (pre-condition) 12 | # and provide a checker to check that chunks satisfy the post-condition of the collective. 
13 | class CollEx(Collective): 14 | # Initial state: chunk0 is on rank0 in the input buffer 15 | def init_buffers(self): 16 | chunks_per_node = self.chunk_factor 17 | rank_buffers = [] 18 | for r in range(self.num_ranks): 19 | input_buffer = [None] * chunks_per_node 20 | output_buffer = [None] * chunks_per_node 21 | if r == 0: 22 | for c in range(chunks_per_node): 23 | # Format for specifying a chunk 24 | # Chunk(starting rank, starting index, ending rank, ending index) 25 | # Because this chunk ends up on multiple ranks ending rank is set to -1 26 | input_buffer[c] = Chunk(r, c, -1, c) 27 | buffers = {Buffer.input : input_buffer, 28 | Buffer.output : output_buffer} 29 | rank_buffers.append(buffers) 30 | return rank_buffers 31 | 32 | 33 | # Final state chunk0 from rank0 is in the output buffer of rank1 and rank2 34 | def check(self, prog): 35 | correct = True 36 | for r in range(1, self.num_ranks): 37 | output = prog.buffers[r][Buffer.output] 38 | for c in range(self.chunk_factor): 39 | chunk = output[c] 40 | # Check that we got chunk 0 from rank 0 41 | if chunk is None or chunk.origin_rank != 0 or chunk.origin_index != 0: 42 | print(f'Rank {r} chunk {c} is incorrect should be ({0}, {0}) given {chunk}') 43 | correct = False 44 | return correct 45 | 46 | 47 | def custom_example1(): 48 | # MSCCLang programs take in a name for the program, the topology of the network, 49 | # collective being implemented, chunksperloop of the collective, and optionally the NCCL protocol to be used 50 | size = 3 51 | topology = fully_connected(size) 52 | # Collectives take in number of ranks in the network, chunksperloop of the collective, whether it is inplace. 53 | collective = CollEx(size, 1, inplace=False) 54 | with MSCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): 55 | # Get the chunk at rank 0 index 0 of the input buffer 56 | c = chunk(0, Buffer.input, 0) 57 | # Copy chunks to ranks 1 and 2 58 | # Can specify the sender's tb, receiver's tb, and channel for the send operation 59 | # MSCCLang provides a default threadblock assignment if they aren't specified 60 | # MSCCLang will also check the tb/channel combos are valid 61 | c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) 62 | c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) 63 | 64 | XML() # Generates the XML for this collective 65 | Check() # Checks the routes defined for each chunk are correct. 
Currently it does not check that the generated XML is correct 66 | 67 | def custom_example2(): 68 | 69 | size = 3 70 | topology = fully_connected(size) 71 | 72 | collective = CollEx(size, 1, inplace=False) 73 | with MSCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): 74 | c = chunk(0, Buffer.input, 0) 75 | # This is the same program as above but instead of rank 0 sending to 1 and 2 76 | # 0 sends to 1 which sends to 2 77 | # copy returns the chunk on the receiver's side 78 | c = c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) 79 | c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) 80 | 81 | XML() 82 | Check() 83 | 84 | custom_example1() 85 | custom_example2() 86 | -------------------------------------------------------------------------------- /examples/requirements_sccl_init.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/parasailteam/msccl-presynth -------------------------------------------------------------------------------- /examples/sccl_init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | def show(): 7 | if 'MSCCL_CONFIG' in os.environ: 8 | print() 9 | print(f"MSCCL_CONFIG = {os.environ['MSCCL_CONFIG']}") 10 | print(f"Contents of {os.environ['MSCCL_CONFIG']}:") 11 | with open(os.environ['MSCCL_CONFIG']) as f: 12 | print(f.read()) 13 | print() 14 | 15 | 16 | print('=== Trigger a builtin synthesis plan ===') 17 | 18 | import msccl 19 | msccl.init('ndv4', 9, (msccl.Collective.alltoall, '1GB')) 20 | 21 | show() 22 | 23 | 24 | print('=== Register additional plans from a library ===') 25 | 26 | import msccl_presynth 27 | msccl.init('ndv2', 3, 28 | (msccl.Collective.alltoall, '1GB'), 29 | (msccl.Collective.allgather, (128, '1KB'))) 30 | 31 | show() 32 | 33 | 34 | print('=== Register custom plans ===') 35 | 36 | from msccl.autosynth.registry import register_synthesis_plan 37 | 38 | @register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1MB', None)) 39 | def alltoall_9000(machines): 40 | return """ 41 | ... 
42 | """ 43 | 44 | msccl.init('ndv9000', 1, (msccl.Collective.alltoall, '2MB')) 45 | 46 | show() 47 | 48 | 49 | print('=== Overlapping size ranges ===') 50 | 51 | register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) 52 | register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) 53 | 54 | msccl.init('ndv9000', 1, (msccl.Collective.alltoall, ('2KB', None))) 55 | 56 | show() 57 | 58 | 59 | # TODO: Update the following programs to use the new syntax 60 | # print('=== MSCCLang program ===') 61 | 62 | # from msccl.autosynth.registry import register_msccl_program 63 | # from msccl.topologies import line 64 | # from msccl.language import * 65 | 66 | # @register_msccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) 67 | # def trivial_allgather(prog, nodes): 68 | # chunk(Buffer.input, 0, 0).send(0, Buffer.output, 0).send(1) 69 | # chunk(Buffer.input, 1, 0).send(1, Buffer.output, 1).send(0) 70 | 71 | # msccl.init('two_gpus', 1, (msccl.Collective.allgather, (0, None))) 72 | 73 | # show() 74 | 75 | 76 | # print('=== MSCCLang program example ====') 77 | 78 | # from msccl.topologies import fully_connected 79 | # from msccl.programs.allreduce_a100_ring import allreduce_ring 80 | 81 | # @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 82 | # instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines=lambda x: x == 1) 83 | # def ndv4_ring_allreduce(prog, nodes): 84 | # allreduce_ring(size=8, channels=8) 85 | 86 | # msccl.init('ndv4', 1, (msccl.Collective.allreduce, (0, None))) 87 | 88 | # show() -------------------------------------------------------------------------------- /examples/send.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # This script defines and saves a custom collective to send from rank 2 to rank 7 5 | 6 | from msccl.collectives import build_collective 7 | from msccl.serialization import save_msccl_object 8 | 9 | precondition = lambda r, c: r == 2 10 | postcondition = lambda r, c: r == 7 11 | coll = build_collective('Send', 8, 1, precondition, postcondition) 12 | save_msccl_object(coll, 'send.json') 13 | -------------------------------------------------------------------------------- /examples/unpermute_dgx1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # This script shows how to use MSCCL to find a way to permute the nodes of a DGX1 to match the default order. 5 | 6 | from msccl.topologies import * 7 | from msccl.isomorphisms import find_isomorphisms 8 | 9 | def solve_dgx1_permutation(): 10 | local = nvlink_only() 11 | isomorphisms = find_isomorphisms(dgx1(), local, limit=4) 12 | if len(isomorphisms) == 0: 13 | raise RuntimeError('No isomorphism to DGX1 found') 14 | return isomorphisms 15 | print(solve_dgx1_permutation()) 16 | -------------------------------------------------------------------------------- /msccl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from msccl.autosynth import init, tabulate_plans, print_plans 5 | from msccl.autosynth import ndv2_perm 6 | from msccl.autosynth import Collective 7 | -------------------------------------------------------------------------------- /msccl/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # PYTHON_ARGCOMPLETE_OK 3 | 4 | # Copyright (c) Microsoft Corporation. 5 | # Licensed under the MIT License. 6 | 7 | import msccl.collectives as collectives 8 | import msccl.topologies as topologies 9 | import msccl.strategies as strategies 10 | from msccl.cli import * 11 | 12 | import argparse 13 | import argcomplete 14 | import sys 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser('msccl') 18 | 19 | cmd_parsers = parser.add_subparsers(title='command', dest='command') 20 | cmd_parsers.required = True 21 | 22 | handlers = [] 23 | handlers.append(make_solvers(cmd_parsers)) 24 | handlers.append(make_composers(cmd_parsers)) 25 | handlers.append(make_distributors(cmd_parsers)) 26 | handlers.append(make_analyses(cmd_parsers)) 27 | handlers.append(make_handle_ncclize(cmd_parsers)) 28 | handlers.append(make_plans(cmd_parsers)) 29 | 30 | argcomplete.autocomplete(parser) 31 | args = parser.parse_args() 32 | 33 | for handler in handlers: 34 | if handler(args, args.command): 35 | break 36 | 37 | if __name__ == '__main__': 38 | main() 39 | -------------------------------------------------------------------------------- /msccl/algorithm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from dataclasses import dataclass 5 | from collections import defaultdict 6 | 7 | @dataclass 8 | class Step(object): 9 | rounds: int 10 | sends: list 11 | 12 | class Algorithm(object): 13 | def __init__(self, name, collective, topology, instance, steps, input_map = {}, output_map = {}): 14 | self.name = name 15 | self.topology = topology 16 | self.collective = collective 17 | self.instance = instance 18 | self.steps = steps 19 | self.input_map = input_map 20 | self.output_map = output_map 21 | 22 | self._update_link_utilizations() 23 | self._check_bandwidth_constraints() 24 | 25 | for step in self.steps: 26 | step.sends.sort() 27 | 28 | @classmethod 29 | def make_implementation(cls, collective, topology, instance, steps): 30 | chunked = collective.chunk_up(instance.chunks) 31 | 32 | # Figure out input and output addresses 33 | input_map = {} 34 | output_map = {} 35 | for rank in chunked.ranks(): 36 | input_addrs = set() 37 | output_addrs = set() 38 | for chunk in chunked.chunks(): 39 | # An address is an input address if any of its chunks is in the precondition 40 | if chunked.precondition(rank, chunk): 41 | input_addrs.add(chunked.address(chunk)) 42 | # An address is an output address if any of its chunks is in the postcondition 43 | if chunked.postcondition(rank, chunk): 44 | output_addrs.add(chunked.address(chunk)) 45 | if len(input_addrs) > 0: 46 | input_map[rank] = input_addrs 47 | if len(output_addrs) > 0: 48 | output_map[rank] = output_addrs 49 | 50 | # Concatenate collective and topology names plus instance arguments to create a name 51 | name = f'{collective.name}-{topology.name}-{instance}' 52 | 53 | algo = cls(name, collective, topology, instance, steps, input_map, output_map) 54 | algo.check_implements(chunked) 55 | if instance.extra_rounds > 0: 56 | used_extra_rounds = algo.extra_rounds() 57 | if 
used_extra_rounds > instance.extra_rounds: 58 | raise ValueError(f'steps use {used_extra_rounds} extra rounds but only {instance.extra_rounds} were allowed') 59 | return algo 60 | 61 | def ranks(self): 62 | return range(self.topology.num_nodes()) 63 | 64 | def num_steps(self): 65 | return len(self.steps) 66 | 67 | def extra_rounds(self): 68 | rounds = 0 69 | for step in self.steps: 70 | rounds += step.rounds 71 | return rounds - self.num_steps() 72 | 73 | def is_pipelined(self): 74 | return self.instance.pipeline != None 75 | 76 | def check_implements(self, collective): 77 | if self.topology.num_nodes() != collective.num_nodes: 78 | raise RuntimeError('topology and collective have different number of nodes') 79 | # Find which chunks will be sent from an address 80 | chunks_at_address = defaultdict(list) 81 | for chunk in collective.chunks(): 82 | chunks_at_address[collective.address(chunk)].append(chunk) 83 | # State records if a rank holds a chunk 84 | def idx(rank, chunk): 85 | return rank * collective.num_chunks + chunk 86 | state = [False] * (collective.num_nodes * collective.num_chunks) 87 | # Initialize state from precondition 88 | for rank in collective.ranks(): 89 | for chunk in collective.chunks(): 90 | state[idx(rank, chunk)] = collective.precondition(rank, chunk) 91 | # Propagate state through sends of every step 92 | for step in self.steps: 93 | next_state = state.copy() 94 | for addr, src, dst in step.sends: 95 | for chunk in chunks_at_address[addr]: 96 | next_state[idx(dst, chunk)] |= state[idx(src, chunk)] 97 | state = next_state 98 | # Check that the postcondition holds 99 | for rank in collective.ranks(): 100 | for chunk in collective.chunks(): 101 | if collective.postcondition(rank, chunk) and not state[idx(rank, chunk)]: 102 | raise RuntimeError(f'rank {rank} does not get chunk {chunk} as required by the postcondition') 103 | 104 | def _update_link_utilizations(self): 105 | self._link_utilizations = [] 106 | ranks = range(self.topology.num_nodes()) 107 | for step in self.steps: 108 | step_utilizations = [[0 for _ in ranks] for _ in ranks] 109 | for addr, src, dst in step.sends: 110 | step_utilizations[dst][src] += 1 # Same order as topology 111 | self._link_utilizations.append(step_utilizations) 112 | 113 | def _check_bandwidth_constraints(self): 114 | for srcs, dsts, bw, name in self.topology.bandwidth_constraints(): 115 | for step_num, step in enumerate(self.steps): 116 | util = 0 117 | for dst in dsts: 118 | for src in srcs: 119 | if self.is_pipelined(): 120 | for overlapping_step in range(step_num, len(self.steps), self.instance.pipeline): 121 | util += self._link_utilizations[overlapping_step][dst][src] 122 | else: 123 | util += self._link_utilizations[step_num][dst][src] 124 | assert util <= bw * step.rounds, \ 125 | f'Step {step_num} uses {util} bandwidth but constraint {name} only allows for {bw * step.rounds} bandwidth (when rounds={step.rounds}).' 
126 | 127 | def __str__(self): 128 | s = '' 129 | for i, step in enumerate(self.steps): 130 | if i != 0: 131 | s += '\n' 132 | if step.rounds > 1: 133 | s += f'(step {i+1}, rounds={step.rounds}) ' 134 | else: 135 | s += f'(step {i+1}) ' 136 | s += ', '.join([f'{chunk}:{src}→{dst}' for chunk, src, dst in step.sends]) 137 | return s 138 | -------------------------------------------------------------------------------- /msccl/autosynth/msccl_ndv2_launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -c "import msccl; msccl.ndv2_perm()" 3 | order=/var/lock/msccl_autosynth_inspector_topo.lock 4 | if [ -f "$order" ]; then 5 | export CUDA_VISIBLE_DEVICES=$(<$order) 6 | fi 7 | $@ 8 | -------------------------------------------------------------------------------- /msccl/autosynth/ndv2_plans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.autosynth.registry import register_synthesis_plan 5 | from msccl.collectives import gather, scatter 6 | from msccl.strategies import solve_least_steps 7 | from msccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall 8 | from msccl.ncclize import ncclize 9 | from msccl.topologies import dgx1 10 | 11 | 12 | def register_ndv2_plans(): 13 | @register_synthesis_plan('alltoall', 'ndv2', machines=lambda m: m >= 2) 14 | def synthesize_ndv2_relay_alltoall(machines): 15 | gather_coll = gather(8, 0) 16 | scatter_coll = scatter(8, 1) 17 | gather_algo = solve_least_steps(dgx1(), gather_coll) 18 | scatter_algo = solve_least_steps(dgx1(), scatter_coll) 19 | algo = synthesize_gather_scatter_distributed_alltoall( 20 | machines, gather_algo, scatter_algo) 21 | return ncclize(algo, instances=8) -------------------------------------------------------------------------------- /msccl/autosynth/ndv4_plans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.autosynth.registry import register_synthesis_plan, register_msccl_program 5 | from msccl.programs.allreduce_a100_ring import allreduce_ring 6 | from msccl.programs.allreduce_allpairs import allreduce_allpairs 7 | from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical 8 | from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step 9 | from msccl.topologies import fully_connected 10 | from msccl.language.ir import ThreadblockPolicy 11 | 12 | def register_ndv4_plans(): 13 | 14 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, 15 | instances=2, protocol='LL', sizes=('512B', '82944B'), threadblock_policy=ThreadblockPolicy.manual, interleaved_replication=False, dependence_nop=True, machines= lambda x: x == 1) 16 | def ndv4_allpairs_allreduce_config1(prog, nodes): 17 | allreduce_allpairs(8) 18 | 19 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, 20 | instances=4, protocol='LL', sizes=('82944B', '458752B'), threadblock_policy=ThreadblockPolicy.manual, interleaved_replication=False, dependence_nop=True, machines= lambda x: x == 1) 21 | def ndv4_allpairs_allreduce_config2(prog, nodes): 22 | allreduce_allpairs(8) 23 | 24 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 25 | instances=8, protocol='LL', sizes=('458752B', '2129920B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) 26 | def ndv4_ring_allreduce_config1(prog, nodes): 27 | allreduce_ring(size=8, channels=4) 28 | 29 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 30 | instances=8, protocol='LL128', sizes=('2129920B', '22806528B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) 31 | def ndv4_ring_allreduce_config2(prog, nodes): 32 | allreduce_ring(size=8, channels=4) 33 | 34 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32 or x == 64) 35 | def ndv4_alltoall_hierarchical_config1(prog, nodes): 36 | 
alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) 37 | 38 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 8 or x == 16 or x == 32) 39 | def ndv4_alltoall_hierarchical_config2(prog, nodes): 40 | alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) 41 | 42 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) 43 | def ndv4_alltoall_three_step(prog, nodes): 44 | alltoall_three_step(num_nodes=nodes, gpus_per_node=8) 45 | 46 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('1KB', None), machines=lambda x: x == 2 or x == 4) 47 | def ndv4_alltoall_hierarchical_config3(prog, nodes): 48 | alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) 49 | 50 | 51 | -------------------------------------------------------------------------------- /msccl/autosynth/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from collections import defaultdict 5 | import math 6 | import tempfile 7 | import os 8 | import atexit 9 | import humanfriendly 10 | 11 | from msccl.language import MSCCLProgram, ir_to_xml 12 | from msccl.language.ir import ThreadblockPolicy 13 | import msccl.language.collectives as lang_collectives 14 | from msccl.topologies import distributed_fully_connected 15 | 16 | # The plans are keyed by (collective, machine_type) and each entry is a tuple 17 | # (name, function, machines, size_range, protocol, priority). 18 | synthesis_plans = defaultdict(list) 19 | 20 | 21 | def _register_ef_provider(desc, fun, collective, machine_type, machines, sizes, protocol, priority): 22 | if sizes == None: 23 | sizes = (0, math.inf) 24 | else: 25 | lower, upper = sizes 26 | if isinstance(lower, str): 27 | lower = humanfriendly.parse_size(lower) 28 | if isinstance(upper, str): 29 | upper = humanfriendly.parse_size(upper) 30 | if upper == None: 31 | upper = math.inf 32 | sizes = (lower, upper) 33 | # Register entries under all keys that might trigger this plan 34 | entry = (desc, fun, machines, sizes, protocol, priority) 35 | if isinstance(machine_type, list): 36 | for mtype in machine_type: 37 | synthesis_plans[(collective, mtype)].append(entry) 38 | else: 39 | synthesis_plans[(collective, machine_type)].append(entry) 40 | 41 | 42 | def register_ef_file(path, collective, machine_type, num_machines, sizes=None, protocol='Simple', priority=0): 43 | def provide_ef_path(machines): 44 | return path 45 | _register_ef_provider(f'load {path}', provide_ef_path, collective, 46 | machine_type, lambda x: x == num_machines, sizes, protocol, priority) 47 | 48 | 49 | def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', priority=0): 50 | def decorator(fun): 51 | def wrapped(machines): 52 | ef = fun(machines) 53 | fd, path = tempfile.mkstemp() 54 | with os.fdopen(fd, 'w') as f: 55 | f.write(ef) 56 | atexit.register(os.remove, path) 57 | return path 58 | _register_ef_provider(f'call {fun.__name__}', wrapped, collective, 59 | machine_type, machines, sizes, protocol, priority) 60 | # Return the original function to not break other usage 61 | return fun 62 | return decorator 63 | 64 | 65 | def register_msccl_program(local_topology, collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', 66 | chunk_factor=1, 
priority=0, collective_obj=None, instances=1, inplace=False, threadblock_policy=ThreadblockPolicy.auto, 67 | interleaved_replication=True, dependence_nop=False): 68 | def decorator(fun): 69 | name = fun.__name__ 70 | def wrapped(machines): 71 | topology = distributed_fully_connected(local_topology, machines, 1) 72 | co = collective_obj 73 | if co == None: 74 | if collective == 'allreduce': 75 | co = lang_collectives.AllReduce(topology.num_nodes(), chunk_factor, inplace) 76 | elif collective == 'allgather': 77 | co = lang_collectives.AllGather(topology.num_nodes(), chunk_factor, inplace) 78 | elif collective == 'alltoall': 79 | co = lang_collectives.AllToAll(topology.num_nodes(), chunk_factor, inplace) 80 | elif collective == 'reduce_scatter': 81 | co = lang_collectives.ReduceScatter(topology.num_nodes(), chunk_factor, inplace) 82 | else: 83 | raise RuntimeError(f'No collective_obj in msccl.language.collectives known for "{collective}"') 84 | prog = MSCCLProgram(name, topology, co, instances, protocol, threadblock_policy=threadblock_policy, 85 | interleaved_replication=interleaved_replication, dependence_nop=dependence_nop) 86 | with prog: 87 | fun(prog, machines) 88 | prog.check() 89 | ef = prog.generate_xml() 90 | fd, path = tempfile.mkstemp() 91 | with os.fdopen(fd, 'w') as f: 92 | f.write(ef) 93 | atexit.register(os.remove, path) 94 | return path 95 | _register_ef_provider(f'run {name}', wrapped, collective, 96 | machine_type, machines, sizes, protocol, priority) 97 | # Return the original function to not break other usage 98 | return fun 99 | return decorator -------------------------------------------------------------------------------- /msccl/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .solve import * 5 | from .compose import * 6 | from .distribute import * 7 | from .analyze import * 8 | from .ncclize import * 9 | from .plans import * 10 | -------------------------------------------------------------------------------- /msccl/cli/analyze.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
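# Illustrative usage of this command category (a sketch, assuming the `msccl`
# console entry point; topology and collective names are those registered in
# known_topologies.py and known_collectives.py):
#
#   msccl analyze rounds DGX1 Allgather
#   msccl analyze isomorphisms DGX1 NVLinkOnly
#
# `rounds` prints a lower bound on the rounds any algorithm needs, while
# `isomorphisms` searches for node permutations mapping topology 1 onto
# topology 2.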
3 | 4 | from .known_topologies import KnownTopologies 5 | from .known_collectives import KnownCollectives 6 | from .common import * 7 | from msccl.rounds_bound import lower_bound_rounds 8 | from msccl.isomorphisms import find_isomorphisms 9 | 10 | def make_analyses(cmd_parsers): 11 | handler_funcs = [] 12 | handler_funcs.append(make_handle_bound_rounds) 13 | handler_funcs.append(make_handle_find_isomorphisms) 14 | 15 | return make_cmd_category(cmd_parsers, 'analyze', 'analysis', handler_funcs) 16 | 17 | def make_handle_bound_rounds(cmd_parsers): 18 | cmd = cmd_parsers.add_parser('rounds') 19 | topologies = KnownTopologies(cmd) 20 | collectives = KnownCollectives(cmd) 21 | 22 | def handle(args, command): 23 | if command != 'rounds': 24 | return False 25 | 26 | topology = topologies.create(args) 27 | collective = collectives.create(args, topology.num_nodes()) 28 | lower_bound_rounds(topology, collective, logging=True) 29 | return True 30 | 31 | return handle 32 | 33 | def make_handle_find_isomorphisms(cmd_parsers): 34 | cmd = cmd_parsers.add_parser('isomorphisms') 35 | topologies1 = KnownTopologies(cmd, tag='1') 36 | topologies2 = KnownTopologies(cmd, tag='2') 37 | 38 | def handle(args, command): 39 | if command != 'isomorphisms': 40 | return False 41 | 42 | topology1 = topologies1.create(args) 43 | topology2 = topologies2.create(args) 44 | isomorphisms = find_isomorphisms(topology1, topology2, logging=True) 45 | return True 46 | 47 | return handle 48 | -------------------------------------------------------------------------------- /msccl/cli/compose.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.composers import * 5 | from .common import * 6 | 7 | def make_composers(cmd_parsers): 8 | handler_funcs = [] 9 | handler_funcs.append(make_handle_allreduce) 10 | 11 | return make_cmd_category(cmd_parsers, 'compose', 'composer', handler_funcs) 12 | 13 | def make_handle_allreduce(cmd_parsers): 14 | name = 'allreduce' 15 | cmd = cmd_parsers.add_parser(name) 16 | read_reducescatter_algorithm = add_input_algorithm(cmd, name="reducescatter-algorithm") 17 | read_allgather_algorithm = add_input_algorithm(cmd, name="allgather-algorithm") 18 | validate_output_args, output_handler = add_output_algorithm(cmd) 19 | 20 | def handle(args, command): 21 | if command != name: 22 | return False 23 | 24 | reducescatter_algorithm = read_reducescatter_algorithm(args) 25 | allgather_algorithm = read_allgather_algorithm(args) 26 | validate_output_args(args) 27 | algo = compose_allreduce(reducescatter_algorithm, allgather_algorithm, logging=True) 28 | output_handler(args, algo) 29 | return True 30 | 31 | return handle -------------------------------------------------------------------------------- /msccl/cli/distribute.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
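# Illustrative pipeline (a sketch: `-o` is assumed to be the output flag added
# by add_output_algorithm in common.py, and the .json file names are
# hypothetical):
#
#   msccl solve least-steps DGX1 Gather -o gather.json
#   msccl solve least-steps DGX1 Scatter --root 1 -o scatter.json
#   msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 2
#
# This mirrors what msccl/autosynth/ndv2_plans.py does programmatically via
# synthesize_gather_scatter_distributed_alltoall.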
3 | 4 | from msccl.distributors import * 5 | from .known_distributed_topologies import KnownDistributedTopologies 6 | from .known_topologies import KnownTopologies 7 | from .common import * 8 | 9 | def make_distributors(cmd_parsers): 10 | handler_funcs = [] 11 | handler_funcs.append(make_handle_greedy_alltoall) 12 | handler_funcs.append(make_handle_gather_scatter_alltoall) 13 | handler_funcs.append(make_handle_create_subproblem_distributed_alltoall) 14 | handler_funcs.append(make_handle_distribute_alltoall_stitch_subproblem) 15 | 16 | return make_cmd_category(cmd_parsers, 'distribute', 'distributor', handler_funcs) 17 | 18 | def make_handle_greedy_alltoall(cmd_parsers): 19 | name = 'alltoall-greedy' 20 | cmd = cmd_parsers.add_parser(name) 21 | read_algorithm = add_input_algorithm(cmd) 22 | distributed_topologies = KnownDistributedTopologies(cmd) 23 | validate_output_args, output_handler = add_output_algorithm(cmd) 24 | 25 | def handle(args, command): 26 | if command != name: 27 | return False 28 | 29 | input_algorithm = read_algorithm(args) 30 | validate_output_args(args) 31 | topology = distributed_topologies.create(args, input_algorithm.topology) 32 | algo = synthesize_greedy_distributed_alltoall(topology, input_algorithm, logging=True) 33 | output_handler(args, algo) 34 | return True 35 | 36 | return handle 37 | 38 | def make_handle_gather_scatter_alltoall(cmd_parsers): 39 | name = 'alltoall-gather-scatter' 40 | cmd = cmd_parsers.add_parser(name) 41 | read_gather_algorithm = add_input_algorithm(cmd, name='gather') 42 | read_scatter_algorithm = add_input_algorithm(cmd, name='scatter') 43 | cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') 44 | cmd.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='remote bandwidth', metavar='N') 45 | validate_output_args, output_handler = add_output_algorithm(cmd) 46 | 47 | def handle(args, command): 48 | if command != name: 49 | return False 50 | 51 | gather_algorithm = read_gather_algorithm(args) 52 | scatter_algorithm = read_scatter_algorithm(args) 53 | validate_output_args(args) 54 | algo = synthesize_gather_scatter_distributed_alltoall(args.copies, gather_algorithm, scatter_algorithm, args.remote_bandwidth, logging=True) 55 | output_handler(args, algo) 56 | return True 57 | 58 | return handle 59 | 60 | def make_handle_create_subproblem_distributed_alltoall(cmd_parsers): 61 | name = 'alltoall-create-subproblem' 62 | cmd = cmd_parsers.add_parser(name) 63 | topologies = KnownTopologies(cmd) 64 | cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') 65 | cmd.add_argument('--relay-nodes', type=int, nargs='+', default=[0], help='relay nodes') 66 | cmd.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='remote bandwidth', metavar='N') 67 | cmd.add_argument('--share-bandwidth', action='store_true', help='share local bandwidth between relay nodes') 68 | validate_output_args, output_handler = add_output_msccl_objects(cmd) 69 | 70 | def handle(args, command): 71 | if command != name: 72 | return False 73 | 74 | local_topology = topologies.create(args) 75 | validate_output_args(args) 76 | 77 | collective, topology = make_alltoall_subproblem_collective_and_topology(local_topology, args.copies, args.relay_nodes, args.remote_bandwidth, args.share_bandwidth) 78 | 79 | output_handler(args, collective, collective.name) 80 | output_handler(args, topology, topology.name) 81 | return True 82 | 83 | return handle 
84 | 85 | def make_handle_distribute_alltoall_stitch_subproblem(cmd_parsers): 86 | name = 'alltoall-stitch-subproblem' 87 | cmd = cmd_parsers.add_parser(name) 88 | read_subproblem_algorithm = add_input_algorithm(cmd) 89 | cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology made for the subproblem') 90 | validate_output_args, output_handler = add_output_algorithm(cmd) 91 | 92 | def handle(args, command): 93 | if command != name: 94 | return False 95 | 96 | subproblem_algorithm = read_subproblem_algorithm(args) 97 | validate_output_args(args) 98 | algo = synthesize_alltoall_subproblem(subproblem_algorithm, args.copies, logging=True) 99 | output_handler(args, algo) 100 | return True 101 | 102 | return handle -------------------------------------------------------------------------------- /msccl/cli/known_collectives.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.collectives as collectives 5 | from msccl.serialization import * 6 | from pathlib import Path 7 | import sys 8 | 9 | class KnownCollectives: 10 | def __init__(self, parser): 11 | self.parser = parser 12 | self.constructors = { 13 | 'Broadcast': self._rooted_coll(collectives.broadcast), 14 | 'Reduce': self._rooted_coll(collectives.reduce), 15 | 'Scatter': self._rooted_coll(collectives.scatter), 16 | 'Gather': self._rooted_coll(collectives.gather), 17 | 'Allgather': self._coll(collectives.allgather), 18 | 'Allreduce': self._coll(collectives.allreduce), 19 | 'Alltoall': self._coll(collectives.alltoall), 20 | 'ReduceScatter': self._coll(collectives.reduce_scatter), 21 | 'Scan': self._coll(collectives.scan), 22 | 'MultirootBroadcast': self._multiroot_coll(collectives.multiroot_broadcast), 23 | 'MultirootScatter': self._multiroot_coll(collectives.multiroot_scatter), 24 | 'MultirootGather': self._multiroot_coll(collectives.multiroot_gather), 25 | 'custom': self._custom_coll(), 26 | } 27 | self.parser.add_argument('collective', type=str, choices=self.constructors.keys(), help='collective') 28 | self.parser.add_argument('--collective-file', type=Path, default=None, help='a serialized collective', metavar='FILE') 29 | self.parser.add_argument('--root', type=int, default=0, help='used by rooted collectives', metavar='N') 30 | self.parser.add_argument('--roots', type=int, nargs='+', default=[0], help='used by multi-rooted collectives', metavar='N') 31 | 32 | def create(self, args, num_nodes): 33 | return self.constructors[args.collective](num_nodes, args) 34 | 35 | def _custom_coll(self): 36 | def make(size, args): 37 | input_file = args.collective_file 38 | if input_file is None: 39 | self.parser.error('--collective-file is required for custom collectives') 40 | exit(1) 41 | 42 | if not input_file.exists(): 43 | print(f'error: input file not found: {input_file}', file=sys.stderr) 44 | exit(1) 45 | 46 | return load_msccl_object(input_file) 47 | return make 48 | 49 | def _rooted_coll(self, fun): 50 | def make(size, args): 51 | root = args.root 52 | return fun(size, root) 53 | return make 54 | 55 | def _coll(self, fun): 56 | def make(size, args): 57 | return fun(size) 58 | return make 59 | 60 | def _multiroot_coll(self, fun): 61 | def make(size, args): 62 | roots = args.roots 63 | return fun(size, roots) 64 | return make 65 | -------------------------------------------------------------------------------- /msccl/cli/known_distributed_topologies.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.topologies as topologies 5 | import pathlib 6 | 7 | class KnownDistributedTopologies: 8 | def __init__(self, parser): 9 | self.parser = parser 10 | self.constructors = { 11 | 'DistributedFullyConnected': topologies.distributed_fully_connected, 12 | 'DistributedHubAndSpoke': topologies.distributed_hub_and_spoke, 13 | } 14 | self.parser.add_argument('topology', type=str, choices=self.constructors.keys(), help='the distributed topology') 15 | self.parser.add_argument('-n', '--nodes', type=int, help='total nodes in the distributed topology, must be divisible by local topology') 16 | self.parser.add_argument('--copies', type=int, help='copies of the local topology to be made') 17 | self.parser.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='bandwidth of links in the distributed topology', metavar='N') 18 | 19 | def create(self, args, local_topology): 20 | if args.nodes != None and args.copies != None: 21 | self.parser.error('please use only one of -n/--nodes, --copies') 22 | if args.copies != None: 23 | copies = args.copies 24 | elif args.nodes != None: 25 | if args.nodes % local_topology.num_nodes() != 0: 26 | self.parser.error(f'total number of nodes must be divisible by the local number of nodes {local_topology.num_nodes()}, but {args.nodes} was given') 27 | copies = args.nodes // local_topology.num_nodes() 28 | else: 29 | self.parser.error('one of the following arguments is required: --nodes, --copies') 30 | return self.constructors[args.topology](local_topology, copies, args.remote_bandwidth) 31 | -------------------------------------------------------------------------------- /msccl/cli/known_topologies.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
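# Quick reference (illustrative): sized topologies require -n/--nodes while
# fixed-size ones have an implied size, e.g.
#
#   msccl solve least-steps Ring Allgather -n 4   # Ring needs -n
#   msccl solve least-steps DGX1 Allgather        # DGX1 is fixed at 8 nodes
#
# _sized_topo and _fixed_topo below enforce this split; a mismatching -n for
# a fixed topology is rejected with a parser error.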
3 | 4 | import msccl.topologies as topologies 5 | from msccl.serialization import * 6 | from .known_transformers import KnownTransformers 7 | from pathlib import Path 8 | import sys 9 | 10 | class KnownTopologies: 11 | def __init__(self, parser, tag=''): 12 | self.parser = parser 13 | self.tag = tag 14 | self.constructors = { 15 | 'FullyConnected': self._sized_topo(topologies.fully_connected), 16 | 'HubAndSpoke': self._sized_topo(topologies.hub_and_spoke), 17 | 'Ring': self._sized_topo(topologies.ring), 18 | 'Line': self._sized_topo(topologies.line), 19 | 'Star': self._sized_topo(topologies.star), 20 | 'AMD4': self._fixed_topo(topologies.amd4), 21 | 'AMD8': self._fixed_topo(topologies.amd8), 22 | 'DGX1': self._fixed_topo(topologies.dgx1), 23 | 'DGX2': self._fixed_topo(lambda: topologies.hub_and_spoke(16)), 24 | 'NVLinkOnly': self._fixed_topo(topologies.nvlink_only), 25 | 'custom': self._custom_topo(), 26 | } 27 | self.parser.add_argument(f'topology{tag}', type=str, choices=self.constructors.keys(), help=f'topology {tag}') 28 | self.parser.add_argument(f'--topology-file{tag}', type=Path, default=None, help=f'a serialized topology', metavar=f'FILE') 29 | self.parser.add_argument(f'-n{tag}', f'--nodes{tag}', type=int, help='required for non-fixed topologies', metavar='N') 30 | self.known_transformers = KnownTransformers(parser, tag=tag) 31 | 32 | def _topology(self, args): 33 | return vars(args)[f'topology{self.tag}'] 34 | 35 | def _nodes(self, args): 36 | return vars(args)[f'nodes{self.tag}'] 37 | 38 | def create(self, args): 39 | topology = self.constructors[self._topology(args)](args) 40 | topology = self.known_transformers.transform(args, topology) 41 | return topology 42 | 43 | def _custom_topo(self): 44 | def make(args): 45 | input_file = vars(args)[f'topology_file{self.tag}'] 46 | if input_file is None: 47 | self.parser.error(f'--topology-file{self.tag} is required for custom topologies') 48 | exit(1) 49 | 50 | if not input_file.exists(): 51 | print(f'error: input file not found: {input_file}', file=sys.stderr) 52 | exit(1) 53 | 54 | return load_msccl_object(input_file) 55 | return make 56 | 57 | def _fixed_topo(self, Cls): 58 | def make(args): 59 | topo = Cls() 60 | if self._nodes(args) != None and self._nodes(args) != topo.num_nodes(): 61 | self.parser.error(f'fixed-size topology {self._topology(args)} has {topo.num_nodes()} nodes, but command line specified {self._nodes(args)} nodes') 62 | return topo 63 | return make 64 | 65 | def _sized_topo(self, Cls): 66 | def make(args): 67 | if self._nodes(args) == None: 68 | self.parser.error(f'topology {self._topology(args)} requires -n/--nodes') 69 | return Cls(self._nodes(args)) 70 | return make 71 | -------------------------------------------------------------------------------- /msccl/cli/known_transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.topologies as topologies 5 | 6 | class KnownTransformers: 7 | def __init__(self, parser, tag=''): 8 | self.parser = parser 9 | self.tag = tag 10 | self.transformers = { 11 | 'reverse': topologies.reverse_topology, 12 | 'binarize': topologies.binarize_topology, 13 | } 14 | self.parser.add_argument(f'-t{tag}', f'--transform{tag}', action='append', default=[], choices=self.transformers.keys(), help='apply a topology transformer. 
may be used multiple times') 15 | 16 | def transform(self, args, topology): 17 | for key in vars(args)[f'transform{self.tag}']: 18 | topology = self.transformers[key](topology) 19 | return topology 20 | -------------------------------------------------------------------------------- /msccl/cli/ncclize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.ncclize import * 5 | from .common import * 6 | 7 | def make_handle_ncclize(cmd_parsers): 8 | cmd = cmd_parsers.add_parser('ncclize') 9 | read_algorithm = add_input_algorithm(cmd, multiple=True) 10 | validate_output_args, output_handler = add_output_file(cmd) 11 | remap_scratch_grp = cmd.add_mutually_exclusive_group() 12 | remap_scratch_grp.add_argument('--remap-scratch', action='store_true', default=None, help='remap scratch buffer indices into free input/output indices') 13 | remap_scratch_grp.add_argument('--no-remap-scratch', action='store_false', dest='remap_scratch', help='don\'t remap scratch buffer indices into free input/output indices') 14 | cmd.add_argument('--no-merge-contiguous', action='store_true', help='don\'t merge sends/receives from/to contiguous memory') 15 | cmd.add_argument('--no-pretty-print', action='store_true', help='don\'t pretty print the generated XML') 16 | cmd.add_argument('--greedy-scratch-sorting', action='store_true', help='sort scratch buffer indices greedily to increase contiguous operations') 17 | cmd.add_argument('--no-scratch', action='store_true', help='use extra space at the end of output buffer instead of the scratch buffer') 18 | cmd.add_argument('--channel-policy', type=ChannelPolicy, choices=list(ChannelPolicy), default=ChannelPolicy.MatchTopology, help='channel allocation policy') 19 | cmd.add_argument('--instances', type=int, default=1, help='number of interleaved instances of the algorithm to make') 20 | 21 | def handle(args, command): 22 | if command != 'ncclize': 23 | return False 24 | 25 | input_algorithms = read_algorithm(args) 26 | validate_output_args(args) 27 | 28 | for algo in input_algorithms: 29 | ncclized = ncclize(algo, 30 | remap_scratch=args.remap_scratch, 31 | channel_policy=args.channel_policy, 32 | pretty_print=not args.no_pretty_print, 33 | use_scratch=not args.no_scratch, 34 | merge_contiguous=not args.no_merge_contiguous, 35 | greedy_scratch_sorting=args.greedy_scratch_sorting, 36 | instances=args.instances, 37 | logging=True) 38 | 39 | handled = output_handler(args, lambda: ncclized, name_msccl_object(algo.name, ending='msccl.xml')) 40 | 41 | return True 42 | 43 | return handle 44 | -------------------------------------------------------------------------------- /msccl/cli/plans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
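# Illustrative usage (a sketch, assuming the `msccl` console entry point):
#
#   msccl plans list
#
# prints every synthesis plan registered in msccl.autosynth.registry (e.g.
# the ndv2/ndv4 plans), keyed by collective and machine type.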
3 | 4 | from .common import * 5 | from msccl.autosynth import * 6 | 7 | def make_plans(cmd_parsers): 8 | handler_funcs = [] 9 | handler_funcs.append(make_handle_list) 10 | 11 | return make_cmd_category(cmd_parsers, 'plans', 'subcommand', handler_funcs) 12 | 13 | def make_handle_list(cmd_parsers): 14 | cmd = cmd_parsers.add_parser('list') 15 | 16 | def handle(args, command): 17 | if command != 'list': 18 | return False 19 | 20 | print_plans() 21 | return True 22 | 23 | return handle 24 | -------------------------------------------------------------------------------- /msccl/cli/solve.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.strategies as strategies 5 | from .known_topologies import KnownTopologies 6 | from .known_collectives import KnownCollectives 7 | from .common import * 8 | 9 | def make_solvers(cmd_parsers): 10 | handler_funcs = [] 11 | handler_funcs.append(make_handle_solve_instance) 12 | handler_funcs.append(make_handle_solve_least_steps) 13 | handler_funcs.append(make_handle_solve_pareto_optimal) 14 | 15 | return make_cmd_category(cmd_parsers, 'solve', 'solver', handler_funcs) 16 | 17 | def _make_handle_strategy(cmd_parsers, name, invoke, take_steps = True): 18 | cmd = cmd_parsers.add_parser(name) 19 | instance_handler = add_instance(cmd, take_steps=take_steps) 20 | topologies = KnownTopologies(cmd) 21 | collectives = KnownCollectives(cmd) 22 | validate_output_args, output_handler = add_output_algorithm(cmd) 23 | 24 | def handle(args, command): 25 | if command != name: 26 | return False 27 | 28 | validate_output_args(args) 29 | topology = topologies.create(args) 30 | collective = collectives.create(args, topology.num_nodes()) 31 | instance = instance_handler(args) 32 | algo = invoke(args, topology, collective, instance) 33 | output_handler(args, algo) 34 | return True 35 | 36 | return cmd, handle 37 | 38 | def make_handle_solve_instance(cmd_parsers): 39 | def invoke(args, topology, collective, instance): 40 | return strategies.solve_instance(topology, collective, instance, logging=True) 41 | 42 | cmd, handle = _make_handle_strategy(cmd_parsers, 'instance', invoke) 43 | return handle 44 | 45 | def make_handle_solve_least_steps(cmd_parsers): 46 | def invoke(args, topology, collective, instance): 47 | return strategies.solve_least_steps(topology, collective, args.initial_steps, instance, logging=True) 48 | 49 | cmd, handle = _make_handle_strategy(cmd_parsers, 'least-steps', invoke, take_steps=False) 50 | cmd.add_argument('--initial-steps', type=int, default=1, metavar='N') 51 | return handle 52 | 53 | def make_handle_solve_pareto_optimal(cmd_parsers): 54 | name = 'pareto-optimal' 55 | cmd = cmd_parsers.add_parser(name) 56 | topologies = KnownTopologies(cmd) 57 | collectives = KnownCollectives(cmd) 58 | validate_output_args, output_handler = add_output_msccl_objects(cmd) 59 | cmd.add_argument('--min-chunks', type=int, default=1, metavar='N') 60 | cmd.add_argument('--max-chunks', type=int, default=None, metavar='N') 61 | cmd.add_argument('--assume-rpc-bound', default=None, help='assume bandwidth optimality requires at least this many rounds per chunk', metavar='N/N') 62 | cmd.add_argument('--no-monotonic-feasibility', action='store_true', help='turn off an unproven assumption about monotonic feasibility of instances') 63 | cmd.add_argument('--save-eagerly', action='store_true', help='save algorithms as soon as they are found, without pruning 
non-Pareto optimal algorithms at the end') 64 | instance_handler = add_instance(cmd, take_steps=False, take_rounds=False) 65 | 66 | def handle(args, command): 67 | if command != name: 68 | return False 69 | 70 | validate_output_args(args) 71 | topology = topologies.create(args) 72 | instance = instance_handler(args) 73 | collective = collectives.create(args, topology.num_nodes()) 74 | assume_rpc_bound = None 75 | if args.assume_rpc_bound: 76 | try: 77 | assume_rpc_bound = parse_fraction(args.assume_rpc_bound) 78 | except ValueError: 79 | cmd.error('could not parse --assume-rpc-bound as a fraction') 80 | algorithms = [] 81 | for algorithm in strategies.solve_all_latency_bandwidth_tradeoffs(topology, collective, args.min_chunks, args.max_chunks, assume_rpc_bound, not args.no_monotonic_feasibility, base_instance=instance, logging=True): 82 | algorithms.append(algorithm) 83 | if args.save_eagerly: 84 | output_handler(args, algorithm, algorithm.name) 85 | if not args.save_eagerly: 86 | efficient_algorithms = strategies.prune_pareto_optimal(algorithms) 87 | print(f'Found {len(efficient_algorithms)} Pareto optimal algorithms. Pruned {len(algorithms) - len(efficient_algorithms)} non-optimal algorithms.') 88 | for algorithm in efficient_algorithms: 89 | output_handler(args, algorithm, algorithm.name) 90 | return True 91 | 92 | return handle 93 | -------------------------------------------------------------------------------- /msccl/collectives.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from abc import ABC, abstractmethod 5 | from dataclasses import dataclass 6 | 7 | @dataclass 8 | class Chunk: 9 | precondition: set 10 | postcondition: set 11 | address: int 12 | 13 | class Collective: 14 | def __init__(self, name, num_nodes, chunks, triggers = {}, runtime_name= 'custom'): 15 | self.name = name 16 | self.num_nodes = num_nodes 17 | self.num_chunks = len(chunks) 18 | self._chunks = chunks 19 | self._triggers = triggers 20 | self.runtime_name = runtime_name 21 | 22 | self.is_combining = False 23 | addresses_seen = set() 24 | for chunk in self._chunks: 25 | if chunk.address in addresses_seen: 26 | self.is_combining = True 27 | addresses_seen.add(chunk.address) 28 | self.num_addresses = len(addresses_seen) 29 | 30 | def ranks(self): 31 | return range(self.num_nodes) 32 | 33 | def chunks(self): 34 | return range(len(self._chunks)) 35 | 36 | def precondition(self, rank, chunk): 37 | return rank in self._chunks[chunk].precondition 38 | 39 | def postcondition(self, rank, chunk): 40 | return rank in self._chunks[chunk].postcondition 41 | 42 | def address(self, chunk): 43 | return self._chunks[chunk].address 44 | 45 | def trigger(self, rank, chunk): 46 | if (rank, chunk) in self._triggers: 47 | return self._triggers[(rank, chunk)] 48 | else: 49 | return None 50 | 51 | def has_triggers(self): 52 | return len(self._triggers) > 0 53 | 54 | def chunk_up(self, div): 55 | if div < 1: 56 | raise ValueError('Divisor must be greater or equal to one (and one is a no-op).') 57 | if div == 1: 58 | return self 59 | 60 | def remap(addr, i): 61 | return addr * div + i 62 | 63 | new_chunks = [] 64 | for chunk in self._chunks: 65 | for i in range(div): 66 | new_chunks.append(Chunk(chunk.precondition, chunk.postcondition, remap(chunk.address, i))) 67 | 68 | name = f'{self.name},chunks={div}' 69 | return Collective(name, self.num_nodes, new_chunks) 70 | 71 | def build_collective(name, num_nodes, 
num_chunks, precondition, postcondition, address = lambda c: c, trigger = lambda r, c: None, runtime_name = 'custom'): 72 | chunks = [] 73 | for chunk in range(num_chunks): 74 | chunk_precondition = set(rank for rank in range(num_nodes) if precondition(rank, chunk)) 75 | chunk_postcondition = set(rank for rank in range(num_nodes) if postcondition(rank, chunk)) 76 | chunk_address = address(chunk) 77 | chunks.append(Chunk(chunk_precondition, chunk_postcondition, chunk_address)) 78 | triggers = {(rank, chunk): trigger(rank, chunk) for rank in range(num_nodes) for chunk in range(num_chunks) if trigger(rank, chunk) != None} 79 | return Collective(name, num_nodes, chunks, triggers, runtime_name) 80 | 81 | # Common pre- and postconditions 82 | def _scattered(num_nodes, chunks = 1): 83 | def cond(rank, chunk): 84 | return rank == (chunk // chunks) % num_nodes 85 | return cond 86 | 87 | def _transpose(num_nodes): 88 | def cond(rank, chunk): 89 | return rank == chunk // num_nodes 90 | return cond 91 | 92 | def _all(rank, chunk): 93 | return True 94 | 95 | def _root(root): 96 | def cond(rank, chunk): 97 | return rank == root 98 | return cond 99 | 100 | # Non-combining collectives 101 | 102 | def broadcast(num_nodes, root): 103 | return build_collective(f'Broadcast(n={num_nodes},root={root})', num_nodes, 1, _root(root), _all) 104 | 105 | def scatter(num_nodes, root): 106 | return build_collective(f'Scatter(n={num_nodes},root={root})', num_nodes, num_nodes, _root(root), _scattered(num_nodes)) 107 | 108 | def gather(num_nodes, root): 109 | return build_collective(f'Gather(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root)) 110 | 111 | def allgather(num_nodes): 112 | return build_collective(f'Allgather(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, runtime_name='allgather') 113 | 114 | def alltoall(num_nodes): 115 | return build_collective(f'Alltoall(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), runtime_name='alltoall') 116 | 117 | # Combining collectives 118 | 119 | # Represents a single buffer to reduce 120 | def _single_scattered(num_nodes): 121 | def address(chunk): 122 | return chunk // num_nodes 123 | return address 124 | 125 | def reduce(num_nodes, root): 126 | return build_collective(f'Reduce(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root), _single_scattered(num_nodes)) 127 | 128 | def allreduce(num_nodes): 129 | return build_collective(f'Allreduce(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, _single_scattered(num_nodes), runtime_name='allreduce') 130 | 131 | def reduce_scatter(num_nodes): 132 | return build_collective(f'ReduceScatter(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), _single_scattered(num_nodes), runtime_name='reduce_scatter') 133 | 134 | def scan(num_nodes): 135 | def postcondition(rank, chunk): 136 | origin = chunk % num_nodes 137 | return rank >= origin 138 | return build_collective(f'Scan(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), postcondition, _single_scattered(num_nodes)) 139 | 140 | # Multi-root generalizations of MPI rooted collectives 141 | # TODO: Add one for reduce. That needs a new addressing function. 
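# Illustrative sketch of the multi-root pattern below: _roots assigns chunk c
# to roots[c % len(roots)], so for example multiroot_broadcast(4, [0, 2]) has
# chunk 0 start on rank 0 and chunk 1 on rank 2, with both chunks required on
# every rank:
#
#   coll = multiroot_broadcast(4, [0, 2])
#   assert coll.precondition(0, 0) and coll.precondition(2, 1)
#   assert all(coll.postcondition(r, c) for r in range(4) for c in range(2))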
142 | 143 | def _roots(roots): 144 | def cond(rank, chunk): 145 | return rank == roots[chunk % len(roots)] 146 | return cond 147 | 148 | def multiroot_broadcast(num_nodes, roots): 149 | return build_collective(f'MultirootBroadcast(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, len(roots), _roots(roots), _all) 150 | 151 | def multiroot_scatter(num_nodes, roots): 152 | return build_collective(f'MultirootScatter(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, num_nodes * len(roots), _roots(roots), _scattered(num_nodes, len(roots))) 153 | 154 | def multiroot_gather(num_nodes, roots): 155 | return build_collective(f'MultirootGather(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, num_nodes * len(roots), _scattered(num_nodes, len(roots)), _roots(roots)) 156 | -------------------------------------------------------------------------------- /msccl/composers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.collectives import allreduce 5 | from msccl.algorithm import * 6 | from msccl.instance import * 7 | 8 | def compose_allreduce(reducescatter_algo, allgather_algo, logging=False): 9 | if reducescatter_algo.is_pipelined() or allgather_algo.is_pipelined(): 10 | raise ValueError('Pipelining is not supported.') 11 | 12 | if reducescatter_algo.instance.chunks != allgather_algo.instance.chunks: 13 | raise ValueError(f'ReduceScatter and Allgather must have the same chunks (got {reducescatter_algo.instance.chunks} and {allgather_algo.instance.chunks})') 14 | 15 | if reducescatter_algo.topology.name != allgather_algo.topology.name: 16 | # TODO: improve this check to check actual structure, not just name 17 | raise ValueError(f'ReduceScatter and Allgather must have the same topology (got {reducescatter_algo.topology.name} and {allgather_algo.topology.name})') 18 | topo = reducescatter_algo.topology 19 | 20 | coll = allreduce(topo.num_nodes()) 21 | 22 | steps = reducescatter_algo.steps + allgather_algo.steps 23 | instance = Instance(len(steps), 24 | extra_rounds=reducescatter_algo.instance.extra_rounds+allgather_algo.instance.extra_rounds, 25 | chunks=reducescatter_algo.instance.chunks) 26 | return Algorithm.make_implementation(coll, topo, instance, steps) -------------------------------------------------------------------------------- /msccl/distributors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .greedy_alltoall import * 5 | from .gather_scatter_alltoall import * 6 | from .alltoall_subproblem import * 7 | -------------------------------------------------------------------------------- /msccl/instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
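# Illustrative usage (a sketch, not part of the original module): Instance is
# a frozen dataclass, so set() returns an updated copy instead of mutating.
#
#   base = Instance(steps=3, extra_rounds=1, chunks=2)
#   assert base.rounds() == 4            # steps + extra_rounds
#   relaxed = base.set(extra_rounds=2)   # new Instance; base is unchanged
#   assert (base.extra_rounds, relaxed.rounds()) == (1, 5)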
3 | 4 | from dataclasses import dataclass 5 | 6 | @dataclass(frozen=True) 7 | class Instance: 8 | steps: int 9 | extra_rounds: int = 0 10 | chunks: int = 1 11 | pipeline: int = None 12 | extra_memory: int = None 13 | allow_exchange: bool = False 14 | 15 | def rounds(self): 16 | return self.steps + self.extra_rounds 17 | 18 | def set(self, steps = None, extra_rounds = None, chunks = None, pipeline = None, extra_memory = None, allow_exchange = None): 19 | return Instance( 20 | steps if steps != None else self.steps, 21 | extra_rounds if extra_rounds != None else self.extra_rounds, 22 | chunks if chunks != None else self.chunks, 23 | pipeline if pipeline != None else self.pipeline, 24 | extra_memory if extra_memory != None else self.extra_memory, 25 | allow_exchange if allow_exchange != None else self.allow_exchange) 26 | 27 | def __str__(self): 28 | s = f'steps={self.steps}' 29 | if self.extra_rounds > 0: 30 | s += f',rounds={self.steps + self.extra_rounds}' 31 | if self.chunks > 1: 32 | s += f',chunks={self.chunks}' 33 | if self.pipeline != None: 34 | s += f',pipeline={self.pipeline}' 35 | if self.extra_memory != None: 36 | s += f',extra_memory={self.extra_memory}' 37 | if self.allow_exchange: 38 | s += f',allow_exchange' 39 | return s 40 | -------------------------------------------------------------------------------- /msccl/isomorphisms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from z3 import * 5 | from dataclasses import dataclass 6 | 7 | @dataclass 8 | class Permutation: 9 | nodes: list 10 | 11 | def __str__(self): 12 | return f'Permutation(nodes={self.nodes})' 13 | 14 | def _pn(node): 15 | return Int(f'perm_node_{node}') 16 | 17 | def _select_node_permutation(s, topology): 18 | # Select a permutation of nodes 19 | for node in topology.nodes(): 20 | s.add(_pn(node) >= 0) 21 | s.add(_pn(node) < topology.num_nodes()) 22 | for prev in range(node): 23 | s.add(_pn(node) != _pn(prev)) 24 | 25 | def _links_constraint(topology, target_topology): 26 | nodes = range(topology.num_nodes()) 27 | 28 | def links_isomorphic(perm_src, perm_dst, link): 29 | # Return a condition on whether the permuted ranks are isomorphic from src to dst wrt. the given link 30 | for src in nodes: 31 | for dst in nodes: 32 | if target_topology.link(src, dst) != link: 33 | yield Not(And(perm_src == src, perm_dst == dst)) 34 | # Require all pairs of nodes to be isomorphic to their permuted counterparts 35 | conditions = [] 36 | for src in nodes: 37 | for dst in nodes: 38 | link = topology.link(src, dst) 39 | conditions.extend(links_isomorphic(_pn(src), _pn(dst), link)) 40 | return And(conditions) 41 | 42 | def _decode_permutation(model, topology): 43 | node_permutation = [model.eval(_pn(node)).as_long() for node in topology.nodes()] 44 | return Permutation(node_permutation) 45 | 46 | def find_isomorphisms(topology, target_topology, limit=None, logging=False): 47 | ''' 48 | Finds all isomorphisms from one topology to a target topology. Returns a list of permutations. 49 | ''' 50 | if len(topology.switches) > 0: 51 | print('MSCCL Warning: Topologies with switches are not supported. 
No isomorphisms will be returned.')
52 |         return []
53 | 
54 |     if limit != None and limit <= 0:
55 |         raise ValueError('MSCCL error: limit was set improperly.')
56 | 
57 |     if topology.num_nodes() != target_topology.num_nodes():
58 |         raise ValueError('MSCCL error: target topology does not match with the given topology.')
59 | 
60 |     if logging:
61 |         print(f'Encoding {topology.name} - {target_topology.name} isomorphisms to Z3')
62 | 
63 |     s = Solver()
64 | 
65 |     _select_node_permutation(s, topology)
66 |     s.add(_links_constraint(topology, target_topology))
67 | 
68 |     if logging:
69 |         print(f'Solving isomorphisms incrementally...')
70 | 
71 |     isomorphisms = []
72 |     while s.check() == sat:
73 |         isomorphism = _decode_permutation(s.model(), topology)
74 |         isomorphisms.append(isomorphism)
75 | 
76 |         if logging:
77 |             print(isomorphism)
78 | 
79 |         if limit != None and len(isomorphisms) >= limit:
80 |             break
81 | 
82 |         # Block this permutation
83 |         assignment = [_pn(node) == perm for node, perm in enumerate(isomorphism.nodes)]
84 |         s.add(Not(And(assignment)))
85 | 
86 |     if logging:
87 |         print(f'{len(isomorphisms)} isomorphisms found.')
88 |     return isomorphisms
89 | 
-------------------------------------------------------------------------------- /msccl/language/buffer.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | # Scratch buffer slice with manual indexing
5 | class BufferSlice:
6 |     def __init__(self, buf, name):
7 |         self.name = name
8 |         self.buf = buf
9 |         self.offset = -1 # Offset into the global scratch buffer
10 |         self.chunks = []
11 | 
12 |     # Returns the global index into the scratch buffer
13 |     def get_global_index(self, index):
14 |         assert (self.offset > -1), 'set_offset needs to be called first'
15 |         return self.offset + index
16 | 
17 |     def get_buffer(self):
18 |         return self.buf
19 | 
20 |     def instance_size(self):
21 |         return len(self.chunks)
22 | 
23 |     def set_offset(self, offset):
24 |         self.offset = offset
25 | 
26 |     def __getitem__(self, index):
27 |         return self.chunks[index]
28 | 
29 |     def __setitem__(self, index, value):
30 |         current_size = len(self.chunks)
31 |         while index > current_size:
32 |             self.chunks.append(None)
33 |             current_size = len(self.chunks)
34 |         if index == current_size:
35 |             self.chunks.append(value)
36 |         else:
37 |             self.chunks[index] = value
38 | 
39 |     def __len__(self):
40 |         return len(self.chunks)
-------------------------------------------------------------------------------- /msccl/language/chunk.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
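# Illustrative sketch of the chunk algebra defined below: reducing two Chunks
# yields a ReduceChunk whose equality ignores the order of reduction.
#
#   a = Chunk(origin_rank=0, origin_index=0)
#   b = Chunk(origin_rank=1, origin_index=0)
#   ab = a.reduce(2, b)   # ReduceChunk created on rank 2
#   ba = b.reduce(2, a)
#   assert ab == ba       # __eq__ sorts the underlying chunk lists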
3 | 
4 | 
5 | from dataclasses import dataclass
6 | from msccl.language.ir import *
7 | 
8 | @dataclass
9 | class Chunk:
10 |     origin_rank: int # Rank the chunk initially started at
11 |     origin_index: int # Index the chunk initially started at
12 |     dst_rank: int = -1
13 |     dst_index: int = -1
14 | 
15 |     def reduce(self, dst, chunk):
16 |         if type(chunk) is ReduceChunk:
17 |             return chunk.reduce(dst, self)
18 |         elif type(chunk) is Chunk:
19 |             chunks = [self, chunk]
20 |             return ReduceChunk(dst, chunks)
21 |         else:
22 |             assert False, "Trying to reduce with chunk of None"
23 |             return None
24 | 
25 |     def __hash__(self):
26 |         return hash((self.origin_rank, self.origin_index))
27 | 
28 |     def __eq__(self, other):
29 |         return type(other) is Chunk and self.origin_rank == other.origin_rank and self.origin_index == other.origin_index
30 | 
31 |     def __lt__(self, other):
32 |         return self.origin_rank < other.origin_rank or \
33 |                (self.origin_rank == other.origin_rank and self.origin_index < other.origin_index)
34 | 
35 | @dataclass
36 | class ReduceChunk:
37 |     creation_rank: int # Rank the ReduceChunk is created on. Necessary since the same ReduceChunk can be created on multiple ranks independently
38 |     chunks: list # List of chunks reduced
39 | 
40 |     def reduce(self, dst, chunk):
41 |         if type(chunk) is ReduceChunk:
42 |             chunks = self.chunks + chunk.chunks
43 |         elif type(chunk) is Chunk:
44 |             chunks = self.chunks + [chunk]
45 |         else:
46 |             assert False, "Trying to reduce with chunk of None"
47 |         return ReduceChunk(self.creation_rank, chunks)
48 | 
49 |     def sort(self):
50 |         self.chunks.sort()
51 | 
52 |     def __hash__(self):
53 |         self.sort()
54 |         return hash((self.creation_rank,) + tuple(self.chunks))
55 | 
56 |     # Two reduce chunks are equal if they contain the same list of
57 |     # chunks being reduced
58 |     def __eq__(self, other):
59 |         self.sort()
60 |         other.sort()
61 |         return self.chunks == other.chunks
62 | 
-------------------------------------------------------------------------------- /msccl/language/passes.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | import sys
5 | from msccl.language.ir import *
6 | 
7 | # Check that there are no cyclic dependencies within a Rank
8 | def check_dependency_cycles(tbs):
9 |     for rank, rank_tbs in enumerate(tbs):
10 |         for tbid, tb in rank_tbs.items():
11 |             for op in tb.ops:
12 |                 deps = op.depends
13 |                 chain = [op]
14 |                 # DFS to check for cycles
15 |                 while len(deps) > 0:
16 |                     dep = deps[0]
17 |                     if dep in chain:
18 |                         print(f"Cyclic dependency in rank {rank} threadblock {tbid} at {op}")
19 |                         for op in chain:
20 |                             print(" ", op)
21 |                         sys.exit(1)
22 |                     next_depends = dep.depends
23 |                     if len(next_depends) > 0:
24 |                         chain.append(dep)
25 |                     else:
26 |                         chain = [op]
27 |                     deps = next_depends + deps[1:]
28 | 
29 | 
30 | # Check there are no ordering violations between threadblocks across ranks
31 | def check_threadblock_ordering(rank_dag):
32 |     for rank in range(rank_dag.num_ranks):
33 |         for tb in rank_dag.tbs[rank].values():
34 |             prev_steps = {} # tbid -> step of last recv from tbid
35 |             # Check that sends and their corresponding receives between two threadblocks
36 |             # happen in the same order.
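            # For example, if threadblock A on rank 0 issues send(x) and then
            # send(y) to threadblock B on rank 1, then B must receive x at an
            # earlier step than y: a matched receive whose step is <= the last
            # matched step from the same threadblock is reported below as an
            # ordering violation.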
37 |             for op_step, op in enumerate(tb.ops):
38 |                 if op.is_send():
39 |                     match = op.recv_match
40 |                     if match.is_recv():
41 |                         assert op.dst.rank == match.rank, f"Bug in MSCCLang: Sends don't match receives"
42 | 
43 |                     other_tbid = match.tb
44 |                     if other_tbid in prev_steps:
45 |                         if match.step <= prev_steps[other_tbid].step:
46 |                             print("Offending Steps", match.step, prev_steps[other_tbid].step)
47 |                             print("Sending tb")
48 |                             for op in tb.ops:
49 |                                 print(f'{op.step}: Recv step: {op.recv_match.step if op.is_send() else -1} {op} priority:{(op.chunk_step, op.priority, op.dst.index)}')
50 |                             print("Receiving tb")
51 |                             for op in rank_dag.tbs[match.rank][other_tbid].ops:
52 |                                 print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority, op.dst.index)}')
53 |                         assert match.step > prev_steps[other_tbid].step, f"Rank {op.rank} sends op1 then op2 but {match.rank} receives op2 then op1"
54 | 
55 |                     prev_steps[other_tbid] = match
56 | 
-------------------------------------------------------------------------------- /msccl/language/routines.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | from msccl.language import *
5 | from msccl.topologies import *
6 | from msccl.language.collectives import *
7 | 
8 | def allgather_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0):
9 |     for rank in range(gpu_offset, gpu_offset+gpus):
10 |         index = index_offset + rank - gpu_offset
11 |         c = chunk(rank, Buffer.input, 0)
12 |         for r_next in range(1, gpus):
13 |             next_rank = (rank + r_next) % gpus + gpu_offset
14 |             c = c.copy(next_rank, Buffer.output, index, ch=ch)
15 | 
16 | def allreduce_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0):
17 |     for rank in range(gpu_offset, gpu_offset+gpus):
18 |         index = index_offset + rank - gpu_offset
19 |         c = chunk(rank, Buffer.input, index)
20 |         # Reduce ring
21 |         for r_next in range(1, gpus):
22 |             next_rank = (rank + r_next) % gpus + gpu_offset
23 |             c = chunk(next_rank, Buffer.input, index).reduce(c, ch=ch)
24 |         # Propagate ring
25 |         for r_next in range(0, gpus-1):
26 |             next_rank = (rank + r_next) % gpus + gpu_offset
27 |             c = c.copy(next_rank, Buffer.input, index, ch=ch)
28 | 
-------------------------------------------------------------------------------- /msccl/language/visualize.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | import igraph as ig
5 | from msccl.language.ir import *
6 | from msccl.language.rank_dag import *
7 | 
8 | def visualize_chunk_dag(chunk_paths): # pragma: no cover
9 |     frontier = []
10 |     nnodes = 0
11 |     vertex_label = []
12 |     vertex_colors = []
13 |     edges = []
14 |     visited = set()
15 | 
16 |     def add_node(op, nnodes, vertex_label, vertex_colors):
17 |         if op.num == -1:
18 |             op.num = nnodes
19 |             nnodes += 1
20 |             if op.inst == ChunkInstruction.start:
21 |                 vertex_label.append(f'Start at {op.dst.rank}, {op.dst.index}.')
22 |                 vertex_colors.append('yellow')
23 |             elif op.inst == ChunkInstruction.send:
24 |                 vertex_label.append(f'Send to Rank {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}')
25 |                 vertex_colors.append('blue')
26 |             elif op.inst == ChunkInstruction.reduce:
27 |                 vertex_label.append(f'Reduce with {op.dst.rank} {op.dst.index}.
{op.steps_to_end}, {op.steps_from_start}') 28 | vertex_colors.append('green') 29 | return nnodes 30 | 31 | for chunk, op in chunk_paths.items(): 32 | if len(op.prev) == 0: 33 | frontier.append(op) 34 | 35 | while len(frontier) > 0: 36 | op = frontier[0] 37 | if op in visited: 38 | frontier = frontier[1:] 39 | else: 40 | nnodes = add_node(op, nnodes, vertex_label, vertex_colors) 41 | for next_op in op.next: 42 | nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) 43 | edges.append([op.num, next_op.num]) 44 | frontier = frontier[1:] + op.next 45 | visited.add(op) 46 | 47 | g = ig.Graph(nnodes, edges, directed=True) 48 | layout = g.layout(layout=ig.Graph.layout_grid) 49 | ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='auto') 50 | 51 | def visualize_rank_dag(operations): # pragma: no cover 52 | frontier = [] 53 | nnodes = 0 54 | vertex_label = [] 55 | vertex_colors = [] 56 | edges = [] 57 | visited = set() 58 | colors = ['red', 'green', 'blue', 'yellow', 'teal', 'pink', 'purple', 'orange'] 59 | 60 | def add_node(op, nnodes, vertex_label, vertex_colors): 61 | if op.num == -1: 62 | op.num = nnodes 63 | nnodes += 1 64 | # Add new node to graph 65 | if op.inst == Instruction.start: 66 | vertex_label.append(f'Chunk {op.src.index} Rank {op.src.rank}') 67 | elif op.inst == Instruction.send: 68 | vertex_label.append(f'S to Rank {op.dst.rank}') 69 | elif op.inst == Instruction.recv: 70 | vertex_label.append(f'R from {op.src.rank}') 71 | elif op.inst == Instruction.recv_reduce_copy: 72 | vertex_label.append(f'RRC from {op.src.rank}') 73 | else: 74 | vertex_label.append(f'{op.inst}') 75 | 76 | # Add colors 77 | if op.inst == Instruction.start: 78 | vertex_colors.append('gray') 79 | else: 80 | vertex_colors.append(colors[op.tb % len(colors)]) 81 | return nnodes 82 | 83 | for slot, op in operations.items(): 84 | if len(op.prev) == 0: 85 | frontier.append(op) 86 | 87 | while len(frontier) > 0: 88 | op = frontier[0] 89 | 90 | if op in visited: 91 | frontier = frontier[1:] 92 | else: 93 | nnodes = add_node(op, nnodes, vertex_label, vertex_colors) 94 | 95 | for next_op in op.next: 96 | nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) 97 | edges.append([op.num, next_op.num]) 98 | frontier = frontier[1:] + list(op.next) 99 | visited.add(op) 100 | 101 | g = ig.Graph(nnodes, edges, directed=True) 102 | layout = g.layout(layout=ig.Graph.layout_grid) 103 | ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='rt') -------------------------------------------------------------------------------- /msccl/ncd_reduction.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
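# Illustrative sketch of the reduction implemented below: the non-combining
# dual swaps pre- and postconditions per address and is solved on the reverse
# topology. For example, Reduce(n, root) has each rank holding one chunk, all
# chunks sharing address 0, and the root holding the combined result; its
# dual is therefore a broadcast-like collective rooted at `root`:
#
#   from msccl.collectives import reduce
#   dual = non_combining_dual(reduce(4, 0))
#   assert dual.precondition(0, 0)                          # root sources address 0
#   assert all(dual.postcondition(r, 0) for r in range(4))  # everyone must get it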
3 | 4 | from msccl.collectives import * 5 | from msccl.topologies import reverse_topology 6 | from msccl.algorithm import Algorithm, Step 7 | from collections import defaultdict 8 | 9 | class ReductionNotApplicableError(ValueError): 10 | pass 11 | 12 | def non_combining_dual(primal): 13 | if not primal.is_combining: 14 | raise ReductionNotApplicableError('The collective is already non-combining.') 15 | 16 | if primal.has_triggers(): 17 | raise ReductionNotApplicableError('The collective has triggers.') 18 | 19 | dual_precondition = defaultdict(set) 20 | dual_postcondition = defaultdict(set) 21 | 22 | addresses = set() 23 | for chunk in primal.chunks(): 24 | addr = primal.address(chunk) 25 | addresses.add(addr) 26 | for rank in primal.ranks(): 27 | if primal.postcondition(rank, chunk): 28 | dual_precondition[addr].add(rank) 29 | if primal.precondition(rank, chunk): 30 | dual_postcondition[addr].add(rank) 31 | for addr in dual_precondition: 32 | if len(dual_precondition[addr]) > 1: 33 | raise ReductionNotApplicableError('The non-combining reduction is only applicable to collectives with a unique root per address.') 34 | 35 | return build_collective(f'Dual{primal.name}', primal.num_nodes, len(addresses), 36 | lambda r, c: r in dual_precondition[c], 37 | lambda r, c: r in dual_postcondition[c]) 38 | 39 | def recover_primal_algorithm(dual_algorithm, primal, original_topology, instance): 40 | primal_steps = [] 41 | for step in reversed(dual_algorithm.steps): 42 | primal_sends = [(chunk, dst, src) for chunk, src, dst in step.sends] 43 | primal_steps.append(Step(step.rounds, primal_sends)) 44 | return Algorithm.make_implementation(primal, original_topology, instance, primal_steps) 45 | 46 | def wrap_try_ncd_reduction(solver_cls): 47 | class NonCombiningReductionWrapper(solver_cls): 48 | def __init__(self, topology, collective): 49 | self.primal = collective 50 | try: 51 | # Create the dual collective 52 | self.dual = non_combining_dual(collective) 53 | collective = self.dual 54 | 55 | # Solve the dual in the reverse topology 56 | self.original_topology = topology 57 | topology = reverse_topology(topology) 58 | except ReductionNotApplicableError: 59 | self.dual = None 60 | super().__init__(topology, collective) 61 | 62 | def solve(self, instance): 63 | algo = super().solve(instance) 64 | if self.dual != None and algo != None: 65 | return recover_primal_algorithm(algo, self.primal, self.original_topology, instance) 66 | else: 67 | return algo 68 | 69 | return NonCombiningReductionWrapper 70 | -------------------------------------------------------------------------------- /msccl/programs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /msccl/programs/allreduce_a100_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.language import * 5 | 6 | # Ring all reduce for A100s 7 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
8 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel
9 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data
10 | def allreduce_ring(size, channels):
11 |     # Reduce ring
12 |     for step in range(0, size-1):
13 |         for index in range(0, size):
14 |             rank = (index + step) % size
15 |             next_rank = (index + step + 1) % size
16 |             channel = index % channels
17 |             c = chunk(next_rank, Buffer.input, index)
18 |             c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel)
19 |     # Propagate ring
20 |     for step in range(-1, size-2):
21 |         for index in range(0, size):
22 |             rank = (index + step) % size
23 |             c = chunk(rank, Buffer.input, index)
24 |             next_rank = (index + step + 1) % size
25 |             channel = index % channels
26 |             c = c.copy(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel)
-------------------------------------------------------------------------------- /msccl/programs/allreduce_allpairs.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | from msccl.language import *
5 | 
6 | def allreduce_allpairs(size):
7 |     # Each rank sends the nth chunk to the nth rank into scratch space
8 |     for r1 in range(size):
9 |         for r2 in range(size):
10 |             if r1 != r2:
11 |                 index = r2 * size
12 |                 c = chunk(r1, Buffer.input, index, size=size)
13 |                 c.copy(r2, 'scratch', sendtb=r2, recvtb=r1)
14 | 
15 |     # Each rank performs a local reduction on the nth chunk
16 |     # Utilize 8 threadblocks for this reduction for better parallelism
17 |     for r in range(size):
18 |         for index in range(0, size * (size-1)):
19 |             c = chunk(r, Buffer.input, r*size + (index % size))
20 |             c.reduce(chunk(r, 'scratch', index), sendtb=(index % size))
21 | 
22 |     # Each rank sends the fully reduced nth chunk to all other gpus
23 |     for r1 in range(size):
24 |         for r2 in range(size):
25 |             if r1 != r2:
26 |                 index = r1 * size
27 |                 c = chunk(r1, Buffer.input, index, size)
28 |                 c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1)
29 | 
30 | 
-------------------------------------------------------------------------------- /msccl/programs/alltoall_a100_8kp1.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | from msccl.language import *
5 | 
6 | def alltoall_three_step(num_nodes, gpus_per_node, instances=1, ib_connections=1):
7 |     num_ranks = num_nodes * gpus_per_node
8 | 
9 |     # (node, local gpu) to rank
10 |     # (n, g) => r
11 |     def RankFromNodeGpuPair(n, g):
12 |         return n*gpus_per_node + g
13 | 
14 |     # For cross node traffic from node n1 to node n2, returns the ranks of the
15 |     # gpus on n1 and n2 that handle that traffic.
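    # For instance (illustrative): with gpus_per_node=8, CrossNodeGpus(0, 1)
    # returns (0, 8), i.e. gpu 0 on node 0 gathers the traffic bound for
    # node 1 and gpu 0 on node 1 scatters it; symmetrically, CrossNodeGpus(1, 0)
    # returns (8, 0).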
16 |     def CrossNodeGpus(n1, n2):
17 |         def LocalRank(n1, n2):
18 |             return (n2 if n1 > n2 else n2-1) % gpus_per_node
19 |         r1 = RankFromNodeGpuPair(n1, LocalRank(n1, n2))
20 |         r2 = RankFromNodeGpuPair(n2, LocalRank(n2, n1))
21 |         return (r1, r2)
22 | 
23 |     # Groups chunk references into one large chunk reference (used for IB)
24 |     # Save them under a key in the dictionary ib_chunks
25 |     def AddChunk(ib_chunks, key, c):
26 |         if key in ib_chunks:
27 |             ib_chunks[key] = ib_chunks[key].group(c)
28 |         else:
29 |             ib_chunks[key] = c
30 | 
31 |     ib_chunks = {} # Keeps track of chunks going over IB: buffer name -> chunk
32 |     for n1 in range(num_nodes):
33 |         for g1 in range(gpus_per_node):
34 |             for ch in range(instances):
35 |                 for n2 in range(num_nodes):
36 |                     r1 = RankFromNodeGpuPair(n1, g1)
37 |                     if (n1 != n2):
38 |                         # Send over all chunks destined for that node to the peer gpu that handles chunks to that node
39 |                         c = chunk(r1, Buffer.input, n2 * gpus_per_node * instances + ch * gpus_per_node, gpus_per_node)
40 |                         # Gather chunks destined for cross node ranks in scratch to route through IB
41 |                         gather_rank, _ = CrossNodeGpus(n1, n2)
42 |                         buffer_key = (n1, n2)
43 |                         # Send chunk to the gather_rank. Send returns a chunk reference to the
44 |                         # receiver's chunk
45 |                         c = c.copy(gather_rank, buffer=buffer_key, ch=ch*2)
46 |                         # Group the chunks using a particular IB pair into one large chunk reference
47 |                         AddChunk(ib_chunks, buffer_key, c)
48 |                     else:
49 |                         # Within a node - directly copy the chunks over nvlink to the output buffer.
50 |                         # Use a different channel to ensure that we don't get in the way of copies/receives above,
51 |                         # which are on the critical path.
52 |                         for g2 in range(gpus_per_node):
53 |                             r2 = RankFromNodeGpuPair(n2, g2)
54 |                             c = chunk(r1, Buffer.input, r2 * instances + ch)
55 |                             c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2)
56 | 
57 | 
58 | 
59 |     # IB Send and local scatters
60 |     for buffer_key, ib_chunk in ib_chunks.items():
61 |         (n1, n2) = buffer_key
62 |         _, scatter_rank = CrossNodeGpus(n1, n2)
63 |         # IB copy divided across multiple parallel channels
64 |         chunks = ib_chunk.split(ib_connections)
65 |         for ch, c in enumerate(chunks):
66 |             # Note: If we are only going to use 1 IB connection for each IB copy,
67 |             # alternate between channels 0 and 1 to utilize both IB links.
68 |             if ib_connections == 1:
69 |                 ib_channel = c.rank % 2
70 |             else:
71 |                 ib_channel = ch
72 |             c = c.copy(scatter_rank, buffer=buffer_key, ch=ib_channel)
73 |             # Local scatter
74 |             cs = c.split(gpus_per_node * gpus_per_node)
75 |             for i, c in enumerate(cs):
76 |                 # Access the chunk's destination rank and index to route it to its final place
77 |                 final_rank = c.get_dst_rank()
78 |                 index = c.get_dst_index()
79 |                 c.copy(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1)
-------------------------------------------------------------------------------- /msccl/programs/alltoall_a100_yifan.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
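# Illustrative sketch of the addressing used below: ranks are laid out as
# rank = node * gpus_per_node + gpu, and for alltoall the input chunk at
# index n2 * gpus_per_node + g2 is destined for gpu g2 on node n2 (i.e. the
# chunk index equals the destination rank). With num_nodes=2 and
# gpus_per_node=2, chunk 3 on rank 0 ends up on rank 3 (node 1, gpu 1).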
3 | 4 | from msccl.language import * 5 | 6 | def alltoall_hierarchical(num_nodes, gpus_per_node): 7 | num_ranks = num_nodes * gpus_per_node 8 | 9 | for n1 in range(num_nodes): 10 | for r in range(1,num_nodes): 11 | n2 = (n1 + r) % num_nodes 12 | 13 | # Gather all local chunks for the node neighbor 14 | for g1 in range(gpus_per_node): 15 | rank1 = n1 * gpus_per_node + g1 16 | 17 | for g2 in range(gpus_per_node): 18 | rank2 = n1 * gpus_per_node + g2 19 | # chunk to copy: g2 on n2 20 | index = n2 * gpus_per_node + g2 21 | c = chunk(rank1, Buffer.input, index) 22 | c = c.copy(rank2, f'copy_{n2}') 23 | 24 | for r in range(1,num_nodes): 25 | n2 = (n1 + r) % num_nodes 26 | # IB copy 27 | for g1 in range(gpus_per_node): 28 | rank = n1 * gpus_per_node + g1 29 | ib_peer = n2 * gpus_per_node + g1 30 | c = chunk(rank, f'copy_{n2}', 0, gpus_per_node) 31 | c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) 32 | 33 | 34 | # Handle local chunks within a node 35 | for rank in range(num_ranks): 36 | for g in range(gpus_per_node): 37 | index = (rank // gpus_per_node) * gpus_per_node + g 38 | c = chunk(rank, Buffer.input, index) 39 | c.copy(c.get_dst_rank(), Buffer.output, c.get_dst_index()) 40 | -------------------------------------------------------------------------------- /msccl/rounds_bound.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.ncd_reduction import non_combining_dual 5 | from msccl.topologies import reverse_topology 6 | from z3 import * 7 | from fractions import Fraction 8 | 9 | def _flow(chunk, src, dst): 10 | return Real(f'flow_{chunk}_from_{src}_to_{dst}') 11 | 12 | def lower_bound_rounds(topology, collective, logging=False): 13 | ''' 14 | Solves for a lower bound on the number of rounds required by any algorithm. Uses a multi-commodity flow feasibility inspired encoding in Z3.
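    A minimal usage sketch (assuming the z3-solver dependency is installed):

        from msccl.topologies import ring
        from msccl.collectives import allgather
        rounds_lb = lower_bound_rounds(ring(4), allgather(4), logging=True)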
15 | ''' 16 | 17 | opt = Optimize() 18 | 19 | # Remember names before possible non-combining dual reduction 20 | collective_name = collective.name 21 | topology_name = topology.name 22 | 23 | # Use non-combining dual if necessary 24 | if collective.is_combining: 25 | collective = non_combining_dual(collective) 26 | topology = reverse_topology(topology) 27 | 28 | chunks = collective.chunks() 29 | ranks = collective.ranks() 30 | 31 | for chunk in chunks: 32 | for rank in ranks: 33 | # All flows are between 0 and 1 34 | for dst in topology.destinations(rank): 35 | opt.add(_flow(chunk,rank,dst) >= 0) 36 | opt.add(_flow(chunk,rank,dst) <= 1) 37 | total_in = sum(_flow(chunk,src,rank) for src in topology.sources(rank)) 38 | if not collective.precondition(rank, chunk): 39 | # Ranks not in the precondition need to justify outflows 40 | for dst in topology.destinations(rank): 41 | opt.add(_flow(chunk,rank,dst) <= total_in) 42 | # Ranks in the postcondition, but not in the precondition need the whole chunk 43 | if collective.postcondition(rank, chunk): 44 | opt.add(total_in == 1) 45 | 46 | # Represents how many rounds all the steps of the algorithm would use 47 | rounds = Real(f'rounds') 48 | 49 | for srcs, dsts, bw, _ in topology.bandwidth_constraints(): 50 | # Sum of all flows relevant to this constraint 51 | sum_flow = sum(_flow(chunk,src,dst) for src in srcs for dst in dsts for chunk in chunks) 52 | # Total flow must be less than the limit, taking rounds into consideration 53 | opt.add(sum_flow <= bw * rounds) 54 | 55 | # Minimize the number of rounds 56 | min_rounds = opt.minimize(rounds) 57 | result = opt.check() 58 | if result == sat: 59 | bound_ref = opt.lower(min_rounds) 60 | if isinstance(bound_ref, IntNumRef): 61 | rounds_lb = Fraction(bound_ref.as_long(), 1) 62 | elif isinstance(bound_ref, RatNumRef): 63 | rounds_lb = bound_ref.as_fraction() 64 | else: 65 | raise RuntimeError(f'Unhandled Z3 numeral type: {type(bound_ref)}') 66 | if logging: 67 | print(f'{collective_name} algorithms need at least {rounds_lb} rounds in {topology_name} topology.') 68 | return rounds_lb 69 | else: 70 | if logging: 71 | if result == unsat: 72 | print(f'Unsat. {collective_name} is not implementable in {topology_name} topology.') 73 | else: 74 | assert result == unknown, 'Unhandled Z3 result' 75 | print('Unknown. Z3 was not able to solve the lower bound.') 76 | return None 77 | -------------------------------------------------------------------------------- /msccl/serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
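A round-trip sketch for the save_msccl_object/load_msccl_object helpers defined at the bottom of this file, mirroring tests/test_serialization.py (the collective and filename here are illustrative):

    from msccl.algorithm import Algorithm, Step
    from msccl.collectives import build_collective
    from msccl.topologies import fully_connected
    from msccl.instance import Instance
    from msccl.serialization import save_msccl_object, load_msccl_object

    # A trivial collective: every rank starts with the chunk, nothing is required
    coll = build_collective('Null(n=2)', 2, 1, lambda r, c: True, lambda r, c: False)
    algo = Algorithm('example', coll, fully_connected(2), Instance(3), [Step(1, [(0, 0, 1)])])
    save_msccl_object(algo, 'example.msccl.json')
    assert load_msccl_object('example.msccl.json').name == 'example'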
3 | 4 | from msccl.algorithm import Algorithm, Step 5 | from msccl.topologies import Topology 6 | from msccl.instance import Instance 7 | from msccl.collectives import Collective, Chunk 8 | 9 | import json 10 | import warnings 11 | 12 | def _msccl_object_hook(o): 13 | if not 'msccl_type' in o: 14 | return o 15 | if o['msccl_type'] == 'algorithm': 16 | input_map = { int(k): set(v) for k, v in o['input_map'].items() } 17 | output_map = { int(k): set(v) for k, v in o['output_map'].items() } 18 | return Algorithm(o['name'], o['collective'], o['topology'], o['instance'], o['steps'], input_map, output_map) 19 | if o['msccl_type'] == 'step': 20 | sends = [(addr, src, dst) for addr, src, dst in o['sends']] 21 | return Step(o['rounds'], sends) 22 | if o['msccl_type'] == 'collective': 23 | triggers = { (int(r), int(c)): v for r, rmap in o['triggers'].items() for c, v in rmap.items() } 24 | return Collective(o['name'], o['nodes'], o['chunks'], triggers, o['runtime_name']) 25 | if o['msccl_type'] == 'chunk': 26 | pre = set(o['pre']) 27 | post = set(o['post']) 28 | return Chunk(pre, post, o['addr']) 29 | if o['msccl_type'] == 'topology': 30 | return Topology(o['name'], o['links'], o['switches']) 31 | if o['msccl_type'] == 'instance': 32 | return Instance(o['steps'], o['extra_rounds'], o['chunks'], o['pipeline'], o['extra_memory'], o['allow_exchange']) 33 | warnings.warn('Unhandled msccl_type in JSON') 34 | 35 | def MSCCLDecoder(): 36 | return json.JSONDecoder(object_hook=_msccl_object_hook) 37 | 38 | class MSCCLEncoder(json.JSONEncoder): 39 | def __init__(self): 40 | super().__init__() 41 | 42 | def default(self, o): 43 | if isinstance(o, Algorithm): 44 | input_map = { k: list(v) for k, v in o.input_map.items() } 45 | output_map = { k: list(v) for k, v in o.output_map.items() } 46 | return { 47 | 'msccl_type': 'algorithm', 48 | 'name': o.name, 49 | 'instance': o.instance, 50 | 'input_map': input_map, 51 | 'output_map': output_map, 52 | 'steps': o.steps, 53 | 'collective': o.collective, 54 | 'topology': o.topology, 55 | } 56 | if isinstance(o, Step): 57 | return { 58 | 'msccl_type': 'step', 59 | 'rounds': o.rounds, 60 | 'sends': o.sends, 61 | } 62 | if isinstance(o, Collective): 63 | triggers = {} 64 | for (r, c), v in o._triggers.items(): 65 | if not r in triggers: 66 | triggers[r] = {} 67 | triggers[r][c] = v 68 | return { 69 | 'msccl_type': 'collective', 70 | 'name': o.name, 71 | 'nodes': o.num_nodes, 72 | 'chunks': o._chunks, 73 | 'triggers': triggers, 74 | 'runtime_name': o.runtime_name, 75 | } 76 | if isinstance(o, Chunk): 77 | return { 78 | 'msccl_type': 'chunk', 79 | 'pre': list(o.precondition), 80 | 'post': list(o.postcondition), 81 | 'addr': o.address, 82 | } 83 | if isinstance(o, Topology): 84 | return { 85 | 'msccl_type': 'topology', 86 | 'name': o.name, 87 | 'switches': o.switches, 88 | 'links': o.links, 89 | } 90 | if isinstance(o, Instance): 91 | return { 92 | 'msccl_type': 'instance', 93 | 'steps': o.steps, 94 | 'extra_rounds': o.extra_rounds, 95 | 'chunks': o.chunks, 96 | 'pipeline': o.pipeline, 97 | 'extra_memory': o.extra_memory, 98 | 'allow_exchange': o.allow_exchange, 99 | } 100 | return json.JSONEncoder.default(self, o) 101 | 102 | def save_msccl_object(obj, filename): 103 | with open(filename, 'w') as f: 104 | f.write(MSCCLEncoder().encode(obj)) 105 | 106 | def load_msccl_object(filename): 107 | with open(filename) as f: 108 | return MSCCLDecoder().decode(f.read()) 109 | -------------------------------------------------------------------------------- /msccl/steps_bound.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import math 5 | 6 | def _distances(topology): 7 | # Floyd–Warshall algorithm for all-pairs shortest paths 8 | nodes = range(topology.num_nodes()) 9 | dist = [[math.inf for _ in nodes] for _ in nodes] 10 | for dst in nodes: 11 | for src in topology.sources(dst): 12 | dist[src][dst] = 1 13 | for node in nodes: 14 | dist[node][node] = 0 15 | for k in nodes: 16 | for i in nodes: 17 | for j in nodes: 18 | if dist[i][j] > dist[i][k] + dist[k][j]: 19 | dist[i][j] = dist[i][k] + dist[k][j] 20 | return dist 21 | 22 | def lower_bound_steps(topology, collective): 23 | ''' Finds a lower bound for the steps required as the maximum, over all chunks and their destinations, of the shortest distance from a rank holding the chunk in the precondition. ''' 24 | 25 | dist = _distances(topology) 26 | 27 | # Find the maximum of the least steps required for each chunk 28 | least_steps = 0 29 | for chunk in collective.chunks(): 30 | for dst in collective.ranks(): 31 | if collective.postcondition(dst, chunk): 32 | # Find the shortest distance from some rank in the precondition 33 | least_distance = math.inf 34 | for src in collective.ranks(): 35 | if collective.precondition(src, chunk): 36 | least_distance = min(least_distance, dist[src][dst]) 37 | # Update the least steps required if even the nearest precondition rank is farther away 38 | least_steps = max(least_steps, least_distance) 39 | 40 | if least_steps == math.inf: 41 | # Return None if the collective is unimplementable with any number of steps 42 | return None 43 | else: 44 | return least_steps 45 | -------------------------------------------------------------------------------- /msccl/strategies.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.instance import Instance 5 | from msccl.path_encoding import PathEncoding 6 | from msccl.rounds_bound import lower_bound_rounds 7 | from msccl.steps_bound import lower_bound_steps 8 | 9 | import time 10 | import math 11 | from fractions import Fraction 12 | import itertools 13 | from collections import defaultdict 14 | 15 | def _solve_and_log(encoding, instance, logging): 16 | if logging: 17 | print(f'Solving instance {instance}... ', end='', flush=True) 18 | 19 | start_time = time.time() 20 | result = encoding.solve(instance) 21 | duration = time.time() - start_time 22 | 23 | if logging: 24 | if result != None: 25 | print(f'synthesized! ({duration:.1f}s)') 26 | else: 27 | print(f'unsatisfiable.
({duration:.1f}s)') 28 | 29 | return result 30 | 31 | def solve_instance(topology, collective, instance, logging = False): 32 | encoding = PathEncoding(topology, collective) 33 | return _solve_and_log(encoding, instance, logging) 34 | 35 | def solve_least_steps(topology, collective, initial_steps = 1, base_instance = Instance(None), logging = False): 36 | if initial_steps < 1: 37 | raise ValueError('initial_steps must be strictly positive') 38 | 39 | encoding = PathEncoding(topology, collective) 40 | 41 | # Lower bound the number of steps required 42 | steps_lb = lower_bound_steps(topology, collective) 43 | if steps_lb == None: 44 | # No number of steps suffices, so fail regardless of the logging setting 45 | raise ValueError('The collective is unimplementable in this topology.') 46 | if logging: 47 | print(f'Algorithms need at least {steps_lb} steps.') 48 | 49 | num_steps = max(initial_steps, steps_lb) 50 | if num_steps > steps_lb: 51 | result = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) 52 | if result != None: 53 | if logging: 54 | print('Synthesized on initial guess. Checking for fewer steps.') 55 | while num_steps > steps_lb: 56 | num_steps -= 1 57 | maybe_better = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) 58 | if maybe_better != None: 59 | result = maybe_better 60 | else: 61 | break 62 | return result 63 | else: 64 | num_steps += 1 65 | 66 | while True: 67 | result = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) 68 | if result != None: 69 | return result 70 | else: 71 | num_steps += 1 72 | 73 | def solve_all_latency_bandwidth_tradeoffs(topology, collective, min_chunks = 1, max_chunks = None, assume_rounds_per_chunk_lb = None, assume_monotonic_feasibility = False, base_instance = Instance(None), logging = False): 74 | if min_chunks < 1: 75 | raise ValueError('min_chunks must be strictly positive.') 76 | if max_chunks != None and max_chunks < min_chunks: 77 | raise ValueError('max_chunks must be greater than or equal to min_chunks.') 78 | if assume_rounds_per_chunk_lb != None and assume_rounds_per_chunk_lb < 0: 79 | raise ValueError('assume_rounds_per_chunk_lb must be non-negative.') 80 | 81 | # Lower bound the number of steps required 82 | steps_lb = lower_bound_steps(topology, collective) 83 | if logging: 84 | print(f'Algorithms need at least {steps_lb} steps.') 85 | 86 | # Lower bound the number of rounds per unit of chunkiness required 87 | if assume_rounds_per_chunk_lb != None: 88 | rounds_per_chunk_lb = assume_rounds_per_chunk_lb 89 | if logging: 90 | print(f'Assuming algorithms need at least {rounds_per_chunk_lb} rounds per chunk.') 91 | else: 92 | rounds_per_chunk_lb = lower_bound_rounds(topology, collective) 93 | if logging: 94 | print(f'Algorithms need at least {rounds_per_chunk_lb} rounds per chunk.') 95 | 96 | # Remember for which rounds per chunk fraction a given number of steps will be unsat 97 | step_rpc_lb = defaultdict(lambda: Fraction(0)) 98 | 99 | chunks_iter = range(min_chunks, max_chunks+1) if max_chunks != None else itertools.count(min_chunks) 100 | 101 | # Algorithms are yielded lazily as they are found 102 | for chunks in chunks_iter: 103 | encoding = PathEncoding(topology, collective) 104 | rounds_lb = math.ceil(rounds_per_chunk_lb * chunks) 105 | 106 | rounds = rounds_lb - 1 107 | found = False 108 | while not found: 109 | rounds += 1 110 | rpc = Fraction(rounds, chunks) 111 | # Skip this fraction if a lower number of chunks will have already considered it 112 | if math.gcd(chunks, rounds) != 1: 113 | continue 114 | for steps in range(steps_lb, rounds+1): 115 | # Skip this number of
steps if a previous instance with stricter rounds per chunk already failed 116 | if assume_monotonic_feasibility and rpc < step_rpc_lb[steps]: 117 | continue 118 | instance = base_instance.set(steps=steps, extra_rounds=rounds - steps, chunks=chunks) 119 | result = _solve_and_log(encoding, instance, logging=logging) 120 | if result != None: 121 | assert rpc >= step_rpc_lb[steps], 'Monotonic feasibility assumption would have been violated.' 122 | found = True 123 | yield result 124 | break 125 | else: 126 | # Update the rounds per chunk for which this number of steps is not sufficient 127 | step_rpc_lb[steps] = max(step_rpc_lb[steps], rpc) 128 | if logging and assume_monotonic_feasibility: 129 | print(f'Assuming {steps} step algorithms need at least {rpc} rounds per chunk.') 130 | # Check if a bandwidth optimal algorithm has been found 131 | if found and rpc <= rounds_per_chunk_lb: 132 | assert rpc == rounds_per_chunk_lb, 'Rounds per chunk lower bound did not hold.' 133 | if logging: 134 | print(f'Bandwidth optimal algorithm found!') 135 | break 136 | else: 137 | if logging: 138 | print(f'Reached the limit for chunks.') 139 | 140 | def _steps(algo): 141 | return len(algo.steps) 142 | 143 | def _rpc(algo): 144 | return Fraction(_steps(algo) + algo.extra_rounds(), algo.instance.chunks) 145 | 146 | def prune_pareto_optimal(algorithms): 147 | efficient_algorithms = [] 148 | for i, algo in enumerate(algorithms): 149 | is_efficient = True 150 | for j, other in enumerate(algorithms): 151 | either_worse = _steps(algo) > _steps(other) or _rpc(algo) > _rpc(other) 152 | neither_better = _steps(algo) >= _steps(other) and _rpc(algo) >= _rpc(other) 153 | if either_worse and neither_better: 154 | is_efficient = False 155 | break 156 | if is_efficient: 157 | efficient_algorithms.append(algo) 158 | 159 | return efficient_algorithms 160 | -------------------------------------------------------------------------------- /msccl/topologies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .generic import * 5 | from .transformers import * 6 | from .amd import * 7 | from .nvidia import * 8 | from .distributed import * 9 | -------------------------------------------------------------------------------- /msccl/topologies/amd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .topology import Topology 5 | 6 | def amd4(): 7 | links = [ 8 | [0, 1, 1, 0], 9 | [1, 0, 0, 1], 10 | [1, 0, 0, 1], 11 | [0, 1, 1, 0] 12 | ] 13 | return Topology('AMD4', links) 14 | 15 | def amd8(): 16 | links = [ 17 | [0, 5, 6, 6, 5, 6, 5, 5], 18 | [5, 0, 5, 5, 6, 5, 6, 6], 19 | [6, 5, 0, 6, 5, 6, 5, 5], 20 | [6, 5, 6, 0, 5, 6, 5, 5], 21 | [5, 6, 5, 5, 0, 5, 6, 6], 22 | [6, 5, 6, 6, 5, 0, 5, 5], 23 | [5, 6, 5, 5, 6, 5, 0, 6], 24 | [5, 6, 5, 5, 6, 5, 6, 0] 25 | ] 26 | return Topology('AMD8', links) 27 | -------------------------------------------------------------------------------- /msccl/topologies/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
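A sketch of consuming the tradeoff sweep from msccl/strategies.py above: enumerate algorithms up to a chunk limit, then keep only the Pareto frontier (assumes z3-solver is installed; the sweep can take a while on larger topologies):

    from msccl.topologies import fully_connected
    from msccl.collectives import allgather
    from msccl.strategies import solve_all_latency_bandwidth_tradeoffs, prune_pareto_optimal

    topo = fully_connected(4)
    algos = list(solve_all_latency_bandwidth_tradeoffs(topo, allgather(4), max_chunks=4, logging=True))
    for algo in prune_pareto_optimal(algos):
        print(algo.name)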
3 | 4 | from .topology import Topology 5 | 6 | def _copy_links(remote_bw, num_local, num_dist, local_links): 7 | return [[remote_bw if src // num_local != dst // num_local else local_links[dst % num_local][src % num_local] 8 | for src in range(num_dist)] for dst in range(num_dist)] 9 | 10 | def _copy_switches(num_local, num_copies, local_switches): 11 | switches = [] 12 | for srcs, dsts, bw, name in local_switches: 13 | for i in range(num_copies): 14 | dist_srcs = [src + i * num_local for src in srcs] 15 | dist_dsts = [dst + i * num_local for dst in dsts] 16 | switches.append((dist_srcs, dist_dsts, bw, f'copy_{i}_{name}_local')) 17 | return switches 18 | 19 | def distributed_fully_connected(local_topology, num_copies, remote_bw): 20 | num_local = local_topology.num_nodes() 21 | num_dist = num_local * num_copies 22 | 23 | links = _copy_links(remote_bw, num_local, num_dist, local_topology.links) 24 | switches = _copy_switches(num_local, num_copies, local_topology.switches) 25 | 26 | return Topology(f'DistributedFullyConnected(local={local_topology.name},copies={num_copies},bw={remote_bw})', links, switches) 27 | 28 | def distributed_hub_and_spoke(local_topology, num_copies, remote_bw): 29 | num_local = local_topology.num_nodes() 30 | num_dist = num_local * num_copies 31 | 32 | links = _copy_links(remote_bw, num_local, num_dist, local_topology.links) 33 | switches = _copy_switches(num_local, num_copies, local_topology.switches) 34 | 35 | for i in range(num_copies): 36 | local_ranks = [j + i * num_local for j in range(num_local)] 37 | remote_ranks = [k for k in range(num_dist) if k // num_local != i] 38 | switches.append((local_ranks, remote_ranks, remote_bw, f'copy_{i}_out_remote')) 39 | switches.append((remote_ranks, local_ranks, remote_bw, f'copy_{i}_in_remote')) 40 | 41 | return Topology(f'DistributedHubAndSpoke(local={local_topology.name},copies={num_copies},bw={remote_bw})', links, switches) 42 | -------------------------------------------------------------------------------- /msccl/topologies/generic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
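A small sketch of the distributed constructors above, as exercised in tests/test_topologies.py:

    from msccl.topologies import ring, distributed_fully_connected, distributed_hub_and_spoke

    local = ring(4)
    dist = distributed_fully_connected(local, num_copies=2, remote_bw=1)
    assert dist.num_nodes() == 8
    # Hub-and-spoke adds two switch constraints (in and out) per copy for remote traffic
    hub = distributed_hub_and_spoke(local, 2, 1)
    assert len(hub.switches) == len(dist.switches) + 4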
3 | 4 | from .topology import Topology 5 | 6 | def hub_and_spoke(num_nodes): 7 | links = [[0 if x==y else 1 for y in range(num_nodes)] for x in range(num_nodes)] 8 | switches = [] 9 | for node in range(num_nodes): 10 | others = [other for other in range(num_nodes) if other != node] 11 | switches.append(([node],others,1,f'node_{node}_out')) 12 | switches.append((others,[node],1,f'node_{node}_in')) 13 | return Topology(f'HubAndSpoke(n={num_nodes})', links, switches) 14 | 15 | def fully_connected(num_nodes): 16 | links = [] 17 | for i in range(num_nodes): 18 | row = [1] * num_nodes 19 | row[i] = 0 20 | links.append(row) 21 | return Topology(f'FullyConnected(n={num_nodes})', links) 22 | 23 | def ring(num_nodes): 24 | links = [] 25 | for i in range(num_nodes): 26 | row = [0] * num_nodes 27 | row[(i+1) % num_nodes] = 1 28 | row[(i-1) % num_nodes] = 1 29 | links.append(row) 30 | return Topology(f'Ring(n={num_nodes})', links) 31 | 32 | def line(num_nodes): 33 | links = [] 34 | for i in range(num_nodes): 35 | row = [0] * num_nodes 36 | if i - 1 >= 0: 37 | row[i-1] = 1 38 | if i + 1 < num_nodes: 39 | row[i+1] = 1 40 | links.append(row) 41 | return Topology(f'Line(n={num_nodes})', links) 42 | 43 | def star(num_nodes, non_blocking=True): 44 | links = [[0 if i == 0 else 1 for i in range(num_nodes)]] 45 | for i in range(1, num_nodes): 46 | links.append([1 if j == 0 else 0 for j in range(num_nodes)]) 47 | switches = [] 48 | if not non_blocking: 49 | points = [i for i in range(num_nodes) if i != 0] 50 | switches.append(([0],points,1,f'to_points')) 51 | switches.append((points,[0],1,f'from_points')) 52 | return Topology(f'Star(n={num_nodes})', links, switches) 53 | -------------------------------------------------------------------------------- /msccl/topologies/nvidia.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .topology import Topology 5 | 6 | from fractions import Fraction 7 | import subprocess 8 | 9 | def dgx1(): 10 | # (0 1 2 3) (4 5 6 7) are two sockets 11 | # 0 1 3 2 is the high bandwidth chain in socket 1 12 | # 4 5 7 6 is the high bandwidth chain in socket 2 13 | # 0 4 and 2 6 are high bandwidth intersocket links 14 | 15 | links = [ 16 | #0 1 2 3 4 5 6 7 17 | [0, 2, 1, 1, 2, 0, 0, 0], 18 | [2, 0, 1, 2, 0, 1, 0, 0], 19 | [1, 1, 0, 2, 0, 0, 2, 0], 20 | [1, 2, 2, 0, 0, 0, 0, 1], 21 | [2, 0, 0, 0, 0, 2, 1, 1], 22 | [0, 1, 0, 0, 2, 0, 1, 2], 23 | [0, 0, 2, 0, 1, 1, 0, 2], 24 | [0, 0, 0, 1, 1, 2, 2, 0] 25 | ] 26 | 27 | # self.symmetries = [ 28 | # [0, 1, 2, 3, 4, 5, 6, 7], #0 goes to itself 29 | # [0, 1, 2, 3, 4, 5, 6, 7], #1 goes to itself 30 | # [2, 3, 0, 1, 6, 7, 4, 5], #2 goes to 0, 3 goes to 1, ... top - bottom symmetry 31 | # [2, 3, 0, 1, 6, 7, 4, 5], #3 goes to 1, 2 goes to 0, ... top - bottom symmetry 32 | # [4, 5, 6, 7, 0, 1, 2, 3], #4 goes to 0, 5 goes to 1, ... left - right symmetry 33 | # [4, 5, 6, 7, 0, 1, 2, 3], #5 goes to 1, 4 goes to 0, ... left - right symmetry 34 | # [6, 7, 4, 5, 2, 3, 0, 1], #6 goes to 0, 7 goes to 1, ... top-bottom + left-right 35 | # [6, 7, 4, 5, 2, 3, 0, 1] #7 goes to 1, 6 goes to 0, ... 
top-bottom + left-right 36 | # ] 37 | 38 | # self.beta_bound = Fraction(7,6) 39 | # self.diameter = 2 40 | 41 | return Topology('DGX1', links) 42 | 43 | def dgx_a100(): 44 | links = [[12]*8 for i in range(8)] 45 | for i in range(8): 46 | links[i][i] = 0 47 | 48 | return Topology('DGX_A100', links) 49 | 50 | def nvlink_only(nvidia_smi_topo=None): 51 | if nvidia_smi_topo == None: 52 | nvidia_smi_topo = _get_nvidia_smi_topo() 53 | links = _parse_nvidia_smi_topo(nvidia_smi_topo) 54 | return Topology('NVLinkOnly', links) 55 | 56 | def _get_nvidia_smi_topo(): 57 | output = subprocess.check_output("nvidia-smi topo -m".split()) 58 | return output.decode("utf-8") 59 | 60 | def _parse_nvidia_smi_topo(output): 61 | lines = output.splitlines() 62 | before_legend = [] 63 | for l in lines[1:]: 64 | if l and l.startswith("GPU"): 65 | # Only look at the rows for GPU 66 | before_legend.append(l) 67 | else: 68 | break 69 | devices = [x.split("\t")[0] for x in before_legend] 70 | gpus = [i for i in range(len(before_legend)) 71 | if before_legend[i].startswith("GPU")] 72 | matrix = [x.split("\t")[1:] for x in before_legend] 73 | nvlink_matrix = [[_nvlink_num(x[g]) for g in gpus] for x in matrix] 74 | return nvlink_matrix 75 | 76 | def _nvlink_num(x): 77 | x = x.strip() 78 | if x.startswith("NV"): 79 | return int(x[2:]) 80 | else: 81 | return 0 82 | -------------------------------------------------------------------------------- /msccl/topologies/topology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | class Topology(object): 5 | def __init__(self, name, links, switches=[]): 6 | self.name = name 7 | self.links = links 8 | self.switches = switches 9 | for srcs, dsts, bw, switch_name in switches: 10 | if bw == 0: 11 | raise ValueError(f'Switch {switch_name} has zero bandwidth, but switch bandwidths must be strictly positive. Please encode connectedness in links.') 12 | if bw < 0: 13 | raise ValueError(f'Switch {switch_name} has a negative bandwidth of {bw}. Bandwidth must be strictly positive.') 14 | 15 | def sources(self, dst): 16 | for src, bw in enumerate(self.links[dst]): 17 | if bw > 0: 18 | yield src 19 | 20 | def destinations(self, src): 21 | for dst, links in enumerate(self.links): 22 | bw = links[src] 23 | if bw > 0: 24 | yield dst 25 | 26 | def link(self, src, dst): 27 | return self.links[dst][src] 28 | 29 | def num_nodes(self): 30 | return len(self.links) 31 | 32 | def nodes(self): 33 | return range(self.num_nodes()) 34 | 35 | def bandwidth_constraints(self): 36 | for dst, dst_links in enumerate(self.links): 37 | for src, bw in enumerate(dst_links): 38 | if bw > 0: 39 | yield ([src], [dst], bw, f'{src}→{dst}') 40 | for srcs, dsts, bw, switch_name in self.switches: 41 | yield (srcs, dsts, bw, switch_name) 42 | -------------------------------------------------------------------------------- /msccl/topologies/transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .topology import Topology 5 | 6 | def reverse_topology(topology): 7 | ''' 8 | Reverses the direction of all links and switches in the topology. 
9 | ''' 10 | num_nodes = topology.num_nodes() 11 | # Transpose the links 12 | links = [[topology.links[src][dst] for src in range(num_nodes)] for dst in range(num_nodes)] 13 | # Reverse the switches 14 | switches = [(dsts, srcs, bw, f'{name}_reversed') for srcs, dsts, bw, name in topology.switches] 15 | return Topology(f'Reverse{topology.name}', links, switches) 16 | 17 | def binarize_topology(topology): 18 | ''' 19 | Makes all link bandwidths 1 and removes all switches. Essentially, the bandwidth modeling part of the topology 20 | is stripped out and only connectivity information is kept. 21 | ''' 22 | num_nodes = topology.num_nodes() 23 | links = [[1 if topology.links[src][dst] > 0 else 0 for src in range(num_nodes)] for dst in range(num_nodes)] 24 | return Topology(f'Binarized{topology.name}', links, []) 25 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --cov=msccl --cov-report term-missing:skip-covered --cov-fail-under 90 -n auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dataclasses; python_version < "3.7" 2 | z3-solver 3 | argcomplete 4 | lxml 5 | humanfriendly 6 | tabulate 7 | pytest 8 | pytest-cov 9 | pytest-xdist 10 | -e . 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name='msccl', 8 | version='2.3.0', 9 | packages=find_packages(), 10 | entry_points={ 11 | 'console_scripts': [ 12 | 'msccl = msccl.__main__:main', 13 | ], 14 | }, 15 | scripts = [ 16 | 'msccl/autosynth/msccl_ndv2_launcher.sh' 17 | ], 18 | install_requires=[ 19 | 'dataclasses; python_version < "3.7"', 20 | 'z3-solver', 21 | 'argcomplete', 22 | 'lxml', 23 | 'humanfriendly', 24 | 'tabulate', 25 | 'igraph' 26 | ], 27 | python_requires='>=3.6', 28 | ) 29 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.collectives import * 5 | 6 | def null_collective(num_nodes): 7 | return build_collective(f'Null(n={num_nodes})', num_nodes, 1, 8 | lambda r, c: True, lambda r, c: False) 9 | 10 | def impossible_collective(num_nodes): 11 | return build_collective(f'Impossible(n={num_nodes})', num_nodes, 1, 12 | lambda r, c: False, lambda r, c: True) 13 | -------------------------------------------------------------------------------- /tests/test_algorithm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
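The helpers in tests/common.py above show build_collective's (name, num_nodes, num_chunks, precondition, postcondition) shape; a sketch of a custom collective in the same style (broadcast_from_zero is a hypothetical name, not part of the library):

    from msccl.collectives import build_collective

    def broadcast_from_zero(num_nodes):
        # Chunk 0 starts on rank 0 and must end up on every rank
        return build_collective(f'BroadcastZero(n={num_nodes})', num_nodes, 1,
                                lambda r, c: r == 0, lambda r, c: True)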
3 | 4 | import pytest 5 | from .common import * 6 | from msccl.algorithm import Algorithm, Step 7 | from msccl.topologies import fully_connected 8 | from msccl.instance import Instance 9 | 10 | def test_invalid_empty(): 11 | with pytest.raises(RuntimeError): 12 | num_nodes = 2 13 | topo = fully_connected(num_nodes) 14 | algo = Algorithm.make_implementation(impossible_collective(num_nodes), topo, Instance(1), [Step(1,[])]) 15 | 16 | def test_valid_empty(): 17 | num_nodes = 2 18 | topo = fully_connected(num_nodes) 19 | algo = Algorithm.make_implementation(null_collective(num_nodes), topo, Instance(1), [Step(1,[])]) 20 | assert algo != None 21 | -------------------------------------------------------------------------------- /tests/test_analyses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | from msccl.topologies import Topology 6 | from msccl.collectives import build_collective 7 | from msccl.rounds_bound import * 8 | 9 | def test_rounds_bound_unimplementable(): 10 | topo = Topology('Unconnected', [[0,0],[0,0]]) 11 | coll = build_collective('Send', 2, 1, lambda r, c: r == 0, lambda r, c: r == 1) 12 | assert lower_bound_rounds(topo, coll) == None 13 | -------------------------------------------------------------------------------- /tests/test_autosynth.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import msccl 6 | import os 7 | from msccl.autosynth.registry import register_synthesis_plan 8 | 9 | 10 | def test_msccl_init(capsys): 11 | msccl.init('not_a_machine_type', 4, ('alltoall', 0)) 12 | out, err = capsys.readouterr() 13 | assert 'No plan found' in out 14 | assert not 'MSCCL_CONFIG' in os.environ 15 | assert 'NCCL_ALGO' not in os.environ 16 | 17 | msccl.init('ndv2', 2, ('alltoall', '1MB')) 18 | out, err = capsys.readouterr() 19 | assert 'synthesize_ndv2_relay_alltoall' in out 20 | assert 'MSCCL_CONFIG' in os.environ 21 | assert 'NCCL_IB_AR_THRESHOLD' not in os.environ 22 | assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'MSCCL,RING,TREE' 23 | 24 | os.environ['NCCL_ALGO'] = 'RING,FAKE_MSCCL' 25 | msccl.init('ndv4', 8, (msccl.Collective.alltoall, '2MB')) 26 | out, err = capsys.readouterr() 27 | assert 'ndv4_alltoall' in out 28 | assert 'NCCL_IB_AR_THRESHOLD' in os.environ 29 | assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'MSCCL,RING,FAKE_MSCCL' 30 | 31 | os.environ['NCCL_ALGO'] = 'HELLO,MSCCL,WORLD' 32 | msccl.init('ndv4', 16, (msccl.Collective.alltoall, '35MB')) 33 | out, err = capsys.readouterr() 34 | assert 'ndv4_alltoall' in out 35 | assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'HELLO,MSCCL,WORLD' 36 | 37 | 38 | def test_register_plan(): 39 | @register_synthesis_plan('allgather', 'fancy_machine', sizes=(0, '4MB')) 40 | def dummy_plan(m, s): 41 | pass 42 | 43 | @register_synthesis_plan('allgather', ['m1', 'm2'], sizes=[(0, '4MB'), ('1GiB', None)]) 44 | def dummy_plan(m, s): 45 | pass 46 | -------------------------------------------------------------------------------- /tests/test_distributors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
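The autosynth entry point exercised in tests/test_autosynth.py above can also be called directly; a sketch with the same arguments as the test:

    import msccl

    # Select a registered plan for the machine/collective/size combination and
    # point MSCCL_CONFIG plus the NCCL environment variables at it.
    msccl.init('ndv2', 2, ('alltoall', '1MB'))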
3 | 4 | from .common import * 5 | from msccl.topologies import fully_connected, ring, distributed_fully_connected 6 | from msccl.collectives import alltoall 7 | from msccl.instance import Instance 8 | from msccl.path_encoding import PathEncoding 9 | from msccl.distributors import * 10 | 11 | 12 | def test_greedy_alltoall(): 13 | num_nodes = 2 14 | num_copies = 2 15 | local_topo = fully_connected(num_nodes) 16 | encoding = PathEncoding(local_topo, alltoall(num_nodes)) 17 | local_algo = encoding.solve(Instance(1)) 18 | dist_topo = distributed_fully_connected(local_topo, num_copies, remote_bw=1) 19 | dist_algo = synthesize_greedy_distributed_alltoall(dist_topo, local_algo) 20 | dist_algo.check_implements(alltoall(num_nodes * num_copies)) 21 | 22 | def test_alltoall_subproblem(): 23 | num_nodes = 2 24 | num_copies = 2 25 | local_topo = ring(num_nodes) 26 | sub_coll, sub_topo = make_alltoall_subproblem_collective_and_topology(local_topo, num_copies, [0]) 27 | encoding = PathEncoding(sub_topo, sub_coll) 28 | sub_algo = encoding.solve(Instance(3, extra_rounds=1)) 29 | dist_algo = synthesize_alltoall_subproblem(sub_algo, num_copies) 30 | dist_algo.check_implements(alltoall(num_nodes * num_copies)) 31 | -------------------------------------------------------------------------------- /tests/test_path_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.path_encoding import PathEncoding 5 | from msccl.topologies import fully_connected, line, dgx1 6 | from msccl.collectives import * 7 | from msccl.instance import Instance 8 | 9 | def test_fc_noncombining(): 10 | num_nodes = 2 11 | enc = PathEncoding(fully_connected(num_nodes), allgather(num_nodes)) 12 | assert enc.solve(Instance(1, chunks=2)) == None 13 | assert enc.solve(Instance(2, chunks=2)) != None 14 | 15 | def test_fc_combining_reducible(): 16 | num_nodes = 2 17 | enc = PathEncoding(fully_connected(num_nodes), reduce_scatter(num_nodes)) 18 | assert enc.solve(Instance(1, chunks=2)) == None 19 | assert enc.solve(Instance(2, chunks=2)) != None 20 | 21 | def test_fc_combining_nonreducible(): 22 | num_nodes = 2 23 | enc = PathEncoding(fully_connected(num_nodes), allreduce(num_nodes)) 24 | assert enc.solve(Instance(1, chunks=2)) == None 25 | assert enc.solve(Instance(2, chunks=2)) != None 26 | 27 | def test_dgx1_noncombining(): 28 | topo = dgx1() 29 | enc = PathEncoding(topo, allgather(topo.num_nodes())) 30 | assert enc.solve(Instance(1)) == None 31 | assert enc.solve(Instance(2)) != None 32 | 33 | def test_dgx1_combining_reducible(): 34 | topo = dgx1() 35 | enc = PathEncoding(topo, reduce_scatter(topo.num_nodes())) 36 | assert enc.solve(Instance(1)) == None 37 | assert enc.solve(Instance(2)) != None 38 | 39 | def test_dgx1_combining_nonreducible(): 40 | topo = dgx1() 41 | enc = PathEncoding(topo, allreduce(topo.num_nodes())) 42 | assert enc.solve(Instance(1)) == None 43 | assert enc.solve(Instance(2)) != None 44 | 45 | def test_memory_constraint(): 46 | topo = line(3) 47 | enc = PathEncoding(topo, alltoall(topo.num_nodes())) 48 | assert enc.solve(Instance(2, extra_memory=0)) == None 49 | assert enc.solve(Instance(2, extra_memory=1)) != None 50 | -------------------------------------------------------------------------------- /tests/test_programs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
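The PathEncoding pattern from tests/test_path_encoding.py above, as a standalone sketch (assumes z3-solver is installed):

    from msccl.path_encoding import PathEncoding
    from msccl.topologies import dgx1
    from msccl.collectives import allgather
    from msccl.instance import Instance

    topo = dgx1()
    enc = PathEncoding(topo, allgather(topo.num_nodes()))
    assert enc.solve(Instance(1)) == None  # one step is provably too few on DGX-1
    assert enc.solve(Instance(2)) != None  # two steps suffice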
3 | 4 | import msccl 5 | from msccl.topologies import fully_connected 6 | from msccl.language.collectives import * 7 | import os 8 | import pytest 9 | 10 | def test_registered_alltoall_yifan(): 11 | from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical 12 | 13 | num_nodes = 4 14 | gpus_per_node = 8 15 | num_ranks = num_nodes * gpus_per_node 16 | topology = fully_connected(num_ranks) 17 | collective = AllToAll(num_ranks, 1, inplace=False) 18 | with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): 19 | alltoall_hierarchical(num_nodes, gpus_per_node) 20 | assert Check() 21 | 22 | def test_registered_alltoall_8kp1(): 23 | from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step 24 | 25 | num_nodes = 9 26 | gpus_per_node = 8 27 | num_ranks = num_nodes * gpus_per_node 28 | topology = fully_connected(num_ranks) 29 | collective = AllToAll(num_ranks, 1, inplace=False) 30 | with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): 31 | alltoall_three_step(num_nodes, gpus_per_node) 32 | assert Check() 33 | XML() 34 | 35 | def test_registered_allreduce_ring(): 36 | from msccl.programs.allreduce_a100_ring import allreduce_ring 37 | 38 | num_ranks = 8 39 | instances = 4 40 | topology = fully_connected(num_ranks) 41 | collective = AllReduce(num_ranks, num_ranks, inplace=True) 42 | with MSCCLProgram(f"allreduce_ring", topology, collective, instances, 43 | protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): 44 | allreduce_ring(num_ranks, num_ranks) 45 | assert Check() 46 | XML() 47 | 48 | def test_registered_allreduce_allpairs(): 49 | from msccl.programs.allreduce_allpairs import allreduce_allpairs 50 | 51 | num_ranks = 8 52 | instances = 2 53 | topology = fully_connected(num_ranks) 54 | collective = AllReduce(num_ranks, num_ranks*num_ranks, inplace=True) 55 | with MSCCLProgram(f"allreduce_allpairs", topology, collective, instances, 56 | protocol="LL", threadblock_policy=ThreadblockPolicy.manual): 57 | allreduce_allpairs(num_ranks) 58 | assert Check() 59 | XML() 60 | 61 | def test_registered_ndv4_allreduce(capsys): 62 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (512, 1024))) 63 | out, err = capsys.readouterr() 64 | assert 'ndv4_allpairs_allreduce_config1 with LL protocol' in out 65 | 66 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (82944, 458752))) 67 | out, err = capsys.readouterr() 68 | assert 'ndv4_allpairs_allreduce_config2 with LL protocol' in out 69 | 70 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (458752, 2129920))) 71 | out, err = capsys.readouterr() 72 | assert 'ndv4_ring_allreduce_config1 with LL protocol' in out 73 | 74 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (2129920, 22806528))) 75 | out, err = capsys.readouterr() 76 | assert 'ndv4_ring_allreduce_config2 with LL128 protocol' in out 77 | 78 | 79 | def test_registered_ndv4_alltoall(capsys): 80 | msccl.init('ndv4', 8, (msccl.Collective.alltoall, ('1MB', '32MB'))) 81 | out, err = capsys.readouterr() 82 | assert 'ndv4_alltoall_hierarchical_config1 with LL128 protocol' in out 83 | 84 | msccl.init('ndv4', 8, (msccl.Collective.alltoall, ('32MB', '64MB'))) 85 | out, err = capsys.readouterr() 86 | assert 'ndv4_alltoall_hierarchical_config2 with Simple protocol' in out 87 | 88 | # msccl.init('ndv4', 64, (msccl.Collective.alltoall, ('32MB', '64MB'))) 89 | # out, err = capsys.readouterr() 90 | # assert 'ndv4_alltoall_three_step with Simple protocol' in out 91 | -------------------------------------------------------------------------------- 
/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .common import * 5 | from msccl.serialization import MSCCLEncoder, MSCCLDecoder 6 | from msccl.algorithm import Algorithm, Step 7 | from msccl.topologies import fully_connected 8 | from msccl.instance import Instance 9 | 10 | def test_algorithm_roundtrip(): 11 | name = 'test_algorithm' 12 | num_nodes = 2 13 | collective = null_collective(num_nodes) 14 | topo = fully_connected(num_nodes) 15 | steps = [Step(1,[(0,0,1)]),Step(2,[(1,1,0),(1,0,1)]),Step(1,[(0,1,0)])] 16 | instance = Instance(3, pipeline=2) 17 | algo1 = Algorithm(name, collective, topo, instance, steps) 18 | json = MSCCLEncoder().encode(algo1) 19 | assert json != None 20 | 21 | algo2 = MSCCLDecoder().decode(json) 22 | assert algo2.name == name 23 | assert algo2.instance == instance 24 | assert algo2.steps == steps 25 | -------------------------------------------------------------------------------- /tests/test_topologies.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.topologies import * 5 | 6 | def test_local_topologies(): 7 | assert hub_and_spoke(4) != None 8 | assert fully_connected(6) != None 9 | assert ring(3) != None 10 | assert line(5) != None 11 | assert star(6) != None 12 | assert dgx1() != None 13 | assert amd4() != None 14 | assert amd8() != None 15 | 16 | def test_distributed_topologies(): 17 | assert distributed_fully_connected(ring(4), 2, 1) != None 18 | assert distributed_hub_and_spoke(star(6), 4, 2) != None 19 | 20 | def test_transformers(): 21 | assert binarize_topology(dgx1()) != None 22 | assert reverse_topology(dgx1()) != None 23 | 24 | def test_nvlink_only(): 25 | dgx1_topo = ''' GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_2 mlx5_1 mlx5_3 CPU Affinity 26 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX SYS PHB SYS 0-19,40-59 27 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX SYS PHB SYS 0-19,40-59 28 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB SYS PIX SYS 0-19,40-59 29 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB SYS PIX SYS 0-19,40-59 30 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS PIX SYS PHB 20-39,60-79 31 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS PIX SYS PHB 20-39,60-79 32 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS PHB SYS PIX 20-39,60-79 33 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS PHB SYS PIX 20-39,60-79 34 | mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X SYS PHB SYS 35 | mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS X SYS PHB 36 | mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB SYS X SYS 37 | mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS PHB SYS X 38 | 39 | Legend: 40 | 41 | X = Self 42 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 43 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 44 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 45 | PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) 46 | PIX = Connection traversing a single PCIe switch 47 | NV# = Connection traversing a bonded set of # NVLinks''' 48 | topo = nvlink_only(dgx1_topo) 49 | assert topo != None 50 | assert topo.num_nodes() == 8 51 | --------------------------------------------------------------------------------
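Putting the pieces together, a minimal end-to-end sketch that synthesizes a step-optimal allgather for DGX-1 and serializes it (assumes z3-solver is installed; the output filename is illustrative):

    from msccl.topologies import dgx1
    from msccl.collectives import allgather
    from msccl.strategies import solve_least_steps
    from msccl.serialization import save_msccl_object

    topo = dgx1()
    algo = solve_least_steps(topo, allgather(topo.num_nodes()), logging=True)
    save_msccl_object(algo, 'allgather_dgx1.msccl.json')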