├── .github └── workflows │ ├── codeql.yml │ └── tests.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── SYNTHESIS.md ├── dockerfiles └── Dockerfile ├── examples ├── dgx1_allgather.ipynb ├── mscclang │ ├── allgather_a100_pcie.py │ ├── allgather_allpairs.py │ ├── allgather_recursive_doubling.py │ ├── allgather_ring.py │ ├── allreduce_1step.py │ ├── allreduce_a100_allpairs.py │ ├── allreduce_a100_allpairs_v2.py │ ├── allreduce_a100_multinode_allpairs.py │ ├── allreduce_a100_ncv4.py │ ├── allreduce_a100_ncv4_v2.py │ ├── allreduce_a100_pcie_hierarchical.py │ ├── allreduce_a100_recursive_doubling_halving.py │ ├── allreduce_a100_ring.py │ ├── allreduce_binomial_tree.py │ ├── allreduce_dgx1.py │ ├── allreduce_ndv2.py │ ├── allreduce_recursive_doubling_halving.py │ ├── alltoall_a100_three_step.py │ ├── alltoall_a100_two_step.py │ ├── alltoall_allpairs.py │ ├── alltonext_backward.py │ ├── alltonext_forward.py │ ├── hierarchical_allreduce.py │ ├── pipeline_a100_allpairs.py │ ├── pipeline_a100_ring.py │ ├── reducegather.py │ └── simple │ │ ├── allgather_ring.py │ │ ├── allreduce_ring.py │ │ └── custom_collective.py ├── requirements_sccl_init.txt ├── sccl_init.py ├── send.py └── unpermute_dgx1.py ├── msccl ├── __init__.py ├── __main__.py ├── algorithm.py ├── autosynth │ ├── __init__.py │ ├── msccl_ndv2_launcher.sh │ ├── ndv2_plans.py │ ├── ndv4_plans.py │ └── registry.py ├── cli │ ├── __init__.py │ ├── analyze.py │ ├── common.py │ ├── compose.py │ ├── distribute.py │ ├── known_collectives.py │ ├── known_distributed_topologies.py │ ├── known_topologies.py │ ├── known_transformers.py │ ├── ncclize.py │ ├── plans.py │ └── solve.py ├── collectives.py ├── composers.py ├── distributors │ ├── __init__.py │ ├── alltoall_subproblem.py │ ├── gather_scatter_alltoall.py │ └── greedy_alltoall.py ├── instance.py ├── isomorphisms.py ├── language │ ├── __init__.py │ ├── buffer.py │ ├── chunk.py │ ├── collectives.py │ ├── ir.py │ ├── passes.py │ ├── rank_dag.py │ ├── routines.py │ ├── tb_assignment.py │ └── visualize.py ├── ncclize.py ├── ncd_reduction.py ├── path_encoding.py ├── programs │ ├── __init__.py │ ├── allreduce_a100_ring.py │ ├── allreduce_allpairs.py │ ├── alltoall_a100_8kp1.py │ └── alltoall_a100_yifan.py ├── rounds_bound.py ├── serialization.py ├── steps_bound.py ├── strategies.py └── topologies │ ├── __init__.py │ ├── amd.py │ ├── distributed.py │ ├── generic.py │ ├── nvidia.py │ ├── topology.py │ └── transformers.py ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── common.py ├── test_algorithm.py ├── test_analyses.py ├── test_autosynth.py ├── test_cli.py ├── test_distributors.py ├── test_language.py ├── test_path_encoding.py ├── test_programs.py ├── test_serialization.py └── test_topologies.py /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '16 14 * * 2' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | 24 | - name: Initialize CodeQL 25 | uses: github/codeql-action/init@v1 26 | with: 27 | languages: python 28 | 29 | - name: Perform CodeQL Analysis 30 | uses: github/codeql-action/analyze@v1 31 | 
-------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: [ main ] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | matrix: 14 | python-version: [3.6, 3.7, 3.8, 3.9] 15 | 16 | name: Test with Python ${{ matrix.python-version }} 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install msccl and dependencies 25 | run: | 26 | pip install --upgrade pip 27 | pip install -r requirements.txt 28 | - name: Run tests and check at least 90% coverage 29 | run: | 30 | pytest 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # MSCCL specific 2 | *.msccl.json 3 | *.msccl.xml 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /SYNTHESIS.md: -------------------------------------------------------------------------------- 1 | ## Synthesizing Algorithms 2 | 3 | MSCCL can synthesize algorithms for a given *topology* that implement a given *collective* within given constraints on the number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. 4 | 5 | MSCCL groups its solver strategies under the `msccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: 6 | ``` 7 | $ msccl solve instance DGX1 Allgather --steps 4 8 | Solving instance steps=4... synthesized! (0.7s) 9 | Wrote to Allgather.n8-DGX1-steps4.msccl.json 10 | ``` 11 | The instance is satisfiable and `msccl` saves it to a file. 12 | 13 | Four steps is not necessarily the least number of steps required. To find the least number of steps: 14 | ``` 15 | $ msccl solve least-steps DGX1 Allgather 16 | Algorithms need at least 2 steps. 17 | Solving instance steps=2... synthesized! (0.2s) 18 | Wrote to Allgather.n8-DGX1-steps2.msccl.json 19 | ``` 20 | The `least-steps` strategy statically determines that any Allgather in a DGX-1 requires at least 2 steps and, starting from that bound, finds the smallest satisfiable number of steps. 21 | 22 | While this two-step algorithm is latency-optimal, there may be other algorithms that achieve higher bandwidth. The `pareto-optimal` strategy searches through different latency-bandwidth tradeoffs: 23 | ``` 24 | $ msccl solve pareto-optimal DGX1 Allgather 25 | Algorithms need at least 2 steps. 26 | Algorithms need at least 7/6 rounds per chunk. 27 | Solving instance steps=2... synthesized! (0.5s) 28 | Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.9s) 29 | Solving instance steps=2,rounds=4,chunks=3... unsatisfiable.
(1.1s) 30 | Assuming 2 step algorithms need at least 4/3 rounds per chunk. 31 | Solving instance steps=3,rounds=4,chunks=3... synthesized! (2.9s) 32 | Solving instance steps=3,rounds=5,chunks=4... synthesized! (6.5s) 33 | Solving instance steps=3,rounds=6,chunks=5... synthesized! (44.0s) 34 | Solving instance steps=3,rounds=7,chunks=6... synthesized! (56.1s) 35 | Bandwidth optimal algorithm found! 36 | Found 2 Pareto optimal algorithms. Pruned 4 non-optimal algorithms. 37 | Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.msccl.json 38 | Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.msccl.json 39 | ``` 40 | 41 | ## Collectives 42 | 43 | MSCCL includes a number of built-in common collectives. 44 | 45 | | Collective | Arguments | Description | Kind | 46 | | - | - | - | - | 47 | | Broadcast | `--root N` | Send data from root to all nodes. | NC | 48 | | Reduce | `--root N` | Combine data from all nodes to root. | CR | 49 | | Scatter | `--root N` | Send slices of data from root to all nodes. | NC | 50 | | Gather | `--root N` | Send slices of data from all nodes to root. | NC | 51 | | Allgather | | Send slices of data from all nodes to all nodes. | NC | 52 | | Allreduce | | Combine data from all nodes to all nodes. | CNR | 53 | | Alltoall | | Transpose data between all nodes. | NC | 54 | | ReduceScatter | | Combine slices of data to all nodes. | CR | 55 | | Scan | | Combine partial prefixes of data to all nodes in sequence. | CNR | 56 | | MultirootBroadcast | `--roots N [N ...]` | Like Broadcast, but a set of nodes has slices of the input. | NC | 57 | | MultirootScatter | `--roots N [N ...]` | Like Scatter, but a set of nodes has slices of the input. | NC | 58 | | MultirootGather | `--roots N [N ...]` | Like Gather, but output is sent in slices to a set of nodes. | NC | 59 | | custom | `--collective-file` | Arbitrary collective serialized by the user. | ? | 60 | 61 | Custom collectives may be defined by instantiating the `Collective` class, which is most easily done through the `build_collective` function. For example, a send from rank 2 to rank 7 in an 8-node topology can be defined and saved with: 62 | ``` 63 | from msccl.collectives import build_collective 64 | from msccl.serialization import save_msccl_object 65 | 66 | precondition = lambda r, c: r == 2 67 | postcondition = lambda r, c: r == 7 68 | coll = build_collective('Send', 8, 1, precondition, postcondition) 69 | save_msccl_object(coll, 'send.json') 70 | ``` 71 | 72 | The *kind* of the collective determines support for some features of MSCCL: 73 | - **NC** are non-combining collectives, and are always supported. 74 | - **CR** are combining collectives that have a non-combining dual collective, and are supported through a reduction. 75 | - **CNR** are combining collectives with no dual, which may not always be supported. 76 | 77 | Currently the rounds per chunk analysis described below cannot support CNR collectives. 78 | 79 | ## Steps and Rounds 80 | 81 | MSCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* counts how many sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. However, a step may consist of multiple *rounds*, which act as a bandwidth multiplier for all links in the topology during that step.
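For example, in the `pareto-optimal` search above, the `steps=2,rounds=3,chunks=2` Allgather splits each rank's data into 2 chunks and pays for 3 rounds of bandwidth, i.e. 3/2 rounds per chunk, while the `steps=3,rounds=7,chunks=6` algorithm uses only 7/6 rounds per chunk. Since 7/6 matches the lower bound printed by the solver, the latter is bandwidth optimal, at the cost of one extra step of latency.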
82 | 83 | How much data a single round corresponds to depends on the actual size of a chunk at runtime, and the number of chunks a collective uses can change (e.g. you can control this directly in the `instance` strategy by setting `--chunks N`). Thus, for each collective, the total data usage of the different algorithms implementing it can be compared by their *rounds per chunk*. 84 | 85 | MSCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance. For example, to find the least rounds per chunk for a Gather in a DGX-1: 86 | ``` 87 | $ msccl analyze rounds DGX1 Gather 88 | Gather(n=8,root=0) algorithms need at least 7/6 rounds in DGX1 topology. 89 | ``` 90 | In this case the bound happens to be tight, and the `pareto-optimal` strategy would use it to detect that it has found a bandwidth-optimal algorithm. 91 | 92 | ## Distributed Algorithms 93 | 94 | MSCCL provides routines to synthesize algorithms for distributed topologies under the `msccl distribute` subcommand. These work by taking algorithms for a local collective and stitching instances of them together to create a distributed one. 95 | 96 | **Alltoall from Gather and Scatter:** `alltoall-gather-scatter` combines a Gather and a Scatter algorithm with a transpose step in the middle to form a distributed Alltoall algorithm. For example, an Alltoall algorithm for a cluster of 4 DGX-1 machines can be created with: 97 | ``` 98 | msccl solve least-steps DGX1 Gather -o gather.json 99 | msccl solve least-steps DGX1 Scatter -o scatter.json --root 1 100 | msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json 101 | ``` 102 | This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. MSCCL also provides multi-root versions of Gather and Scatter that can be substituted here.
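As a sketch, the multi-root variants could be substituted like this (a hypothetical invocation: the `--roots` argument follows the collectives table above, and the root choices here are illustrative):
```
msccl solve least-steps DGX1 MultirootGather --roots 0 1 -o gather.json
msccl solve least-steps DGX1 MultirootScatter --roots 0 1 -o scatter.json
msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json
```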
103 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel 2 | 3 | ############################################################################## 4 | # Temporary Installation Directory 5 | ############################################################################## 6 | ENV STAGE_DIR=/tmp 7 | RUN mkdir -p ${STAGE_DIR} 8 | 9 | 10 | ############################################################################## 11 | # Installation/Basic Utilities 12 | ############################################################################## 13 | RUN apt-get update && \ 14 | apt-get install -y --allow-change-held-packages --no-install-recommends \ 15 | software-properties-common \ 16 | build-essential autotools-dev cmake g++ gcc \ 17 | openssh-client openssh-server \ 18 | nfs-common pdsh curl sudo net-tools \ 19 | vim iputils-ping wget perl unzip 20 | 21 | ############################################################################## 22 | # Installation Latest Git 23 | ############################################################################## 24 | RUN add-apt-repository ppa:git-core/ppa -y && \ 25 | apt-get update && \ 26 | apt-get install -y git && \ 27 | git --version 28 | 29 | ############################################################################## 30 | # Pip 31 | ############################################################################## 32 | # pip version <= 20.1.1 is needed for the ruamel.yaml installation conflict 33 | # between conda and pip. ruamel.yaml is needed by azureml. 34 | # https://github.com/Azure/MachineLearningNotebooks/issues/1110 for more info. 35 | ENV PIP_VERSION=20.1.1 36 | RUN conda install -y pip=${PIP_VERSION} && \ 37 | # Print python and pip version 38 | python -V && pip -V 39 | 40 | ############################################################################## 41 | # MPI 42 | ############################################################################## 43 | RUN cd ${STAGE_DIR} && mkdir openmpi/ && cd openmpi && wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.1.tar.gz && \ 44 | tar zxf openmpi-4.0.1.tar.gz && \ 45 | cd openmpi-4.0.1 && \ 46 | ./configure --enable-orterun-prefix-by-default && \ 47 | make -j $(nproc) all && \ 48 | make install && \ 49 | ldconfig && \ 50 | rm -rf ${STAGE_DIR}/openmpi/ 51 | 52 | ############################################################################## 53 | # MSCCL 54 | ############################################################################## 55 | 56 | # update NCCL in pytorch, install MSCCL interpreter 57 | RUN pip uninstall torch -y 58 | 59 | RUN pip install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses 60 | 61 | RUN conda install -c pytorch magma-cuda111 -y 62 | 63 | ENV CMAKE_PREFIX_PATH=/opt/conda 64 | 65 | # Change NCCL to MSCCL Runtime 66 | RUN cd ${STAGE_DIR} && \ 67 | git clone https://github.com/pytorch/pytorch.git && \ 68 | cd pytorch && \ 69 | git checkout tags/v1.9.0 -b v1.9.0_msccl && \ 70 | perl -p -i -e 's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules && \ 71 | git submodule sync third_party/nccl && \ 72 | git submodule update --init --recursive && \ 73 | git submodule update --init --recursive --remote third_party/nccl && \ 74 | cd third_party/nccl/nccl/ && \ 75 | git checkout master &&
\ 76 | cd ../../../ && \ 77 | git apply third_party/nccl/nccl/patches/nccl.cpp.patch && \ 78 | python setup.py install && \ 79 | cd ${STAGE_DIR} && \ 80 | rm -rf ${STAGE_DIR}/pytorch 81 | 82 | # Install MSCCL 83 | RUN cd ${STAGE_DIR}/ && \ 84 | git clone https://github.com/microsoft/msccl.git && \ 85 | cd msccl/ && python setup.py install && \ 86 | cd ${STAGE_DIR} && \ 87 | rm -rf ${STAGE_DIR}/msccl/ 88 | 89 | ############################################################################## 90 | # inspector-topo 91 | ############################################################################## 92 | 93 | RUN apt-get install libibverbs-dev libnuma-dev -y 94 | RUN cd ${STAGE_DIR}/ && git clone https://github.com/microsoft/inspector-topo.git && \ 95 | cd inspector-topo/ && make && make install 96 | 97 | -------------------------------------------------------------------------------- /examples/mscclang/allgather_a100_pcie.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # Hierarchical allgather for A100 10 | def allgather_hier(gpus, instances, protocol): 11 | size = gpus 12 | chunksperloop = 1 13 | topology = fully_connected(gpus) 14 | collective = AllGather(size, chunksperloop, True) 15 | 16 | with MSCCLProgram("allgather_hierarchical", topology, collective, instances, protocol=protocol, 17 | interleaved_replication=True, dependence_nop=True): 18 | for chnk in range(2): 19 | for r in range(size): 20 | if ((r % 2) == chnk): 21 | c = chunk(r, Buffer.input, 0) 22 | c.copy(r + 1 - 2 * chnk, Buffer.output, r) 23 | for r in range(size): 24 | if ((r % 2) == chnk): 25 | c = chunk(r, Buffer.input, 0) 26 | c.copy((r+2) % size, Buffer.output, r) 27 | for r in range(size): 28 | if ((r % 2) == chnk): 29 | c = chunk(r, Buffer.output, (r+2) % size) 30 | c.copy(r + 1 - 2 * chnk, Buffer.output, (r+2) % size) 31 | 32 | XML() 33 | Check() 34 | 35 | 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 38 | parser.add_argument('instances', type=int, help='number of instances') 39 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 40 | args = parser.parse_args() 41 | 42 | allgather_hier(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allgather_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
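# The schedule below completes in a single communication step: every rank pushes its chunk directly to every other rank (n*(n-1) parallel copies), instead of the size-1 sequential steps a ring takes.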
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # Allpairs allgather for A100 10 | def allgather_allpairs(gpus, instances, protocol): 11 | size = gpus 12 | topology = fully_connected(gpus) 13 | collective = AllGather(size, 1, True) 14 | 15 | with MSCCLProgram(f"allgather_allpairs", topology, collective, instances, 16 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 17 | 18 | # Each rank sends its chunk to all other gpus 19 | for r1 in range(gpus): 20 | for r2 in range(gpus): 21 | if r1 != r2: 22 | index = 0 23 | c = chunk(r1, Buffer.input, index, 1) 24 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 25 | XML() 26 | Check() 27 | 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 31 | parser.add_argument('instances', type=int, help='number of instances') 32 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 33 | args = parser.parse_args() 34 | 35 | allgather_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allgather_recursive_doubling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # https://web.cels.anl.gov/~thakur/papers/mpi-coll.pdf 10 | def allgather_recursive_doubling(size, instances, protocol): 11 | topology = fully_connected(size) 12 | collective = AllGather(size, 1, True) 13 | with MSCCLProgram("allgather_recursive_doubling", topology, collective, instances, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 14 | count = 1 15 | while count < size: 16 | # Every rank exchanges count chunks with neighbor count away 17 | for rank in range(size): 18 | peer = rank ^ count 19 | index = (rank // count) * count 20 | chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank) 21 | count *= 2 22 | 23 | XML() 24 | Check() 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 28 | parser.add_argument('instances', type=int, help ='number of instances') 29 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 30 | args = parser.parse_args() 31 | 32 | allgather_recursive_doubling(args.num_gpus, args.instances, args.protocol) 33 | -------------------------------------------------------------------------------- /examples/mscclang/allgather_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllGather 8 | 9 | # Ring allgather for A100s 10 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs.
11 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel 12 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data 13 | def allgather_ring(size, channels, instances, protocol): 14 | topology = fully_connected(size) 15 | collective = AllGather(size, 1, True) 16 | with MSCCLProgram(f"allgather_ring_{channels}channelsperring", topology, collective, instances, 17 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 18 | for step in range(0, size-1): 19 | for index in range(0, size): 20 | rank = (index + step) % size 21 | c = chunk(rank, Buffer.output, index) 22 | next_rank = (index + step + 1) % size 23 | channel = index%channels 24 | c = c.copy(next_rank, Buffer.output, index, sendtb=channel, recvtb=channel, ch=channel) 25 | XML() 26 | Check() 27 | 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 31 | parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') 32 | parser.add_argument('instances', type=int, help='number of instances') 33 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 34 | args = parser.parse_args() 35 | 36 | allgather_ring(args.num_gpus, args.channels, args.instances, args.protocol) 37 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_1step.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | 17 | # Each rank sends its whole input to every other rank's scratch space 18 | for rank in range(size): 19 | tb = 0 20 | for nghr in range(size): 21 | if rank != nghr: 22 | c = chunk(rank, Buffer.input, index=0, size=size) 23 | c.copy(nghr, 'scratch', sendtb=nghr, recvtb=rank) 24 | tb += 1 25 | 26 | # Each rank performs a local reduction of all received chunks 27 | # Utilize one threadblock per chunk for this reduction for better parallelism 28 | for rank in range(size): 29 | index = 0 30 | tb = 0 31 | for nghr in range(size): 32 | if rank != nghr: 33 | for s in range(size): 34 | c = chunk(rank, Buffer.input, s) 35 | c.reduce(chunk(rank, 'scratch', index), sendtb=s) 36 | index += 1 37 | tb += 1 38 | 39 | XML() 40 | Check() 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 44 | parser.add_argument('instances', type=int, help='number of instances') 45 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 46 | 47 | args = parser.parse_args() 48 | 49 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_allpairs.py: -------------------------------------------------------------------------------- 1 | #
Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus * gpus 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | 17 | # Each rank sends the nth chunk to the nth rank into scratch space 18 | for r1 in range(size): 19 | for r2 in range(size): 20 | if r1 != r2: 21 | index = r2 * size 22 | c = chunk(r1, Buffer.input, index, size=size) 23 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 24 | 25 | # Each rank performs a local reduction on the nth chunk 26 | # Utilize 8 threadblocks for this reduction for better parallelism 27 | for r in range(size): 28 | for index in range(0, size * (size-1)): 29 | c = chunk(r, Buffer.input, r*size + (index % size)) 30 | c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 31 | 32 | # Each rank sends the fully reduced nth chunk to all other gpus 33 | for r1 in range(size): 34 | for r2 in range(size): 35 | if r1 != r2: 36 | index = r1 * size 37 | c = chunk(r1, Buffer.input, index, size) 38 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 39 | 40 | XML() 41 | Check() 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 45 | parser.add_argument('instances', type=int, help='number of instances') 46 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 47 | 48 | args = parser.parse_args() 49 | 50 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_allpairs_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
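# Variant of allreduce_a100_allpairs where the local reduction of the received scratch chunks is performed as a log2(size)-level binary tree (the loop over k below) instead of a flat loop over all size*(size-1) chunks.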
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | import math 9 | 10 | def allreduce_allpairs(gpus, instances, protocol): 11 | size = gpus 12 | chunksperloop = gpus 13 | topology = fully_connected(size) 14 | collective = AllReduce(size, chunksperloop, True) 15 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 16 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=False): 17 | 18 | # Each rank sends the nth chunk to the nth rank into scratch space 19 | for r1 in range(size): 20 | for r2 in range(size): 21 | if r1 != r2: 22 | index = r2 23 | c = chunk(r1, Buffer.input, index) 24 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 25 | 26 | # Each rank performs a local reduction on the nth chunk 27 | # Utilize 8 threadblocks for this reduction for better parallelism 28 | for r in range(size): 29 | for k in range(1,int(math.log2(size)+1)): 30 | level = 2**k 31 | for index in range(0, size//level): 32 | if index == 0: 33 | c = chunk(r, Buffer.input, r) 34 | else: 35 | c = chunk(r, 'scratch', (index-1)) 36 | c.reduce(chunk(r, 'scratch', (index+size//level-1)), sendtb=index) 37 | #c = chunk(r, Buffer.input, r*size + (index % size)) 38 | #c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 39 | 40 | # Each rank sends the fully reduced nth chunk to all other gpus 41 | for r1 in range(size): 42 | for r2 in range(size): 43 | if r1 != r2: 44 | index = r1 45 | c = chunk(r1, Buffer.input, index) 46 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 47 | 48 | XML() 49 | Check() 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 53 | parser.add_argument('instances', type=int, help='number of instances') 54 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 55 | 56 | args = parser.parse_args() 57 | 58 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) 59 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_multinode_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
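# Two-node extension of the allpairs allreduce: each node of `size` gpus runs the allpairs reduce-scatter locally, the node-level partial results are then exchanged and reduced across nodes through 'scratch2', and finally the allpairs allgather runs within each node.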
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus * gpus 12 | topology = fully_connected(2*size) 13 | collective = AllReduce(2*size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | # Each rank sends the nth chunk to the nth rank into scratch space 17 | for r1 in range(size): 18 | for r2 in range(size): 19 | if r1 != r2: 20 | index = r2 * size 21 | c = chunk(r1, Buffer.input, index, size=size) 22 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 23 | 24 | c2 = chunk(r1+size, Buffer.input, index, size=size) 25 | c2.copy(r2+size, 'scratch', sendtb=r2, recvtb=r1) 26 | 27 | # Each rank performs a local reduction on the nth chunk 28 | # Utilize 8 threadblocks for this reduction for better parallelism 29 | for r in range(size): 30 | for index in range(0, size * (size-1)): 31 | c = chunk(r, Buffer.input, r*size + (index % size)) 32 | c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 33 | 34 | c2 = chunk(r+size, Buffer.input, r*size + (index % size)) 35 | c2.reduce(chunk(r+size, 'scratch', index), sendtb=(index % size)) 36 | 37 | 38 | for r in range(size): 39 | index = r*size 40 | c = chunk(r, Buffer.input, index, size) 41 | c = c.copy(r+size, 'scratch2', index=0, sendtb=size, recvtb=size+1, ch=r%2) 42 | 43 | c2 = chunk(r+size, Buffer.input, index, size) 44 | c2 = c2.copy(r, 'scratch2', index=0, sendtb=size+2, recvtb=size+3, ch=r%2) 45 | 46 | chunk(r, Buffer.input, index, size).reduce(c2, sendtb=size+3, recvtb=size+4, ch=r%2) 47 | chunk(r+size, Buffer.input, index, size).reduce(c, sendtb=size+1, recvtb=size+1, ch=r%2) 48 | 49 | 50 | # Each rank sends the fully reduced nth chunk to all other gpus 51 | for r1 in range(size): 52 | for r2 in range(size): 53 | if r1 != r2: 54 | index = r1 * size 55 | c = chunk(r1, Buffer.input, index, size) 56 | c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) 57 | 58 | c2 = chunk(r1+size, Buffer.input, index, size) 59 | c2.copy(r2+size, Buffer.input, index, sendtb=r2, recvtb=r1) 60 | 61 | XML() 62 | Check() 63 | 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 66 | parser.add_argument('instances', type=int, help='number of instances') 67 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 68 | 69 | args = parser.parse_args() 70 | 71 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) 72 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_ncv4.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
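# Allreduce intended for a 4-gpu NCv4-style machine using 2 chunks: ranks first reduce pairwise with their neighbor, exchange the partial results with the opposite pair through scratch and reduce again, and finally copy the fully reduced chunk back to the pair neighbor.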
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = 2 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | for chnk in range(chunksperloop): 17 | for r in range(size): 18 | if ((r % 2) == chnk): 19 | c = chunk(r, Buffer.input, chnk) 20 | c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk), sendtb=0, recvtb=0, ch=0) 21 | 22 | for r in range(size): 23 | if ((r % 2) == chnk): 24 | c = chunk(r, Buffer.input, chnk) 25 | c.copy((r+2) % size, 'scratch', chnk, sendtb=1, recvtb=1, ch=0) 26 | 27 | for r in range(size): 28 | if ((r % 2) == chnk): 29 | c = chunk(r, Buffer.input, chnk) 30 | c.reduce(chunk(r, 'scratch', chnk), sendtb=1, recvtb=1, ch=0) 31 | 32 | for r in range(size): 33 | if ((r % 2) == chnk): 34 | c = chunk(r, Buffer.input, chnk) 35 | c.copy(r + 1 - 2 * chnk, Buffer.input, chnk, sendtb=2, recvtb=2, ch=1) 36 | 37 | XML() 38 | Check() 39 | 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 42 | parser.add_argument('instances', type=int, help='number of instances') 43 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 44 | 45 | args = parser.parse_args() 46 | 47 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_ncv4_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
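# Chain-based variant for NCv4: each of the two chunks is reduced along a fixed 4-gpu chain (tree_algo) and then copied back down the chain in reverse, with the two chains oriented differently to spread load across links.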
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def tree_algo(tree, chnk, size): 10 | for i in range(size-1): 11 | nextNghr = tree[i+1] 12 | curNode = tree[i] 13 | c = chunk(nextNghr, Buffer.input, chnk) 14 | c.reduce(chunk(curNode, Buffer.input, chnk), sendtb=2*chnk, recvtb=2*chnk, ch=chnk) 15 | for i in range(size-1): 16 | curNode = tree[size-1-i] 17 | nextNghr = tree[size-1-i-1] 18 | c = chunk(curNode, Buffer.input, chnk) 19 | c.copy(nextNghr, Buffer.input, chnk, sendtb=2*chnk+1, recvtb=2*chnk+1, ch=chnk) 20 | 21 | def allreduce_allpairs(gpus, instances, protocol): 22 | size = gpus 23 | chunksperloop = 2 24 | topology = fully_connected(size) 25 | collective = AllReduce(size, chunksperloop, True) 26 | with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol, 27 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 28 | tree_algo([3,2,1,0], 0, size) 29 | tree_algo([2,3,0,1], 1, size) 30 | 31 | XML() 32 | Check() 33 | 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 36 | parser.add_argument('instances', type=int, help='number of instances') 37 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 38 | 39 | args = parser.parse_args() 40 | 41 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_pcie_hierarchical.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from msccl.language import * 3 | from msccl.topologies import * 4 | from msccl.language.collectives import AllReduce 5 | 6 | def allpairs_reduce_scatter(gpuIds, size, offset): 7 | ngpus = len(gpuIds) 8 | 9 | # Each rank sends the nth chunk to the nth rank into scratch space 10 | for r1 in range(ngpus): 11 | for r2 in range(ngpus): 12 | if gpuIds[r1] != gpuIds[r2]: 13 | index = offset + r2 * size 14 | c = chunk(gpuIds[r1], Buffer.input, index, size=size) 15 | c.copy(gpuIds[r2], 'scratch', sendtb=gpuIds[r2], recvtb=gpuIds[r1]) 16 | 17 | # Each rank performs a local reduction on the nth chunk 18 | # Utilize 8 threadblocks for this reduction for better parallelism 19 | for r in range(ngpus): 20 | for index in range(0, size * (ngpus-1)): 21 | c = chunk(gpuIds[r], Buffer.input, offset + r*size + (index % size)) 22 | c.reduce(chunk(gpuIds[r], 'scratch', index), sendtb=(index % size)) 23 | 24 | 25 | def allpairs_all_gather(gpuIds, size, offset): 26 | ngpus = len(gpuIds) 27 | 28 | # Each rank sends its nth chunk to all other gpus 29 | for r1 in range(ngpus): 30 | for r2 in range(ngpus): 31 | if r1 != r2: 32 | index = offset + r1 * size 33 | c = chunk(gpuIds[r1], Buffer.input, index, size) 34 | c.copy(gpuIds[r2], Buffer.input, index, sendtb=gpuIds[r2], recvtb=gpuIds[r1]) 35 | 36 | # Performs two levels of allReduce 37 | def hierarchical_allreduce(gpus, instances, protocol): 38 | ncols = 2 39 | nrows = gpus // ncols 40 | chunkperloop = gpus * gpus 41 | topology = fully_connected(gpus) 42 | collective = AllReduce(gpus, chunkperloop, True) 43 | 44 | with MSCCLProgram("hierarchical_allreduce", topology, collective, instances, protocol=protocol, 45 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 46 | 47 | # A 4 x 3 GPU 
arrangement: 4 local GPUs, 3 instances, GPU Ids are numbered as such 48 | # 0 4 8 49 | # 1 5 9 50 | # 2 6 10 51 | # 3 7 11 52 | # Reduce-Scatter on each column first, assumption being GPUs in a column have faster connectivity - NVLINK 53 | # Each GPU exchanges (nrows - 1) * 1/rows of data with other GPUs in the same column 54 | # After this step, the first GPU in each column will have the 1st 1/nrows of the data reduced, the 2nd GPU the 2nd 1/nrows, and so on 55 | size = chunkperloop // nrows 56 | offset = 0 57 | for n in range(ncols): 58 | gpuIds = [] 59 | for m in range(nrows): # collect all GPU Ids in a column 60 | gpuIds.append( n * nrows + m) 61 | 62 | allpairs_reduce_scatter(gpuIds, size, 0) 63 | 64 | # Reduce-Scatter across rows, assumption being GPUs in a row have slower connectivity - PCIe, IP NW 65 | # Each GPU exchanges (1 / rows * cols) * (cols - 1) of data with other GPUs in the same row - less data is exchanged 66 | # After this step, the first GPU in each row will have the 1st 1/(nrows * ncols) of the data, the 2nd GPU the 2nd 1/(nrows * ncols), and so on 67 | offset = size 68 | size = chunkperloop // (nrows * ncols) 69 | for n in range(nrows): 70 | gpuIds = [] 71 | for m in range(ncols): 72 | gpuIds.append(n + m * nrows) 73 | 74 | allpairs_reduce_scatter(gpuIds, size, offset * n) 75 | 76 | # AllGather: AllGather phase goes in reverse order, first gather across rows of GPUs 77 | # After this step, each GPU in a row has 1/ncols of the data 78 | for n in range(nrows): 79 | gpuIds = [] 80 | for m in range(ncols): 81 | gpuIds.append(n + m * nrows) 82 | 83 | allpairs_all_gather(gpuIds, size, offset * n) 84 | 85 | # AllGather: AllGather phase goes in reverse order, 2nd AllGather across columns of GPUs 86 | # After this step, each GPU in the system will have the completely reduced data 87 | size = chunkperloop // nrows 88 | offset = 0 89 | for n in range(ncols): 90 | gpuIds = [] 91 | for m in range(nrows): 92 | gpuIds.append( n * nrows + m) 93 | 94 | allpairs_all_gather(gpuIds, size, 0) 95 | 96 | XML() 97 | Check() 98 | 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 101 | parser.add_argument('instances', type=int, help='number of instances') 102 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 103 | 104 | args = parser.parse_args() 105 | 106 | hierarchical_allreduce(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_recursive_doubling_halving.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
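# Recursive doubling performs the reduce-scatter phase by exchanging chunks between partners 1, 2 and 4 ranks apart while halving the per-exchange count; recursive halving then runs the same exchanges in reverse as the allgather phase. The `ways` argument runs up to three such schedules in parallel on disjoint logical chunks.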
3 | 4 | # Halving-doubling implementation of allreduce 5 | 6 | import argparse 7 | 8 | from msccl.language import * 9 | from msccl.topologies import * 10 | from msccl.language.collectives import AllReduce 11 | 12 | 13 | def allreduce(ways, instances, protocol): 14 | topology = fully_connected(8) 15 | size = topology.num_nodes() # Number of gpus 16 | logical_chunk = 8 * ways 17 | collective = AllReduce(size, logical_chunk, True) 18 | with MSCCLProgram("allreduce_a100_recursive_doubling_halving", topology, collective, instances, protocol, interleaved_replication=False): 19 | # One reduction between pairs of gpus that are `pairs` apart, over `count` chunks 20 | def recursive_doubling(pairs, count, next_index, lc, sendtb, recvtb): 21 | current_index = next_index.copy() 22 | for r in range(size): 23 | next = r ^ pairs 24 | offset = (count if r <= next else 0) 25 | next_index[next] += offset 26 | # Split the reduce into two separate reduces to enable fused instructions 27 | block = 2 ** pairs 28 | for x in range(count): 29 | index = current_index[r] + offset + lc*8 + x 30 | c1 = chunk(r, Buffer.input, index) 31 | c = chunk(next, Buffer.input, index) 32 | c.reduce(c1, sendtb=sendtb, recvtb=recvtb) 33 | 34 | 35 | # Propagates reduced chunks in reverse order 36 | def recursive_halving(pairs, count, next_index, lc, sendtb, recvtb): 37 | current_index = next_index.copy() 38 | for r in range(size): 39 | next = r ^ pairs 40 | offset = (count if r > next else 0) 41 | next_index[r] -= offset 42 | index = current_index[r] + lc*8 43 | c = chunk(r, Buffer.input, index, count) 44 | c.copy(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) 45 | 46 | next_index = [0] * 8 47 | recursive_doubling(1, 4, next_index, 0, 0, 1) 48 | recursive_doubling(2, 2, next_index, 0, 1, 2) 49 | recursive_doubling(4, 1, next_index, 0, 2, 3) 50 | 51 | recursive_halving(4, 1, next_index, 0, 2, 3) 52 | recursive_halving(2, 2, next_index, 0, 1, 2) 53 | recursive_halving(1, 4, next_index, 0, 0, 1) 54 | 55 | if ways > 1: 56 | next_index = [0] * 8 57 | lc = 1 58 | recursive_doubling(4, 4, next_index, lc, 8, 9) 59 | recursive_doubling(2, 2, next_index, lc, 9, 10) 60 | recursive_doubling(1, 1, next_index, lc, 10, 11) 61 | 62 | recursive_halving(1, 1, next_index, lc, 10, 11) 63 | recursive_halving(2, 2, next_index, lc, 9, 10) 64 | recursive_halving(4, 4, next_index, lc, 8, 9) 65 | 66 | if ways > 2: 67 | next_index = [0] * 8 68 | lc = 2 69 | recursive_doubling(2, 4, next_index, lc, 4, 5) 70 | recursive_doubling(1, 2, next_index, lc, 5, 6) 71 | recursive_doubling(4, 1, next_index, lc, 6, 7) 72 | 73 | 74 | recursive_halving(4, 1, next_index, lc, 6, 7) 75 | recursive_halving(1, 2, next_index, lc, 5, 6) 76 | recursive_halving(2, 4, next_index, lc, 4, 5) 77 | 78 | 79 | XML() 80 | Check() 81 | 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument('ways', type=int, help='number of parallel trees to perform reduction min:1 max:3') 84 | parser.add_argument('instances', type=int, help='number of instances') 85 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL') 86 | args = parser.parse_args() 87 | assert args.ways >= 1 and args.ways <= 3 88 | allreduce(args.ways, args.instances, args.protocol) 89 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_a100_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | # Ring all reduce for A100s 10 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 11 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel 12 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data 13 | def allreduce_ring(size, instances, channels, protocol): 14 | topology = fully_connected(size) 15 | collective = AllReduce(size, size, True) 16 | with MSCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, 17 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 18 | # Reduce ring 19 | for step in range(0, size-1): 20 | for index in range(0, size): 21 | rank = (index + step) % size 22 | next_rank = (index + step + 1) % size 23 | channel = index%channels 24 | c = chunk(next_rank, Buffer.input, index) 25 | c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel) 26 | # Propagate ring 27 | for step in range(-1, size-2): 28 | for index in range(0, size): 29 | rank = (index + step) % size 30 | c = chunk(rank, Buffer.input, index) 31 | next_rank = (index + step + 1) % size 32 | channel = index%channels 33 | c = c.copy(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) 34 | 35 | XML() 36 | Check() 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 40 | parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') 41 | parser.add_argument('instances', type=int, help='number of instances') 42 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 43 | args = parser.parse_args() 44 | 45 | allreduce_ring(args.num_gpus, args.instances, args.channels, args.protocol) 46 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_binomial_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
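# Each tree is a reduce to the root followed by a broadcast back down; with trees=2 a mirrored tree rooted at rank N-1 handles the second chunk, balancing load across links (see the mirrored-tree reference above).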
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | # Binomial tree and mirrored binomial tree 10 | # Mirrored trees adopted from: http://algo2.iti.kit.edu/documents/2tree.pdf 11 | def allreduce_binomial_tree(size, instances, trees, protocol): 12 | topology = fully_connected(size) 13 | collective = AllReduce(size, trees, True) 14 | with MSCCLProgram("allreduce_binomial_tree", topology, collective, instances, protocol=protocol): 15 | distance = 1 16 | # Reduce tree - reducing onto Rank 0 17 | while distance <= size // 2: 18 | # Reduce onto the left neighbor that is distance away 19 | for rank in range(0, size, distance*2): 20 | peer = rank + distance 21 | c1 = chunk(peer, Buffer.input, 0) 22 | chunk(rank, Buffer.input, 0).reduce(c1) 23 | distance *= 2 24 | # Broadcast tree - root is Rank 0 25 | distance = distance // 2 26 | while distance >= 1: 27 | # Copy to the right neighbor that is distance away 28 | for rank in range(0, size, distance*2): 29 | peer = rank + distance 30 | chunk(rank, Buffer.input, 0).copy(peer, Buffer.input, 0) 31 | distance = distance // 2 32 | 33 | # Mirrored version of the tree 34 | # Reduce tree - reducing onto Rank N-1 35 | if trees == 2: 36 | distance = 1 37 | while distance <= size // 2: 38 | # Reduce onto the right neighbor that is distance away 39 | for rank in range(size-1, 0, -distance*2): 40 | peer = rank - distance 41 | c1 = chunk(peer, Buffer.input, 1) 42 | chunk(rank, Buffer.input, 1).reduce(c1) 43 | distance *= 2 44 | # Broadcast tree - root is Rank N-1 45 | distance = distance // 2 46 | while distance >= 1: 47 | # Copy to the left neighbor that is distance away 48 | for rank in range(size-1, 0, -distance*2): 49 | peer = rank - distance 50 | chunk(rank, Buffer.input, 1).copy(peer, Buffer.input, 1) 51 | distance = distance // 2 52 | 53 | XML() 54 | Check() 55 | 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 58 | parser.add_argument('trees', type=int, choices=[1, 2], help ='number of trees') 59 | parser.add_argument('instances', type=int, help ='number of instances') 60 | 61 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 62 | args = parser.parse_args() 63 | allreduce_binomial_tree(args.num_gpus, args.instances, args.trees, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/allreduce_dgx1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
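# [Editor's note -- not part of the original file.] The program below threads a ring
# through each DGX-1 node using the NVLink-friendly local order [1, 3, 2, 6, 7, 5, 4, 0],
# so rank(n, g) = local_ring_order[g] + n * 8. For example rank(0, 0) == 1 and
# rank(1, 7) == 8: each node's ring starts at local gpu 1 and ends at local gpu 0.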
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies.distributed import * 8 | from msccl.topologies.nvidia import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | def allreduce(num_nodes, instances): 12 | local_topology = dgx1() 13 | num_local_gpus = 8 14 | remote_bw = 1 15 | topology = distributed_fully_connected(local_topology, num_nodes, remote_bw) 16 | size = topology.num_nodes() 17 | collective = AllReduce(size, 1, True) 18 | local_ring_order = [1,3,2,6,7,5,4,0] 19 | 20 | def rank(n, g): 21 | return local_ring_order[g] + n * num_local_gpus 22 | 23 | with MSCCLProgram("allreduce_ring_dgx1", topology, collective, instances): 24 | 25 | # Chunks travel around the local rings being reduced (num_local_gpus-1 hops), starting at local gpu 1 26 | # At the end the most reduced chunk ends up on local gpu 0 of each node 27 | for n in range(num_nodes): 28 | r = rank(n, 0) # Start at local gpu 1 (index 0 in local_ring_order) 29 | c = chunk(r, Buffer.input, 0) 30 | for g in range(1, 8): 31 | c = c.reduce(rank(n,g), Buffer.input, 0) 32 | 33 | # At this point gpu0 and gpu8 have the two most reduced chunks 34 | # 1 IB send to fully reduce chunk + 1 IB send to update other node 35 | 36 | chunk(0, Buffer.input, 0).send(9, Buffer.input, 0) 37 | chunk(8, Buffer.input, 0).send(1, Buffer.input, 0).reduce(0, Buffer.input, 0) 38 | chunk(9, Buffer.input, 0).reduce(8, Buffer.input, 0) 39 | 40 | # Propagate the fully reduced chunks going backwards around the ring 41 | for n in range(num_nodes): 42 | r = rank(n, 7) 43 | c = chunk(r, Buffer.input, 0) 44 | for g in range(6, -1, -1): 45 | next = rank(n, g) 46 | c = c.send(next, Buffer.input, 0) 47 | 48 | XML() 49 | Check() 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('num_nodes', type=int, help='number of nodes') 53 | parser.add_argument('instances', type=int, help='number of instances') 54 | args = parser.parse_args() 55 | 56 | assert args.num_nodes == 2, "Only works for 2 nodes right now" 57 | 58 | allreduce(args.num_nodes, args.instances) 59 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_ndv2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
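# [Editor's note -- not part of the original file.] The schedule below is
# socket-hierarchical for an 8-GPU NDv2 machine: a reduce-scatter within each
# 4-GPU socket, a cross-socket reduce followed by a copy back (placed on
# channel 1 so it lands in a different threadblock than the reduce), and
# finally an all-gather within each socket.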
3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies.distributed import * 7 | from msccl.topologies.nvidia import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | def allreduce(instances): 12 | size = 8 13 | num_local_gpus = size 14 | topology = fully_connected(size) 15 | # size = topology.num_nodes() # Number of gpus 16 | logical_chunk = size 17 | collective = AllReduce(size, logical_chunk, True) 18 | with MSCCLProgram("allreduce_ndv2", topology, collective, instances, interleaved_replication=False): 19 | 20 | # local reduce_scatter 21 | for lc in range(num_local_gpus//2): 22 | for r in range(num_local_gpus): 23 | if lc == (r % (num_local_gpus//2)): 24 | continue 25 | within_socket_nghr = lc + (4 if (r >= num_local_gpus//2) else 0) 26 | index = lc * 2 27 | c = chunk(r, Buffer.input, index, 2) 28 | c.reduce(within_socket_nghr, buffer=Buffer.input, index=index) 29 | # cross-socket reduce_scatter 30 | for r in range(num_local_gpus): 31 | index = (r % (num_local_gpus//2)) * 2 32 | if r >= num_local_gpus // 2: 33 | index += 1 # Handle the odd chunk 34 | lc = chunk(r, Buffer.input, index) 35 | lc = lc.reduce((r+num_local_gpus//2) % num_local_gpus, buffer=Buffer.input, index=index) 36 | lc.send(r, Buffer.input, index, ch=1) # Reduce and send should be on different tbs 37 | # local all_gather 38 | for r in range(num_local_gpus): 39 | index = (r % (num_local_gpus//2)) * 2 40 | lc = chunk(r, Buffer.input, index, 2) 41 | for t in range(num_local_gpus//2): 42 | local_nghr = t + (num_local_gpus//2 if (r >= num_local_gpus//2) else 0) 43 | if local_nghr == r: 44 | continue 45 | lc.send(local_nghr, buffer=Buffer.input, index=index) 46 | XML() 47 | Check() 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument('instances', type=int, help='number of instances') 51 | args = parser.parse_args() 52 | allreduce(args.instances) 53 | -------------------------------------------------------------------------------- /examples/mscclang/allreduce_recursive_doubling_halving.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
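# [Editor's sketch -- not part of the original file.] In the vector-halving /
# distance-doubling reduce-scatter below, each rank pairs with rank ^ count and
# reduces the half of the vector owned by its peer. This standalone helper traces
# the (rank, peer, index, count) tuples for a hypothetical power-of-two `size`.
def reduce_scatter_trace(size):
    steps = []
    count = size // 2
    while count >= 1:
        for rank in range(size):
            peer = rank ^ count
            index = (peer // count) * count
            steps.append((rank, peer, index, count))
        count //= 2
    return steps

# e.g. for size=4 the first round pairs ranks (0,2), (1,3), (2,0), (3,1), each
# reducing a 2-chunk half of the buffer; the next round works on single chunks.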
3 | 4 | 5 | import argparse 6 | 7 | from msccl.language import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | def reduce_scatter_vector_halving_distance_doubling(size): 12 | count = size // 2 13 | while count >= 1: 14 | for rank in range(size): 15 | peer = rank ^ count 16 | index = ((peer // count) * count) 17 | c1 = chunk(rank, Buffer.input, index, size=count) 18 | chunk(peer, Buffer.output, index, size=count).reduce(c1, sendtb=peer, recvtb=rank, ch=0) 19 | count //= 2 20 | 21 | def allgather_recursive_vector_doubling_distance_halving(size): 22 | count = 1 23 | while count < size: 24 | for rank in range(size): 25 | peer = rank ^ count 26 | index = ((rank // count) * count) 27 | chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank, ch=0) 28 | count *= 2 29 | 30 | def allreduce(size, instances, protocol): 31 | topology = fully_connected(size) 32 | logical_chunk = size 33 | collective = AllReduce(size, logical_chunk, True) 34 | with MSCCLProgram("allreduce_recursive_doubling_halving", topology, collective, instances, protocol, 35 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual): 36 | reduce_scatter_vector_halving_distance_doubling(size) 37 | allgather_recursive_vector_doubling_distance_halving(size) 38 | XML() 39 | Check() 40 | 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 43 | parser.add_argument('instances', type=int, help='number of instances') 44 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL') 45 | args = parser.parse_args() 46 | allreduce(args.num_gpus, args.instances, args.protocol) 47 | -------------------------------------------------------------------------------- /examples/mscclang/alltoall_a100_two_step.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from msccl.language import * 4 | from msccl.topologies import * 5 | from msccl.language.collectives import AllToAll 6 | 7 | 8 | def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): 9 | num_ranks = num_nodes * gpus_per_node 10 | topology = fully_connected(num_ranks) 11 | collective = AllToAll(num_ranks, 1, inplace=False) 12 | 13 | 14 | with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol): 15 | for n1 in range(num_nodes): 16 | for r in range(1,num_nodes): 17 | n2 = (n1 + r) % num_nodes 18 | 19 | # Gather all local chunks for the node neighbor 20 | for g1 in range(gpus_per_node): 21 | rank1 = n1 * gpus_per_node + g1 22 | 23 | for g2 in range(gpus_per_node): 24 | rank2 = n1 * gpus_per_node + g2 25 | # chunk to copy: g2 on n2 26 | index = n2 * gpus_per_node + g2 27 | c = chunk(rank1, Buffer.input, index) 28 | c = c.copy(rank2, f'copy_{n2}') 29 | 30 | for r in range(1,num_nodes): 31 | n2 = (n1 + r) % num_nodes 32 | # IB copy 33 | for g1 in range(gpus_per_node): 34 | rank = n1 * gpus_per_node + g1 35 | ib_peer = n2 * gpus_per_node + g1 36 | c = chunk(rank, f'copy_{n2}', 0, gpus_per_node) 37 | c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % gpus_per_node)*2+(rank%2)+2) 38 | 39 | 40 | # Handle local chunks within a node 41 | for rank in range(num_ranks): 42 | for g in range(gpus_per_node): 43 | index = (rank // gpus_per_node) * gpus_per_node + g 44 | c = chunk(rank, Buffer.input, index) 45 | c.copy(c.get_dst_rank(), Buffer.output, 
c.get_dst_index()) 46 | 47 | XML() # Prints the XML 48 | Check() 49 | 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('num_nodes', type=int, help ='number of nodes') 53 | parser.add_argument('gpus_per_node', type=int, help ='gpus per node') 54 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 55 | args = parser.parse_args() 56 | 57 | 58 | alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.protocol) 59 | -------------------------------------------------------------------------------- /examples/mscclang/alltoall_allpairs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from msccl.language import * 4 | from msccl.topologies import * 5 | from msccl.language.collectives import AllToAll 6 | 7 | # One-step AllToAll program 8 | # Each gpu sends a chunk to, and receives a chunk from, every other gpu 9 | 10 | def alltoall(num_ranks, instances, protocol): 11 | topology = fully_connected(num_ranks) 12 | collective = AllToAll(num_ranks, 1, inplace=False) 13 | 14 | with MSCCLProgram("alltoall_allpairs", topology, collective, instances=instances, protocol=protocol): 15 | for r in range(num_ranks): 16 | for index in range(num_ranks): 17 | chunk(r, Buffer.input, index).copy(index, Buffer.output, r) 18 | XML() 19 | Check() 20 | 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 24 | parser.add_argument('instances', type=int, help ='number of instances') 25 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') 26 | args = parser.parse_args() 27 | 28 | alltoall(args.num_gpus, args.instances, args.protocol) 29 | -------------------------------------------------------------------------------- /examples/mscclang/alltonext_backward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
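# [Editor's note -- not part of the original file.] This "all-to-next" collective
# shifts every rank's buffer to rank-1; with `size` ranks and `chunks` chunks per
# rank its contract is roughly:
#   before: input[r][c] holds chunk (r, c)     for r in 1..size-1
#   after:  output[r][c] holds chunk (r+1, c)  for r in 0..size-2
# Cross-node hops are split into scatter / IB-copy / gather steps so the single
# inter-node link is shared by all local GPUs.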
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies.distributed import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import Collective 10 | 11 | class Pipeline(Collective): 12 | def init_buffers(self): 13 | chunks_per_node = self.chunk_factor 14 | rank_buffers = [] 15 | for r in range(self.num_ranks): 16 | input_buffer = [None] * chunks_per_node 17 | output_buffer = [None] * chunks_per_node 18 | if r != 0: 19 | for c in range(chunks_per_node): 20 | input_buffer[c] = Chunk(r, c, r-1, c) 21 | buffers = {Buffer.input : input_buffer, 22 | Buffer.output : output_buffer} 23 | rank_buffers.append(buffers) 24 | return rank_buffers 25 | 26 | 27 | # Final state chunks on rank(i) end up on rank(i-1) 28 | def check(self, prog): 29 | correct = True 30 | for r in range(0, self.num_ranks-1): 31 | output = prog.buffers[r][Buffer.output] 32 | for c in range(self.chunk_factor): 33 | chunk = output[c] 34 | if chunk is None or chunk.origin_rank != r+1 or chunk.origin_index != c: 35 | print(f'Rank {r} chunk {c} is incorrect should be ({r+1}, {c}) given {chunk}') 36 | correct = False 37 | return correct 38 | 39 | 40 | def pipeline(num_nodes, instances): 41 | num_local_gpus = 8 42 | chunks = num_local_gpus 43 | chunk_factor = chunks 44 | remote_bw = 1 45 | size = num_local_gpus * num_nodes 46 | topology = fully_connected(size) 47 | collective = Pipeline(size, chunk_factor, False) 48 | 49 | def rank(node, local_rank): 50 | return node * num_local_gpus + local_rank 51 | 52 | with MSCCLProgram("alltonext-backwards", topology, collective, instances): 53 | 54 | for n in range(num_nodes): 55 | for g in range(num_local_gpus): 56 | r = rank(n, g) 57 | 58 | # Do nothing for first gpu - end of pipeline 59 | if r == 0: 60 | continue 61 | 62 | # Cross node copy - cooperative 63 | if g == 0: 64 | for ch in range(chunks): 65 | c = chunk(r, Buffer.input, ch) 66 | if ch == 0: 67 | # 2 steps: IB copy to (node-1, g) then gather onto (node-1, num_local_gpus-1) 68 | c = c.copy(rank(n-1, ch), f's{n}->{n-1}', 0, ch=ch%2) 69 | elif ch == num_local_gpus-1: 70 | # 2 steps: Scatter - copy to (node, num_local_gpus-1), IB copy to (node-1, num_local_gpus-1) 71 | c = c.copy(rank(n, ch), f's{n}->{n-1}', 0, ch=ch%2) 72 | else: 73 | # 3 steps: Scatter - copy to (node, g), IB copy to (node-1, g), gather onto (node-1, num_local_gpus-1) 74 | c = c.copy(rank(n, ch), f's{n}->{n-1}', 0, ch=ch%2) 75 | c = c.copy(rank(n-1, ch), f's{n}->{n-1}', 0, ch=ch%2) 76 | c.copy(r-1, Buffer.output, c.get_dst_index(), ch=ch%2) 77 | 78 | # Normal copy - directly 79 | else: 80 | c = chunk(r, Buffer.input, 0, chunks) 81 | c.copy(r-1, Buffer.output, 0, ch=g%2) 82 | 83 | Check() 84 | XML() 85 | 86 | if __name__ == '__main__': 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('num_nodes', type=int, help ='number of nodes') 89 | parser.add_argument('instances', type=int, help ='number of instances') 90 | 91 | args = parser.parse_args() 92 | 93 | pipeline(args.num_nodes, args.instances) -------------------------------------------------------------------------------- /examples/mscclang/alltonext_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies.distributed import * 8 | from msccl.topologies import * 9 | from msccl.language.collectives import Collective 10 | 11 | class Pipeline(Collective): 12 | def init_buffers(self): 13 | chunks_per_node = self.chunk_factor 14 | rank_buffers = [] 15 | for r in range(self.num_ranks): 16 | input_buffer = [None] * chunks_per_node 17 | output_buffer = [None] * chunks_per_node 18 | if r != self.num_ranks -1: 19 | for c in range(chunks_per_node): 20 | # Chunk(starting rank, starting index, ending rank, ending index) 21 | input_buffer[c] = Chunk(r, c, r+1, c) 22 | buffers = {Buffer.input : input_buffer, 23 | Buffer.output : output_buffer} 24 | rank_buffers.append(buffers) 25 | return rank_buffers 26 | 27 | 28 | # Final state chunks on rank(i) end up on rank(i+1) 29 | def check(self, prog): 30 | correct = True 31 | for r in range(1, self.num_ranks): 32 | output = prog.buffers[r][Buffer.output] 33 | for c in range(self.chunk_factor): 34 | chunk = output[c] 35 | # Check we got the previous rank's chunks 36 | if chunk is None or chunk.origin_rank != r-1 or chunk.origin_index != c: 37 | print(f'Rank {r} chunk {c} is incorrect should be ({r-1}, {c}) given {chunk}') 38 | correct = False 39 | return correct 40 | 41 | 42 | def pipeline(num_nodes, instances): 43 | num_local_gpus = 8 44 | chunks = num_local_gpus 45 | total_chunks_per_loop = chunks 46 | remote_bw = 1 47 | size = num_local_gpus * num_nodes 48 | topology = fully_connected(size) 49 | collective = Pipeline(size, total_chunks_per_loop, True) 50 | 51 | def rank(node, local_rank): 52 | return node * num_local_gpus + local_rank 53 | 54 | with MSCCLProgram("alltonext-forward", topology, collective, instances): 55 | 56 | for n in range(num_nodes): 57 | for g in range(num_local_gpus): 58 | r = rank(n, g) 59 | 60 | # Do nothing for last gpu - end of pipeline 61 | if r == size - 1: 62 | continue 63 | 64 | # Cross node copy - cooperative 65 | if g == num_local_gpus -1: 66 | for ch in range(chunks): 67 | c = chunk(r, Buffer.input, ch) 68 | if ch == 0: # 2 steps: Scatter - copy to (node, 0), IB copy to (node+1, 0) 69 | c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) 70 | 71 | elif ch == num_local_gpus-1: 72 | # 2 steps: IB copy to (node+1, g) then gather onto (node+1, 0) 73 | c = c.copy(rank(n+1, ch), f's{n}->{n+1}', 0, ch=ch%2) 74 | else: 75 | # 3 steps: Scatter - copy to (node, g), IB copy to (node+1, g), gather onto (node+1, 0) 76 | c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) 77 | c = c.copy(rank(n+1, ch), f's{n}->{n+1}', 0, ch=ch%2) 78 | 79 | c.copy(r+1, Buffer.output, c.get_dst_index(), ch=ch%2) 80 | 81 | # Normal copy - directly 82 | else: 83 | c = chunk(r, Buffer.input, 0, chunks) 84 | c.copy(r+1, Buffer.output, 0, ch=g%2) 85 | 86 | Check() 87 | XML() 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('num_nodes', type=int, help ='number of nodes') 92 | parser.add_argument('instances', type=int, help ='number of instances') 93 | 94 | args = parser.parse_args() 95 | 96 | pipeline(args.num_nodes, args.instances) -------------------------------------------------------------------------------- /examples/mscclang/hierarchical_allreduce.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
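# [Editor's note -- not part of the original file.] The schedule below is the
# BlueConnect decomposition: four ring phases chained together.
#   1. reduce-scatter inside each node  (rings of num_local_gpus ranks)
#   2. reduce-scatter across nodes      (rings of num_nodes ranks, one per local gpu)
#   3. all-gather across nodes          (reverse of 2)
#   4. all-gather inside each node      (reverse of 1)
# With schedule='manual' each phase is pinned to explicit channels; with 'auto'
# the channel assignment is left to MSCCLang.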
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.language.collectives import AllReduce 9 | 10 | # Blue Connect style AllReduce https://proceedings.mlsys.org/paper/2019/file/9b8619251a19057cff70779273e95aa6-Paper.pdf 11 | # Assumes only two-level switches 12 | 13 | def ring_reduce_scatter(size, rank_offset=0, rank_step=1, local_chunk_size=1, chunk_offset=0, chunk_stride=1, chan=-1): 14 | for ch in range(0, size): 15 | index = ch * chunk_stride * local_chunk_size + chunk_offset 16 | for step in range(0, size-1): 17 | other = chunk(((step+1+ch) % size)*rank_step +rank_offset, Buffer.input, index, local_chunk_size) 18 | c = chunk(((step+2+ch) % size)*rank_step+rank_offset, Buffer.input, index, local_chunk_size) 19 | c.reduce(other, ch=chan) 20 | 21 | def ring_all_gather(size, rank_offset=0, rank_step=1, local_chunk_size=1, chunk_offset=0, chunk_stride=1, chan=-1): 22 | for ch in range(0, size): 23 | index = ch * chunk_stride * local_chunk_size + chunk_offset 24 | for step in range(0, size-1): 25 | c = chunk(((step+ch) % size)*rank_step + rank_offset, Buffer.input, index, local_chunk_size) 26 | c.copy(((step+1+ch) % size)*rank_step + rank_offset, Buffer.input, index, ch=chan) 27 | 28 | def hierarchical_allreduce(num_local_gpus, num_nodes, instances, protocol, schedule): 29 | num_gpus = num_local_gpus * num_nodes 30 | topology = fully_connected(num_gpus) 31 | collective = AllReduce(num_gpus, num_gpus, True) 32 | 33 | with MSCCLProgram("hierarchical_allreduce", topology, collective, instances, protocol=protocol, 34 | interleaved_replication=False): 35 | 36 | local_chunk_size = num_nodes 37 | if schedule == 'auto': 38 | for n in range(num_nodes): 39 | for offset in range(num_nodes): 40 | ring_reduce_scatter(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes) 41 | 42 | # Cross node Reduce-Scatter 43 | for g in range(num_local_gpus): 44 | ring_reduce_scatter(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes) 45 | 46 | # Cross node All-gather 47 | for g in range(num_local_gpus): 48 | ring_all_gather(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes) 49 | 50 | 51 | # All gather within each node 52 | for n in range(num_nodes): 53 | for offset in range(num_nodes): 54 | ring_all_gather(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes) 55 | 56 | else: 57 | # Reduce Scatter within each node 58 | for n in range(num_nodes): 59 | for offset in range(num_nodes): 60 | ring_reduce_scatter(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes, chan=offset) 61 | 62 | # Cross node Reduce-Scatter 63 | for g in range(num_local_gpus): 64 | ring_reduce_scatter(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes, chan=g%2+num_nodes*2) 65 | 66 | # Cross node All-gather 67 | for g in range(num_local_gpus): 68 | ring_all_gather(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes, chan=g%2+num_nodes*2) 69 | 70 | 71 | # All gather within each node 72 | for n in range(num_nodes): 73 | for offset in range(num_nodes): 74 | ring_all_gather(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes, chan=offset+num_nodes) 75 | 76 | XML() 77 | Check() 78 | 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('num_gpus', type=int, help='number of gpus per node') 81 | parser.add_argument('num_nodes', 
type=int, help='number of nodes') 82 | parser.add_argument('instances', type=int, help='number of instances') 83 | parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL128', 'LL'], help='Protocol') 84 | parser.add_argument('--schedule', type=str, default='auto', choices=['auto', 'manual'], help='Scheduling') 85 | 86 | args = parser.parse_args() 87 | 88 | hierarchical_allreduce(args.num_gpus, args.num_nodes, args.instances, args.protocol, args.schedule) 89 | 90 | -------------------------------------------------------------------------------- /examples/mscclang/pipeline_a100_allpairs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | def allreduce_allpairs(gpus, instances, protocol): 10 | size = gpus 11 | chunksperloop = gpus * gpus 12 | topology = fully_connected(2*size) 13 | collective = AllReduce(2*size, chunksperloop, True) 14 | with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, 15 | interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): 16 | 17 | # Each rank sends the nth chunk to the nth rank into scratch space 18 | for r1 in range(size): 19 | for r2 in range(size): 20 | if r1 != r2: 21 | index = r2 * size 22 | c = chunk(r1, Buffer.input, index, size=size) 23 | c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) 24 | 25 | # Each rank performs a local reduction on the nth chunk 26 | # Utilize 8 threadblocks for this reduction for better parallelism 27 | for r in range(size): 28 | for index in range(0, size * (size-1)): 29 | c = chunk(r, Buffer.input, r*size + (index % size)) 30 | c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) 31 | c = chunk(r, Buffer.input, r*size, size=size) 32 | c.copy(r+size, Buffer.input, r*size, ch=r%2) 33 | 34 | # Each rank sends the fully reduced nth chunk to all other gpus 35 | for r1 in range(size): 36 | for r2 in range(size): 37 | if r1 != r2: 38 | index = r1 * size 39 | c = chunk(r1+size, Buffer.input, index, size) 40 | c.copy(r2+size, Buffer.input, index, sendtb=r2, recvtb=r1) 41 | 42 | XML() 43 | #Check() 44 | 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 47 | parser.add_argument('instances', type=int, help='number of instances') 48 | parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') 49 | 50 | args = parser.parse_args() 51 | 52 | allreduce_allpairs(args.num_gpus, args.instances, args.protocol) -------------------------------------------------------------------------------- /examples/mscclang/pipeline_a100_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | from msccl.language import * 6 | from msccl.topologies import * 7 | from msccl.language.collectives import AllReduce 8 | 9 | # Ring all reduce for A100s 10 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
11 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel 12 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data 13 | def allreduce_ring(size, instances, channels, protocol): 14 | topology = fully_connected(2*size) 15 | collective = AllReduce(2*size, size, True) 16 | with MSCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, 17 | protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): 18 | # Reduce ring 19 | for step in range(0, size-1): 20 | for index in range(0, size): 21 | rank = (index + step) % size 22 | next_rank = (index + step + 1) % size 23 | channel = index%channels 24 | c = chunk(next_rank, Buffer.input, index) 25 | c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel) 26 | # Propagate ring 27 | for index in range(0, size): 28 | rank = (index - 1) % size 29 | c = chunk(rank, Buffer.input, index) 30 | c.copy(rank+size, Buffer.input, index, ch=rank%2) 31 | for step in range(-1, size-2): 32 | for index in range(0, size): 33 | rank = (index + step) % size 34 | c = chunk(rank+size, Buffer.input, index) 35 | next_rank = (index + step + 1) % size 36 | channel = index%channels 37 | c = c.copy(next_rank+size, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) 38 | 39 | XML() 40 | # Check() 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 44 | parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') 45 | parser.add_argument('instances', type=int, help='number of instances') 46 | parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') 47 | args = parser.parse_args() 48 | 49 | allreduce_ring(args.num_gpus, args.instances, args.channels, args.protocol) 50 | -------------------------------------------------------------------------------- /examples/mscclang/reducegather.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
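# [Editor's note -- not part of the original file.] ReduceGather is a custom
# collective: ranks are partitioned into `groups`, each group reduces its inputs
# into one chunk per member (a per-group reduce-scatter), and a ring all-gather
# then leaves every rank holding all num_ranks reduced chunks in its output buffer.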
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.language.collectives import Collective 9 | 10 | class ReduceGather(Collective): 11 | def __init__(self, num_ranks, chunk_factor, inplace, groups): 12 | Collective.__init__(self, num_ranks, chunk_factor, inplace) 13 | self.groups = groups 14 | self.gpus_per_group = num_ranks // groups 15 | assert chunk_factor == 1, "Only supports chunk_factor == 1" 16 | 17 | def init_buffers(self): 18 | assert self.chunk_factor == 1 19 | rank_buffers = [] 20 | chunks_per_node = self.num_ranks 21 | for r in range(self.num_ranks): 22 | input_buffer = [None] * self.gpus_per_group 23 | output_buffer = [None] * chunks_per_node 24 | for c in range(self.groups): 25 | input_buffer[c] = Chunk(r, c, -1, c) 26 | buffers = {Buffer.input : input_buffer, 27 | Buffer.output : output_buffer} 28 | rank_buffers.append(buffers) 29 | return rank_buffers 30 | 31 | 32 | def check(self, prog): 33 | expected_chunks = [] 34 | for r in range(self.num_ranks): 35 | chunk = ReduceChunk([]) 36 | for x in range(self.groups): 37 | y = r // self.groups 38 | next = y * self.groups + x 39 | chunk = chunk.reduce(Chunk(next, r % self.gpus_per_group)) 40 | expected_chunks.append(chunk) 41 | 42 | correct = True 43 | for r in range(self.num_ranks): 44 | output = prog.buffers[r][Buffer.output] 45 | for c in range(self.num_ranks): 46 | chunk = output[c] 47 | if chunk is None or chunk != expected_chunks[c]: 48 | print(f'Rank {r} chunk {c} is incorrect should be {expected_chunks[c]} given {chunk}') 49 | correct = False 50 | return correct 51 | 52 | 53 | def program(num_ranks, groups, instances, protocol): 54 | gpus_per_group = num_ranks // groups 55 | topology = fully_connected(num_ranks) 56 | chunk_factor = 1 57 | inplace = False 58 | collective = ReduceGather(num_ranks, chunk_factor, inplace, groups) 59 | 60 | with MSCCLProgram("reduce-gather", topology, collective, instances, protocol, threadblock_policy=ThreadblockPolicy.manual): 61 | 62 | # Per group reduce scatter 63 | for y in range(groups): 64 | for x in range(gpus_per_group): 65 | output_index = y * groups + x 66 | input_index = x 67 | gpu = y * groups + (x+1) % gpus_per_group 68 | c = chunk(gpu, Buffer.input, input_index) 69 | # Use the input buffer to perform the reduction within each group 70 | for x_ in range(1, gpus_per_group): 71 | c = c.reduce(y * groups + (x + 1 + x_) % gpus_per_group, Buffer.input, input_index, sendtb=0, recvtb=0, ch=0) 72 | # Copy reduced chunk into the output buffer 73 | c = c.send(c.rank, Buffer.output, output_index, sendtb=0, recvtb=0, ch=0) 74 | 75 | 76 | # Ring Allgather 77 | for r in range(num_ranks): 78 | c = chunk(r, Buffer.output, r) 79 | next = (r + 1) % num_ranks 80 | while next != r: 81 | c = c.send(next, Buffer.output, r, sendtb=1, recvtb=1, ch=1) 82 | next = (next + 1) % num_ranks 83 | 84 | Check() 85 | XML() 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('num_ranks', type=int, help ='number of ranks') 90 | parser.add_argument('groups', type=int, help='number of reduction groups') 91 | parser.add_argument('--instances', type=int, default=1, help='number of instances') 92 | parser.add_argument('--protocol', type=str, default='Simple', 93 | choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol') 94 | args = parser.parse_args() 95 | 96 | assert args.num_ranks % args.groups == 0 97 | 98 | program(args.num_ranks, args.groups, args.instances, args.protocol) 99 | 
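# Example invocation (editor's note, not part of the original file; arguments as
# defined by the argparse above, using a square configuration where
# groups == num_ranks // groups):
#   python reducegather.py 4 2 --instances 1 --protocol Simple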
-------------------------------------------------------------------------------- /examples/mscclang/simple/allgather_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.language import * 5 | from msccl.topologies import * 6 | from msccl.language.collectives import AllGather 7 | 8 | def allgather_ring(size): 9 | topology = fully_connected(size) 10 | collective = AllGather(size, 1, False) 11 | with MSCCLProgram("allgather_ring", topology, collective, 1): 12 | # Loop over each chunk's root 13 | for r in range(size): 14 | # Get the chunk at rank r, input[r] 15 | c = chunk(r, Buffer.input, 0) 16 | # Copy chunk to the output buffer 17 | c = c.copy(r, buffer=Buffer.output, index=r, sendtb=0) 18 | 19 | next = (r + 1) % size 20 | while next != r: 21 | # For each rank in the ring, send the chunk to the next rank 22 | # Setting the sender's and receiver's tbs to the same value would let send/receives on the 23 | # same rank be merged into a receive-copy-send; here the assignment is left to MSCCLang 24 | c = c.copy(next, buffer=Buffer.output, index=r) 25 | next = (next + 1) % size 26 | XML() 27 | Check() 28 | 29 | def allgather_ring_inplace(size): 30 | topology = fully_connected(size) 31 | collective = AllGather(size, 1, True) 32 | with MSCCLProgram("allgather_ring", topology, collective, 1): 33 | # Loop over each chunk's root 34 | for r in range(size): 35 | # Get the chunk at rank r, input[r] 36 | c = chunk(r, Buffer.input, 0) 37 | 38 | next = (r + 1) % size 39 | while next != r: 40 | # For each rank in the ring, send the chunk to the next rank 41 | # Setting the sender's and receiver's tbs to the same value would let send/receives on the 42 | # same rank be merged into a receive-copy-send; here the assignment is left to MSCCLang 43 | c = c.copy(next, buffer=Buffer.output, index=r) 44 | next = (next + 1) % size 45 | XML() 46 | Check() 47 | 48 | allgather_ring(4) 49 | # allgather_ring_inplace(4) -------------------------------------------------------------------------------- /examples/mscclang/simple/allreduce_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
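# [Editor's sketch -- not part of the original file.] The in-place ring allreduce
# below reduces chunk r once around the ring and then copies the result around it
# again. This pure-Python helper traces which rank holds chunk r's partial sum
# after each reduce hop, for a hypothetical `size`.
def ring_reduce_path(size, r):
    path = [r]
    nxt = (r + 1) % size
    while nxt != r:
        path.append(nxt)
        nxt = (nxt + 1) % size
    return path  # chunk r is fully reduced once it reaches path[-1]

# ring_reduce_path(4, 1) == [1, 2, 3, 0]: chunk 1 is fully reduced on rank 0,
# which then forwards it around the ring to ranks 1, 2 and 3.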
3 | 4 | import argparse 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.collectives import * 9 | from msccl.language.collectives import AllReduce 10 | 11 | 12 | def allreduce_ring(size, instances): 13 | # Logical topology 14 | topology = fully_connected(size) 15 | collective = AllReduce(size, size, inplace=True) 16 | 17 | with MSCCLProgram("allreduce_ring_inplace", topology, collective, instances): 18 | for r in range(size): 19 | index = r 20 | # (rank, buffer, index) 21 | c = chunk(r, Buffer.input, index) 22 | next = (r + 1) % size 23 | # Chunk travels around the ring being reduced 24 | while next != r: 25 | c1 = chunk(next, buffer=Buffer.input, index=r) 26 | # c1 += c 27 | c = c1.reduce(c) 28 | next = (next + 1) % size 29 | 30 | # Send the fully reduced chunk around the ring 31 | while next != (r - 1) % size: 32 | c = c.copy(next, buffer=Buffer.input, index=r) 33 | next = (next + 1) % size 34 | 35 | Check() 36 | XML() 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('num_gpus', type=int, help ='number of gpus') 40 | parser.add_argument('instances', type=int, help='number of instances') 41 | 42 | args = parser.parse_args() 43 | 44 | allreduce_ring(args.num_gpus, args.instances) 45 | -------------------------------------------------------------------------------- /examples/mscclang/simple/custom_collective.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # Example of a simple custom collective where Rank 0 sends a chunk to Ranks 1 and 2 5 | 6 | from msccl.language import * 7 | from msccl.topologies import * 8 | from msccl.language.collectives import Collective 9 | 10 | # For custom collectives you need to define a new collective class 11 | # this is used by mscclang to initialize buffers with chunks (pre-condition) 12 | # and provide a checker to check that chunks satisfy the post-condition of the collective. 
13 | class CollEx(Collective): 14 | # Initial state: chunk0 is on rank0 in the input buffer 15 | def init_buffers(self): 16 | chunks_per_node = self.chunk_factor 17 | rank_buffers = [] 18 | for r in range(self.num_ranks): 19 | input_buffer = [None] * chunks_per_node 20 | output_buffer = [None] * chunks_per_node 21 | if r == 0: 22 | for c in range(chunks_per_node): 23 | # Format for specifying a chunk 24 | # Chunk(starting rank, starting index, ending rank, ending index) 25 | # Because this chunk ends up on multiple ranks ending rank is set to -1 26 | input_buffer[c] = Chunk(r, c, -1, c) 27 | buffers = {Buffer.input : input_buffer, 28 | Buffer.output : output_buffer} 29 | rank_buffers.append(buffers) 30 | return rank_buffers 31 | 32 | 33 | # Final state chunk0 from rank0 is in the output buffer of rank1 and rank2 34 | def check(self, prog): 35 | correct = True 36 | for r in range(1, self.num_ranks): 37 | output = prog.buffers[r][Buffer.output] 38 | for c in range(self.chunk_factor): 39 | chunk = output[c] 40 | # Check that we got chunk 0 from rank 0 41 | if chunk is None or chunk.origin_rank != 0 or chunk.origin_index != 0: 42 | print(f'Rank {r} chunk {c} is incorrect should be ({0}, {0}) given {chunk}') 43 | correct = False 44 | return correct 45 | 46 | 47 | def custom_example1(): 48 | # MSCCLang programs take in a name for the program, the topology of the network, 49 | # collective being implemented, chunksperloop of the collective, and optionally the NCCL protocol to be used 50 | size = 3 51 | topology = fully_connected(size) 52 | # Collectives take in number of ranks in the network, chunksperloop of the collective, whether it is inplace. 53 | collective = CollEx(size, 1, inplace=False) 54 | with MSCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): 55 | # Get the chunk at rank 0 index 0 of the input buffer 56 | c = chunk(0, Buffer.input, 0) 57 | # Copy chunks to ranks 1 and 2 58 | # Can specify the sender's tb, receiver's tb, and channel for the send operation 59 | # MSCCLang provides a default threadblock assignment if they aren't specified 60 | # MSCCLang will also check the tb/channel combos are valid 61 | c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) 62 | c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) 63 | 64 | XML() # Generates the XML for this collective 65 | Check() # Checks the routes defined for each chunk are correct. 
Currently it does not check that the generated XML is correct 66 | 67 | def custom_example2(): 68 | 69 | size = 3 70 | topology = fully_connected(size) 71 | 72 | collective = CollEx(size, 1, inplace=False) 73 | with MSCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): 74 | c = chunk(0, Buffer.input, 0) 75 | # This is the same program as above but instead of rank 0 sending to 1 and 2 76 | # 0 sends to 1 which sends to 2 77 | # copy returns the chunk on the receiver's side 78 | c = c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) 79 | c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) 80 | 81 | XML() 82 | Check() 83 | 84 | custom_example1() 85 | custom_example2() 86 | -------------------------------------------------------------------------------- /examples/requirements_sccl_init.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/parasailteam/msccl-presynth -------------------------------------------------------------------------------- /examples/sccl_init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | def show(): 7 | if 'MSCCL_CONFIG' in os.environ: 8 | print() 9 | print(f"MSCCL_CONFIG = {os.environ['MSCCL_CONFIG']}") 10 | print(f"Contents of {os.environ['MSCCL_CONFIG']}:") 11 | with open(os.environ['MSCCL_CONFIG']) as f: 12 | print(f.read()) 13 | print() 14 | 15 | 16 | print('=== Trigger a builtin synthesis plan ===') 17 | 18 | import msccl 19 | msccl.init('ndv4', 9, (msccl.Collective.alltoall, '1GB')) 20 | 21 | show() 22 | 23 | 24 | print('=== Register additional plans from a library ===') 25 | 26 | import msccl_presynth 27 | msccl.init('ndv2', 3, 28 | (msccl.Collective.alltoall, '1GB'), 29 | (msccl.Collective.allgather, (128, '1KB'))) 30 | 31 | show() 32 | 33 | 34 | print('=== Register custom plans ===') 35 | 36 | from msccl.autosynth.registry import register_synthesis_plan 37 | 38 | @register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1MB', None)) 39 | def alltoall_9000(machines): 40 | return """ 41 | ... 
42 | """ 43 | 44 | msccl.init('ndv9000', 1, (msccl.Collective.alltoall, '2MB')) 45 | 46 | show() 47 | 48 | 49 | print('=== Overlapping size ranges ===') 50 | 51 | register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) 52 | register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) 53 | 54 | msccl.init('ndv9000', 1, (msccl.Collective.alltoall, ('2KB', None))) 55 | 56 | show() 57 | 58 | 59 | # TODO: Update the following programs to use the new syntax 60 | # print('=== MSCCLang program ===') 61 | 62 | # from msccl.autosynth.registry import register_msccl_program 63 | # from msccl.topologies import line 64 | # from msccl.language import * 65 | 66 | # @register_msccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) 67 | # def trivial_allgather(prog, nodes): 68 | # chunk(Buffer.input, 0, 0).send(0, Buffer.output, 0).send(1) 69 | # chunk(Buffer.input, 1, 0).send(1, Buffer.output, 1).send(0) 70 | 71 | # msccl.init('two_gpus', 1, (msccl.Collective.allgather, (0, None))) 72 | 73 | # show() 74 | 75 | 76 | # print('=== MSCCLang program example ====') 77 | 78 | # from msccl.topologies import fully_connected 79 | # from msccl.programs.allreduce_a100_ring import allreduce_ring 80 | 81 | # @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 82 | # instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines=lambda x: x == 1) 83 | # def ndv4_ring_allreduce(prog, nodes): 84 | # allreduce_ring(size=8, channels=8) 85 | 86 | # msccl.init('ndv4', 1, (msccl.Collective.allreduce, (0, None))) 87 | 88 | # show() -------------------------------------------------------------------------------- /examples/send.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # This script defines and saves a custom collective to send from rank 2 to rank 7 5 | 6 | from msccl.collectives import build_collective 7 | from msccl.serialization import save_msccl_object 8 | 9 | precondition = lambda r, c: r == 2 10 | postcondition = lambda r, c: r == 7 11 | coll = build_collective('Send', 8, 1, precondition, postcondition) 12 | save_msccl_object(coll, 'send.json') 13 | -------------------------------------------------------------------------------- /examples/unpermute_dgx1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # This script shows how to use MSCCL to find a way to permute the nodes of a DGX1 to match the default order. 5 | 6 | from msccl.topologies import * 7 | from msccl.isomorphisms import find_isomorphisms 8 | 9 | def solve_dgx1_permutation(): 10 | local = nvlink_only() 11 | isomorphisms = find_isomorphisms(dgx1(), local, limit=4) 12 | if len(isomorphisms) == 0: 13 | raise RuntimeError('No isomorphism to DGX1 found') 14 | return isomorphisms 15 | print(solve_dgx1_permutation()) 16 | -------------------------------------------------------------------------------- /msccl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from msccl.autosynth import init, tabulate_plans, print_plans 5 | from msccl.autosynth import ndv2_perm 6 | from msccl.autosynth import Collective 7 | -------------------------------------------------------------------------------- /msccl/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # PYTHON_ARGCOMPLETE_OK 3 | 4 | # Copyright (c) Microsoft Corporation. 5 | # Licensed under the MIT License. 6 | 7 | import msccl.collectives as collectives 8 | import msccl.topologies as topologies 9 | import msccl.strategies as strategies 10 | from msccl.cli import * 11 | 12 | import argparse 13 | import argcomplete 14 | import sys 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser('msccl') 18 | 19 | cmd_parsers = parser.add_subparsers(title='command', dest='command') 20 | cmd_parsers.required = True 21 | 22 | handlers = [] 23 | handlers.append(make_solvers(cmd_parsers)) 24 | handlers.append(make_composers(cmd_parsers)) 25 | handlers.append(make_distributors(cmd_parsers)) 26 | handlers.append(make_analyses(cmd_parsers)) 27 | handlers.append(make_handle_ncclize(cmd_parsers)) 28 | handlers.append(make_plans(cmd_parsers)) 29 | 30 | argcomplete.autocomplete(parser) 31 | args = parser.parse_args() 32 | 33 | for handler in handlers: 34 | if handler(args, args.command): 35 | break 36 | 37 | if __name__ == '__main__': 38 | main() 39 | -------------------------------------------------------------------------------- /msccl/algorithm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from dataclasses import dataclass 5 | from collections import defaultdict 6 | 7 | @dataclass 8 | class Step(object): 9 | rounds: int 10 | sends: list 11 | 12 | class Algorithm(object): 13 | def __init__(self, name, collective, topology, instance, steps, input_map = {}, output_map = {}): 14 | self.name = name 15 | self.topology = topology 16 | self.collective = collective 17 | self.instance = instance 18 | self.steps = steps 19 | self.input_map = input_map 20 | self.output_map = output_map 21 | 22 | self._update_link_utilizations() 23 | self._check_bandwidth_constraints() 24 | 25 | for step in self.steps: 26 | step.sends.sort() 27 | 28 | @classmethod 29 | def make_implementation(cls, collective, topology, instance, steps): 30 | chunked = collective.chunk_up(instance.chunks) 31 | 32 | # Figure out input and output addresses 33 | input_map = {} 34 | output_map = {} 35 | for rank in chunked.ranks(): 36 | input_addrs = set() 37 | output_addrs = set() 38 | for chunk in chunked.chunks(): 39 | # An address is an input address if any of its chunks is in the precondition 40 | if chunked.precondition(rank, chunk): 41 | input_addrs.add(chunked.address(chunk)) 42 | # An address is an output address if any of its chunks is in the postcondition 43 | if chunked.postcondition(rank, chunk): 44 | output_addrs.add(chunked.address(chunk)) 45 | if len(input_addrs) > 0: 46 | input_map[rank] = input_addrs 47 | if len(output_addrs) > 0: 48 | output_map[rank] = output_addrs 49 | 50 | # Concatenate collective and topology names plus instance arguments to create a name 51 | name = f'{collective.name}-{topology.name}-{instance}' 52 | 53 | algo = cls(name, collective, topology, instance, steps, input_map, output_map) 54 | algo.check_implements(chunked) 55 | if instance.extra_rounds > 0: 56 | used_extra_rounds = algo.extra_rounds() 57 | if 
used_extra_rounds > instance.extra_rounds: 58 | raise ValueError(f'steps use {used_extra_rounds} extra rounds but only {instance.extra_rounds} were allowed') 59 | return algo 60 | 61 | def ranks(self): 62 | return range(self.topology.num_nodes()) 63 | 64 | def num_steps(self): 65 | return len(self.steps) 66 | 67 | def extra_rounds(self): 68 | rounds = 0 69 | for step in self.steps: 70 | rounds += step.rounds 71 | return rounds - self.num_steps() 72 | 73 | def is_pipelined(self): 74 | return self.instance.pipeline != None 75 | 76 | def check_implements(self, collective): 77 | if self.topology.num_nodes() != collective.num_nodes: 78 | raise RuntimeError('topology and collective have different number of nodes') 79 | # Find which chunks will be sent from an address 80 | chunks_at_address = defaultdict(list) 81 | for chunk in collective.chunks(): 82 | chunks_at_address[collective.address(chunk)].append(chunk) 83 | # State records if a rank holds a chunk 84 | def idx(rank, chunk): 85 | return rank * collective.num_chunks + chunk 86 | state = [False] * (collective.num_nodes * collective.num_chunks) 87 | # Initialize state from precondition 88 | for rank in collective.ranks(): 89 | for chunk in collective.chunks(): 90 | state[idx(rank, chunk)] = collective.precondition(rank, chunk) 91 | # Propagate state through sends of every step 92 | for step in self.steps: 93 | next_state = state.copy() 94 | for addr, src, dst in step.sends: 95 | for chunk in chunks_at_address[addr]: 96 | next_state[idx(dst, chunk)] |= state[idx(src, chunk)] 97 | state = next_state 98 | # Check that the postcondition holds 99 | for rank in collective.ranks(): 100 | for chunk in collective.chunks(): 101 | if collective.postcondition(rank, chunk) and not state[idx(rank, chunk)]: 102 | raise RuntimeError(f'rank {rank} does not get chunk {chunk} as required by the postcondition') 103 | 104 | def _update_link_utilizations(self): 105 | self._link_utilizations = [] 106 | ranks = range(self.topology.num_nodes()) 107 | for step in self.steps: 108 | step_utilizations = [[0 for _ in ranks] for _ in ranks] 109 | for addr, src, dst in step.sends: 110 | step_utilizations[dst][src] += 1 # Same order as topology 111 | self._link_utilizations.append(step_utilizations) 112 | 113 | def _check_bandwidth_constraints(self): 114 | for srcs, dsts, bw, name in self.topology.bandwidth_constraints(): 115 | for step_num, step in enumerate(self.steps): 116 | util = 0 117 | for dst in dsts: 118 | for src in srcs: 119 | if self.is_pipelined(): 120 | for overlapping_step in range(step_num, len(self.steps), self.instance.pipeline): 121 | util += self._link_utilizations[overlapping_step][dst][src] 122 | else: 123 | util += self._link_utilizations[step_num][dst][src] 124 | assert util <= bw * step.rounds, \ 125 | f'Step {step_num} uses {util} bandwidth but constraint {name} only allows for {bw * step.rounds} bandwidth (when rounds={step.rounds}).' 
126 | 127 | def __str__(self): 128 | s = '' 129 | for i, step in enumerate(self.steps): 130 | if i != 0: 131 | s += '\n' 132 | if step.rounds > 1: 133 | s += f'(step {i+1}, rounds={step.rounds}) ' 134 | else: 135 | s += f'(step {i+1}) ' 136 | s += ', '.join([f'{chunk}:{src}→{dst}' for chunk, src, dst in step.sends]) 137 | return s 138 | -------------------------------------------------------------------------------- /msccl/autosynth/msccl_ndv2_launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -c "import msccl; msccl.ndv2_perm()" 3 | order=/var/lock/msccl_autosynth_inspector_topo.lock 4 | if [ -f "$order" ]; then 5 | export CUDA_VISIBLE_DEVICES=$(<$order) 6 | fi 7 | $@ 8 | -------------------------------------------------------------------------------- /msccl/autosynth/ndv2_plans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.autosynth.registry import register_synthesis_plan 5 | from msccl.collectives import gather, scatter 6 | from msccl.strategies import solve_least_steps 7 | from msccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall 8 | from msccl.ncclize import ncclize 9 | from msccl.topologies import dgx1 10 | 11 | 12 | def register_ndv2_plans(): 13 | @register_synthesis_plan('alltoall', 'ndv2', machines=lambda m: m >= 2) 14 | def synthesize_ndv2_relay_alltoall(machines): 15 | gather_coll = gather(8, 0) 16 | scatter_coll = scatter(8, 1) 17 | gather_algo = solve_least_steps(dgx1(), gather_coll) 18 | scatter_algo = solve_least_steps(dgx1(), scatter_coll) 19 | algo = synthesize_gather_scatter_distributed_alltoall( 20 | machines, gather_algo, scatter_algo) 21 | return ncclize(algo, instances=8) -------------------------------------------------------------------------------- /msccl/autosynth/ndv4_plans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.autosynth.registry import register_synthesis_plan, register_msccl_program 5 | from msccl.programs.allreduce_a100_ring import allreduce_ring 6 | from msccl.programs.allreduce_allpairs import allreduce_allpairs 7 | from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical 8 | from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step 9 | from msccl.topologies import fully_connected 10 | from msccl.language.ir import ThreadblockPolicy 11 | 12 | def register_ndv4_plans(): 13 | 14 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, 15 | instances=2, protocol='LL', sizes=('512B', '82944B'), threadblock_policy=ThreadblockPolicy.manual, interleaved_replication=False, dependence_nop=True, machines= lambda x: x == 1) 16 | def ndv4_allpairs_allreduce_config1(prog, nodes): 17 | allreduce_allpairs(8) 18 | 19 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, 20 | instances=4, protocol='LL', sizes=('82944B', '458752B'), threadblock_policy=ThreadblockPolicy.manual, interleaved_replication=False, dependence_nop=True, machines= lambda x: x == 1) 21 | def ndv4_allpairs_allreduce_config2(prog, nodes): 22 | allreduce_allpairs(8) 23 | 24 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 25 | instances=8, protocol='LL', sizes=('458752B', '2129920B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) 26 | def ndv4_ring_allreduce_config1(prog, nodes): 27 | allreduce_ring(size=8, channels=4) 28 | 29 | @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 30 | instances=8, protocol='LL128', sizes=('2129920B', '22806528B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) 31 | def ndv4_ring_allreduce_config2(prog, nodes): 32 | allreduce_ring(size=8, channels=4) 33 | 34 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32 or x == 64) 35 | def ndv4_alltoall_hierarchical_config1(prog, nodes): 36 | 
alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) 37 | 38 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 8 or x == 16 or x == 32) 39 | def ndv4_alltoall_hierarchical_config2(prog, nodes): 40 | alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) 41 | 42 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) 43 | def ndv4_alltoall_three_step(prog, nodes): 44 | alltoall_three_step(num_nodes=nodes, gpus_per_node=8) 45 | 46 | @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('1KB', None), machines=lambda x: x == 2 or x == 4) 47 | def ndv4_alltoall_hierarchical_config3(prog, nodes): 48 | alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) 49 | 50 | 51 | -------------------------------------------------------------------------------- /msccl/autosynth/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from collections import defaultdict 5 | import math 6 | import tempfile 7 | import os 8 | import atexit 9 | import humanfriendly 10 | 11 | from msccl.language import MSCCLProgram, ir_to_xml 12 | from msccl.language.ir import ThreadblockPolicy 13 | import msccl.language.collectives as lang_collectives 14 | from msccl.topologies import distributed_fully_connected 15 | 16 | # The plans are keyed by (collective, machine_type) and each entry is a tuple 17 | # (name, function, machines, size_range, protocol, priority). 18 | synthesis_plans = defaultdict(list) 19 | 20 | 21 | def _register_ef_provider(desc, fun, collective, machine_type, machines, sizes, protocol, priority): 22 | if sizes == None: 23 | sizes = (0, math.inf) 24 | else: 25 | lower, upper = sizes 26 | if isinstance(lower, str): 27 | lower = humanfriendly.parse_size(lower) 28 | if isinstance(upper, str): 29 | upper = humanfriendly.parse_size(upper) 30 | if upper == None: 31 | upper = math.inf 32 | sizes = (lower, upper) 33 | # Register entries under all keys that might trigger this plan 34 | entry = (desc, fun, machines, sizes, protocol, priority) 35 | if isinstance(machine_type, list): 36 | for mtype in machine_type: 37 | synthesis_plans[(collective, mtype)].append(entry) 38 | else: 39 | synthesis_plans[(collective, machine_type)].append(entry) 40 | 41 | 42 | def register_ef_file(path, collective, machine_type, num_machines, sizes=None, protocol='Simple', priority=0): 43 | def provide_ef_path(machines): 44 | return path 45 | _register_ef_provider(f'load {path}', provide_ef_path, collective, 46 | machine_type, lambda x: x == num_machines, sizes, protocol, priority) 47 | 48 | 49 | def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', priority=0): 50 | def decorator(fun): 51 | def wrapped(machines): 52 | ef = fun(machines) 53 | fd, path = tempfile.mkstemp() 54 | with os.fdopen(fd, 'w') as f: 55 | f.write(ef) 56 | atexit.register(os.remove, path) 57 | return path 58 | _register_ef_provider(f'call {fun.__name__}', wrapped, collective, 59 | machine_type, machines, sizes, protocol, priority) 60 | # Return the original function to not break other usage 61 | return fun 62 | return decorator 63 | 64 | 65 | def register_msccl_program(local_topology, collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', 66 | chunk_factor=1, 
priority=0, collective_obj=None, instances=1, inplace=False, threadblock_policy=ThreadblockPolicy.auto, 67 | interleaved_replication=True, dependence_nop=False): 68 | def decorator(fun): 69 | name = fun.__name__ 70 | def wrapped(machines): 71 | topology = distributed_fully_connected(local_topology, machines, 1) 72 | co = collective_obj 73 | if co == None: 74 | if collective == 'allreduce': 75 | co = lang_collectives.AllReduce(topology.num_nodes(), chunk_factor, inplace) 76 | elif collective == 'allgather': 77 | co = lang_collectives.AllGather(topology.num_nodes(), chunk_factor, inplace) 78 | elif collective == 'alltoall': 79 | co = lang_collectives.AllToAll(topology.num_nodes(), chunk_factor, inplace) 80 | elif collective == 'reduce_scatter': 81 | co = lang_collectives.ReduceScatter(topology.num_nodes(), chunk_factor, inplace) 82 | else: 83 | raise RuntimeError(f'No collective_obj in msccl.language.collectives known for "{collective}"') 84 | prog = MSCCLProgram(name, topology, co, instances, protocol, threadblock_policy=threadblock_policy, 85 | interleaved_replication=interleaved_replication, dependence_nop=dependence_nop) 86 | with prog: 87 | fun(prog, machines) 88 | prog.check() 89 | ef = prog.generate_xml() 90 | fd, path = tempfile.mkstemp() 91 | with os.fdopen(fd, 'w') as f: 92 | f.write(ef) 93 | atexit.register(os.remove, path) 94 | return path 95 | _register_ef_provider(f'run {name}', wrapped, collective, 96 | machine_type, machines, sizes, protocol, priority) 97 | # Return the original function to not break other usage 98 | return fun 99 | return decorator -------------------------------------------------------------------------------- /msccl/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .solve import * 5 | from .compose import * 6 | from .distribute import * 7 | from .analyze import * 8 | from .ncclize import * 9 | from .plans import * 10 | -------------------------------------------------------------------------------- /msccl/cli/analyze.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
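# Illustrative usage of this command category (a sketch, assuming the `msccl`
# console entry point; topology and collective names are those registered in
# known_topologies.py and known_collectives.py):
#
#   msccl analyze rounds DGX1 Allgather
#   msccl analyze isomorphisms DGX1 NVLinkOnly
#
# `rounds` prints a lower bound on the rounds any algorithm needs, while
# `isomorphisms` searches for node permutations mapping topology 1 onto
# topology 2.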
3 | 4 | from .known_topologies import KnownTopologies 5 | from .known_collectives import KnownCollectives 6 | from .common import * 7 | from msccl.rounds_bound import lower_bound_rounds 8 | from msccl.isomorphisms import find_isomorphisms 9 | 10 | def make_analyses(cmd_parsers): 11 | handler_funcs = [] 12 | handler_funcs.append(make_handle_bound_rounds) 13 | handler_funcs.append(make_handle_find_isomorphisms) 14 | 15 | return make_cmd_category(cmd_parsers, 'analyze', 'analysis', handler_funcs) 16 | 17 | def make_handle_bound_rounds(cmd_parsers): 18 | cmd = cmd_parsers.add_parser('rounds') 19 | topologies = KnownTopologies(cmd) 20 | collectives = KnownCollectives(cmd) 21 | 22 | def handle(args, command): 23 | if command != 'rounds': 24 | return False 25 | 26 | topology = topologies.create(args) 27 | collective = collectives.create(args, topology.num_nodes()) 28 | lower_bound_rounds(topology, collective, logging=True) 29 | return True 30 | 31 | return handle 32 | 33 | def make_handle_find_isomorphisms(cmd_parsers): 34 | cmd = cmd_parsers.add_parser('isomorphisms') 35 | topologies1 = KnownTopologies(cmd, tag='1') 36 | topologies2 = KnownTopologies(cmd, tag='2') 37 | 38 | def handle(args, command): 39 | if command != 'isomorphisms': 40 | return False 41 | 42 | topology1 = topologies1.create(args) 43 | topology2 = topologies2.create(args) 44 | isomorphisms = find_isomorphisms(topology1, topology2, logging=True) 45 | return True 46 | 47 | return handle 48 | -------------------------------------------------------------------------------- /msccl/cli/compose.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.composers import * 5 | from .common import * 6 | 7 | def make_composers(cmd_parsers): 8 | handler_funcs = [] 9 | handler_funcs.append(make_handle_allreduce) 10 | 11 | return make_cmd_category(cmd_parsers, 'compose', 'composer', handler_funcs) 12 | 13 | def make_handle_allreduce(cmd_parsers): 14 | name = 'allreduce' 15 | cmd = cmd_parsers.add_parser(name) 16 | read_reducescatter_algorithm = add_input_algorithm(cmd, name="reducescatter-algorithm") 17 | read_allgather_algorithm = add_input_algorithm(cmd, name="allgather-algorithm") 18 | validate_output_args, output_handler = add_output_algorithm(cmd) 19 | 20 | def handle(args, command): 21 | if command != name: 22 | return False 23 | 24 | reducescatter_algorithm = read_reducescatter_algorithm(args) 25 | allgather_algorithm = read_allgather_algorithm(args) 26 | validate_output_args(args) 27 | algo = compose_allreduce(reducescatter_algorithm, allgather_algorithm, logging=True) 28 | output_handler(args, algo) 29 | return True 30 | 31 | return handle -------------------------------------------------------------------------------- /msccl/cli/distribute.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
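# Illustrative pipeline (a sketch: `-o` is assumed to be the output flag added
# by add_output_algorithm in common.py, and the .json file names are
# hypothetical):
#
#   msccl solve least-steps DGX1 Gather -o gather.json
#   msccl solve least-steps DGX1 Scatter --root 1 -o scatter.json
#   msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 2
#
# This mirrors what msccl/autosynth/ndv2_plans.py does programmatically via
# synthesize_gather_scatter_distributed_alltoall.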
3 | 4 | from msccl.distributors import * 5 | from .known_distributed_topologies import KnownDistributedTopologies 6 | from .known_topologies import KnownTopologies 7 | from .common import * 8 | 9 | def make_distributors(cmd_parsers): 10 | handler_funcs = [] 11 | handler_funcs.append(make_handle_greedy_alltoall) 12 | handler_funcs.append(make_handle_gather_scatter_alltoall) 13 | handler_funcs.append(make_handle_create_subproblem_distributed_alltoall) 14 | handler_funcs.append(make_handle_distribute_alltoall_stitch_subproblem) 15 | 16 | return make_cmd_category(cmd_parsers, 'distribute', 'distributor', handler_funcs) 17 | 18 | def make_handle_greedy_alltoall(cmd_parsers): 19 | name = 'alltoall-greedy' 20 | cmd = cmd_parsers.add_parser(name) 21 | read_algorithm = add_input_algorithm(cmd) 22 | distributed_topologies = KnownDistributedTopologies(cmd) 23 | validate_output_args, output_handler = add_output_algorithm(cmd) 24 | 25 | def handle(args, command): 26 | if command != name: 27 | return False 28 | 29 | input_algorithm = read_algorithm(args) 30 | validate_output_args(args) 31 | topology = distributed_topologies.create(args, input_algorithm.topology) 32 | algo = synthesize_greedy_distributed_alltoall(topology, input_algorithm, logging=True) 33 | output_handler(args, algo) 34 | return True 35 | 36 | return handle 37 | 38 | def make_handle_gather_scatter_alltoall(cmd_parsers): 39 | name = 'alltoall-gather-scatter' 40 | cmd = cmd_parsers.add_parser(name) 41 | read_gather_algorithm = add_input_algorithm(cmd, name='gather') 42 | read_scatter_algorithm = add_input_algorithm(cmd, name='scatter') 43 | cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') 44 | cmd.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='remote bandwidth', metavar='N') 45 | validate_output_args, output_handler = add_output_algorithm(cmd) 46 | 47 | def handle(args, command): 48 | if command != name: 49 | return False 50 | 51 | gather_algorithm = read_gather_algorithm(args) 52 | scatter_algorithm = read_scatter_algorithm(args) 53 | validate_output_args(args) 54 | algo = synthesize_gather_scatter_distributed_alltoall(args.copies, gather_algorithm, scatter_algorithm, args.remote_bandwidth, logging=True) 55 | output_handler(args, algo) 56 | return True 57 | 58 | return handle 59 | 60 | def make_handle_create_subproblem_distributed_alltoall(cmd_parsers): 61 | name = 'alltoall-create-subproblem' 62 | cmd = cmd_parsers.add_parser(name) 63 | topologies = KnownTopologies(cmd) 64 | cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') 65 | cmd.add_argument('--relay-nodes', type=int, nargs='+', default=[0], help='relay nodes') 66 | cmd.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='remote bandwidth', metavar='N') 67 | cmd.add_argument('--share-bandwidth', action='store_true', help='share local bandwidth between relay nodes') 68 | validate_output_args, output_handler = add_output_msccl_objects(cmd) 69 | 70 | def handle(args, command): 71 | if command != name: 72 | return False 73 | 74 | local_topology = topologies.create(args) 75 | validate_output_args(args) 76 | 77 | collective, topology = make_alltoall_subproblem_collective_and_topology(local_topology, args.copies, args.relay_nodes, args.remote_bandwidth, args.share_bandwidth) 78 | 79 | output_handler(args, collective, collective.name) 80 | output_handler(args, topology, topology.name) 81 | return True 82 | 83 | return handle 
84 | 85 | def make_handle_distribute_alltoall_stitch_subproblem(cmd_parsers): 86 | name = 'alltoall-stitch-subproblem' 87 | cmd = cmd_parsers.add_parser(name) 88 | read_subproblem_algorithm = add_input_algorithm(cmd) 89 | cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology made for the subproblem') 90 | validate_output_args, output_handler = add_output_algorithm(cmd) 91 | 92 | def handle(args, command): 93 | if command != name: 94 | return False 95 | 96 | subproblem_algorithm = read_subproblem_algorithm(args) 97 | validate_output_args(args) 98 | algo = synthesize_alltoall_subproblem(subproblem_algorithm, args.copies, logging=True) 99 | output_handler(args, algo) 100 | return True 101 | 102 | return handle -------------------------------------------------------------------------------- /msccl/cli/known_collectives.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.collectives as collectives 5 | from msccl.serialization import * 6 | from pathlib import Path 7 | import sys 8 | 9 | class KnownCollectives: 10 | def __init__(self, parser): 11 | self.parser = parser 12 | self.constructors = { 13 | 'Broadcast': self._rooted_coll(collectives.broadcast), 14 | 'Reduce': self._rooted_coll(collectives.reduce), 15 | 'Scatter': self._rooted_coll(collectives.scatter), 16 | 'Gather': self._rooted_coll(collectives.gather), 17 | 'Allgather': self._coll(collectives.allgather), 18 | 'Allreduce': self._coll(collectives.allreduce), 19 | 'Alltoall': self._coll(collectives.alltoall), 20 | 'ReduceScatter': self._coll(collectives.reduce_scatter), 21 | 'Scan': self._coll(collectives.scan), 22 | 'MultirootBroadcast': self._multiroot_coll(collectives.multiroot_broadcast), 23 | 'MultirootScatter': self._multiroot_coll(collectives.multiroot_scatter), 24 | 'MultirootGather': self._multiroot_coll(collectives.multiroot_gather), 25 | 'custom': self._custom_coll(), 26 | } 27 | self.parser.add_argument('collective', type=str, choices=self.constructors.keys(), help='collective') 28 | self.parser.add_argument('--collective-file', type=Path, default=None, help='a serialized collective', metavar='FILE') 29 | self.parser.add_argument('--root', type=int, default=0, help='used by rooted collectives', metavar='N') 30 | self.parser.add_argument('--roots', type=int, nargs='+', default=[0], help='used by multi-rooted collectives', metavar='N') 31 | 32 | def create(self, args, num_nodes): 33 | return self.constructors[args.collective](num_nodes, args) 34 | 35 | def _custom_coll(self): 36 | def make(size, args): 37 | input_file = args.collective_file 38 | if input_file is None: 39 | self.parser.error('--collective-file is required for custom collectives') 40 | exit(1) 41 | 42 | if not input_file.exists(): 43 | print(f'error: input file not found: {input_file}', file=sys.stderr) 44 | exit(1) 45 | 46 | return load_msccl_object(input_file) 47 | return make 48 | 49 | def _rooted_coll(self, fun): 50 | def make(size, args): 51 | root = args.root 52 | return fun(size, root) 53 | return make 54 | 55 | def _coll(self, fun): 56 | def make(size, args): 57 | return fun(size) 58 | return make 59 | 60 | def _multiroot_coll(self, fun): 61 | def make(size, args): 62 | roots = args.roots 63 | return fun(size, roots) 64 | return make 65 | -------------------------------------------------------------------------------- /msccl/cli/known_distributed_topologies.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.topologies as topologies 5 | import pathlib 6 | 7 | class KnownDistributedTopologies: 8 | def __init__(self, parser): 9 | self.parser = parser 10 | self.constructors = { 11 | 'DistributedFullyConnected': topologies.distributed_fully_connected, 12 | 'DistributedHubAndSpoke': topologies.distributed_hub_and_spoke, 13 | } 14 | self.parser.add_argument('topology', type=str, choices=self.constructors.keys(), help='the distributed topology') 15 | self.parser.add_argument('-n', '--nodes', type=int, help='total nodes in the distributed topology, must be divisible by local topology') 16 | self.parser.add_argument('--copies', type=int, help='copies of the local topology to be made') 17 | self.parser.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='bandwidth of links in the distributed topology', metavar='N') 18 | 19 | def create(self, args, local_topology): 20 | if args.nodes != None and args.copies != None: 21 | self.parser.error('please use only one of -n/--nodes, --copies') 22 | if args.copies != None: 23 | copies = args.copies 24 | elif args.nodes != None: 25 | if args.nodes % local_topology.num_nodes() != 0: 26 | self.parser.error(f'total number of nodes must be divisible by the local number of nodes {local_topology.num_nodes()}, but {args.nodes} was given') 27 | copies = args.nodes // local_topology.num_nodes() 28 | else: 29 | self.parser.error('one of the following arguments is required: --nodes, --copies') 30 | return self.constructors[args.topology](local_topology, copies, args.remote_bandwidth) 31 | -------------------------------------------------------------------------------- /msccl/cli/known_topologies.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
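# Quick reference (illustrative): sized topologies require -n/--nodes while
# fixed-size ones have an implied size, e.g.
#
#   msccl solve least-steps Ring Allgather -n 4   # Ring needs -n
#   msccl solve least-steps DGX1 Allgather        # DGX1 is fixed at 8 nodes
#
# _sized_topo and _fixed_topo below enforce this split; a mismatching -n for
# a fixed topology is rejected with a parser error.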
3 | 4 | import msccl.topologies as topologies 5 | from msccl.serialization import * 6 | from .known_transformers import KnownTransformers 7 | from pathlib import Path 8 | import sys 9 | 10 | class KnownTopologies: 11 | def __init__(self, parser, tag=''): 12 | self.parser = parser 13 | self.tag = tag 14 | self.constructors = { 15 | 'FullyConnected': self._sized_topo(topologies.fully_connected), 16 | 'HubAndSpoke': self._sized_topo(topologies.hub_and_spoke), 17 | 'Ring': self._sized_topo(topologies.ring), 18 | 'Line': self._sized_topo(topologies.line), 19 | 'Star': self._sized_topo(topologies.star), 20 | 'AMD4': self._fixed_topo(topologies.amd4), 21 | 'AMD8': self._fixed_topo(topologies.amd8), 22 | 'DGX1': self._fixed_topo(topologies.dgx1), 23 | 'DGX2': self._fixed_topo(lambda: topologies.hub_and_spoke(16)), 24 | 'NVLinkOnly': self._fixed_topo(topologies.nvlink_only), 25 | 'custom': self._custom_topo(), 26 | } 27 | self.parser.add_argument(f'topology{tag}', type=str, choices=self.constructors.keys(), help=f'topology {tag}') 28 | self.parser.add_argument(f'--topology-file{tag}', type=Path, default=None, help=f'a serialized topology', metavar=f'FILE') 29 | self.parser.add_argument(f'-n{tag}', f'--nodes{tag}', type=int, help='required for non-fixed topologies', metavar='N') 30 | self.known_transformers = KnownTransformers(parser, tag=tag) 31 | 32 | def _topology(self, args): 33 | return vars(args)[f'topology{self.tag}'] 34 | 35 | def _nodes(self, args): 36 | return vars(args)[f'nodes{self.tag}'] 37 | 38 | def create(self, args): 39 | topology = self.constructors[self._topology(args)](args) 40 | topology = self.known_transformers.transform(args, topology) 41 | return topology 42 | 43 | def _custom_topo(self): 44 | def make(args): 45 | input_file = vars(args)[f'topology_file{self.tag}'] 46 | if input_file is None: 47 | self.parser.error(f'--topology-file{self.tag} is required for custom topologies') 48 | exit(1) 49 | 50 | if not input_file.exists(): 51 | print(f'error: input file not found: {input_file}', file=sys.stderr) 52 | exit(1) 53 | 54 | return load_msccl_object(input_file) 55 | return make 56 | 57 | def _fixed_topo(self, Cls): 58 | def make(args): 59 | topo = Cls() 60 | if self._nodes(args) != None and self._nodes(args) != topo.num_nodes(): 61 | self.parser.error(f'fixed-size topology {self._topology(args)} has {topo.num_nodes()} nodes, but command line specified {self._nodes(args)} nodes') 62 | return topo 63 | return make 64 | 65 | def _sized_topo(self, Cls): 66 | def make(args): 67 | if self._nodes(args) == None: 68 | self.parser.error(f'topology {self._topology(args)} requires -n/--nodes') 69 | return Cls(self._nodes(args)) 70 | return make 71 | -------------------------------------------------------------------------------- /msccl/cli/known_transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.topologies as topologies 5 | 6 | class KnownTransformers: 7 | def __init__(self, parser, tag=''): 8 | self.parser = parser 9 | self.tag = tag 10 | self.transformers = { 11 | 'reverse': topologies.reverse_topology, 12 | 'binarize': topologies.binarize_topology, 13 | } 14 | self.parser.add_argument(f'-t{tag}', f'--transform{tag}', action='append', default=[], choices=self.transformers.keys(), help='apply a topology transformer. 
may be used multiple times') 15 | 16 | def transform(self, args, topology): 17 | for key in vars(args)[f'transform{self.tag}']: 18 | topology = self.transformers[key](topology) 19 | return topology 20 | -------------------------------------------------------------------------------- /msccl/cli/ncclize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.ncclize import * 5 | from .common import * 6 | 7 | def make_handle_ncclize(cmd_parsers): 8 | cmd = cmd_parsers.add_parser('ncclize') 9 | read_algorithm = add_input_algorithm(cmd, multiple=True) 10 | validate_output_args, output_handler = add_output_file(cmd) 11 | remap_scratch_grp = cmd.add_mutually_exclusive_group() 12 | remap_scratch_grp.add_argument('--remap-scratch', action='store_true', default=None, help='remap scratch buffer indices into free input/output indices') 13 | remap_scratch_grp.add_argument('--no-remap-scratch', action='store_false', dest='remap_scratch', help='don\'t remap scratch buffer indices into free input/output indices') 14 | cmd.add_argument('--no-merge-contiguous', action='store_true', help='don\'t merge sends/receives from/to contiguous memory') 15 | cmd.add_argument('--no-pretty-print', action='store_true', help='don\'t pretty print the generated XML') 16 | cmd.add_argument('--greedy-scratch-sorting', action='store_true', help='sort scratch buffer indices greedily to increase contiguous operations') 17 | cmd.add_argument('--no-scratch', action='store_true', help='use extra space at the end of output buffer instead of the scratch buffer') 18 | cmd.add_argument('--channel-policy', type=ChannelPolicy, choices=list(ChannelPolicy), default=ChannelPolicy.MatchTopology, help='channel allocation policy') 19 | cmd.add_argument('--instances', type=int, default=1, help='number of interleaved instances of the algorithm to make') 20 | 21 | def handle(args, command): 22 | if command != 'ncclize': 23 | return False 24 | 25 | input_algorithms = read_algorithm(args) 26 | validate_output_args(args) 27 | 28 | for algo in input_algorithms: 29 | ncclized = ncclize(algo, 30 | remap_scratch=args.remap_scratch, 31 | channel_policy=args.channel_policy, 32 | pretty_print=not args.no_pretty_print, 33 | use_scratch=not args.no_scratch, 34 | merge_contiguous=not args.no_merge_contiguous, 35 | greedy_scratch_sorting=args.greedy_scratch_sorting, 36 | instances=args.instances, 37 | logging=True) 38 | 39 | handled = output_handler(args, lambda: ncclized, name_msccl_object(algo.name, ending='msccl.xml')) 40 | 41 | return True 42 | 43 | return handle 44 | -------------------------------------------------------------------------------- /msccl/cli/plans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
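# Illustrative usage (a sketch, assuming the `msccl` console entry point):
#
#   msccl plans list
#
# prints every synthesis plan registered in msccl.autosynth.registry (e.g.
# the ndv2/ndv4 plans), keyed by collective and machine type.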
3 | 4 | from .common import * 5 | from msccl.autosynth import * 6 | 7 | def make_plans(cmd_parsers): 8 | handler_funcs = [] 9 | handler_funcs.append(make_handle_list) 10 | 11 | return make_cmd_category(cmd_parsers, 'plans', 'subcommand', handler_funcs) 12 | 13 | def make_handle_list(cmd_parsers): 14 | cmd = cmd_parsers.add_parser('list') 15 | 16 | def handle(args, command): 17 | if command != 'list': 18 | return False 19 | 20 | print_plans() 21 | return True 22 | 23 | return handle 24 | -------------------------------------------------------------------------------- /msccl/cli/solve.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import msccl.strategies as strategies 5 | from .known_topologies import KnownTopologies 6 | from .known_collectives import KnownCollectives 7 | from .common import * 8 | 9 | def make_solvers(cmd_parsers): 10 | handler_funcs = [] 11 | handler_funcs.append(make_handle_solve_instance) 12 | handler_funcs.append(make_handle_solve_least_steps) 13 | handler_funcs.append(make_handle_solve_pareto_optimal) 14 | 15 | return make_cmd_category(cmd_parsers, 'solve', 'solver', handler_funcs) 16 | 17 | def _make_handle_strategy(cmd_parsers, name, invoke, take_steps = True): 18 | cmd = cmd_parsers.add_parser(name) 19 | instance_handler = add_instance(cmd, take_steps=take_steps) 20 | topologies = KnownTopologies(cmd) 21 | collectives = KnownCollectives(cmd) 22 | validate_output_args, output_handler = add_output_algorithm(cmd) 23 | 24 | def handle(args, command): 25 | if command != name: 26 | return False 27 | 28 | validate_output_args(args) 29 | topology = topologies.create(args) 30 | collective = collectives.create(args, topology.num_nodes()) 31 | instance = instance_handler(args) 32 | algo = invoke(args, topology, collective, instance) 33 | output_handler(args, algo) 34 | return True 35 | 36 | return cmd, handle 37 | 38 | def make_handle_solve_instance(cmd_parsers): 39 | def invoke(args, topology, collective, instance): 40 | return strategies.solve_instance(topology, collective, instance, logging=True) 41 | 42 | cmd, handle = _make_handle_strategy(cmd_parsers, 'instance', invoke) 43 | return handle 44 | 45 | def make_handle_solve_least_steps(cmd_parsers): 46 | def invoke(args, topology, collective, instance): 47 | return strategies.solve_least_steps(topology, collective, args.initial_steps, instance, logging=True) 48 | 49 | cmd, handle = _make_handle_strategy(cmd_parsers, 'least-steps', invoke, take_steps=False) 50 | cmd.add_argument('--initial-steps', type=int, default=1, metavar='N') 51 | return handle 52 | 53 | def make_handle_solve_pareto_optimal(cmd_parsers): 54 | name = 'pareto-optimal' 55 | cmd = cmd_parsers.add_parser(name) 56 | topologies = KnownTopologies(cmd) 57 | collectives = KnownCollectives(cmd) 58 | validate_output_args, output_handler = add_output_msccl_objects(cmd) 59 | cmd.add_argument('--min-chunks', type=int, default=1, metavar='N') 60 | cmd.add_argument('--max-chunks', type=int, default=None, metavar='N') 61 | cmd.add_argument('--assume-rpc-bound', default=None, help='assume bandwidth optimality requires at least this many rounds per chunk', metavar='N/N') 62 | cmd.add_argument('--no-monotonic-feasibility', action='store_true', help='turn off an unproven assumption about monotonic feasibility of instances') 63 | cmd.add_argument('--save-eagerly', action='store_true', help='save algorithms as soon as they are found, without pruning 
non-Pareto optimal algorithms at the end') 64 | instance_handler = add_instance(cmd, take_steps=False, take_rounds=False) 65 | 66 | def handle(args, command): 67 | if command != name: 68 | return False 69 | 70 | validate_output_args(args) 71 | topology = topologies.create(args) 72 | instance = instance_handler(args) 73 | collective = collectives.create(args, topology.num_nodes()) 74 | assume_rpc_bound = None 75 | if args.assume_rpc_bound: 76 | try: 77 | assume_rpc_bound = parse_fraction(args.assume_rpc_bound) 78 | except ValueError: 79 | cmd.error('could not parse --assume-rpc-bound as a fraction') 80 | algorithms = [] 81 | for algorithm in strategies.solve_all_latency_bandwidth_tradeoffs(topology, collective, args.min_chunks, args.max_chunks, assume_rpc_bound, not args.no_monotonic_feasibility, base_instance=instance, logging=True): 82 | algorithms.append(algorithm) 83 | if args.save_eagerly: 84 | output_handler(args, algorithm, algorithm.name) 85 | if not args.save_eagerly: 86 | efficient_algorithms = strategies.prune_pareto_optimal(algorithms) 87 | print(f'Found {len(efficient_algorithms)} Pareto optimal algorithms. Pruned {len(algorithms) - len(efficient_algorithms)} non-optimal algorithms.') 88 | for algorithm in efficient_algorithms: 89 | output_handler(args, algorithm, algorithm.name) 90 | return True 91 | 92 | return handle 93 | -------------------------------------------------------------------------------- /msccl/collectives.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from abc import ABC, abstractmethod 5 | from dataclasses import dataclass 6 | 7 | @dataclass 8 | class Chunk: 9 | precondition: set 10 | postcondition: set 11 | address: int 12 | 13 | class Collective: 14 | def __init__(self, name, num_nodes, chunks, triggers = {}, runtime_name= 'custom'): 15 | self.name = name 16 | self.num_nodes = num_nodes 17 | self.num_chunks = len(chunks) 18 | self._chunks = chunks 19 | self._triggers = triggers 20 | self.runtime_name = runtime_name 21 | 22 | self.is_combining = False 23 | addresses_seen = set() 24 | for chunk in self._chunks: 25 | if chunk.address in addresses_seen: 26 | self.is_combining = True 27 | addresses_seen.add(chunk.address) 28 | self.num_addresses = len(addresses_seen) 29 | 30 | def ranks(self): 31 | return range(self.num_nodes) 32 | 33 | def chunks(self): 34 | return range(len(self._chunks)) 35 | 36 | def precondition(self, rank, chunk): 37 | return rank in self._chunks[chunk].precondition 38 | 39 | def postcondition(self, rank, chunk): 40 | return rank in self._chunks[chunk].postcondition 41 | 42 | def address(self, chunk): 43 | return self._chunks[chunk].address 44 | 45 | def trigger(self, rank, chunk): 46 | if (rank, chunk) in self._triggers: 47 | return self._triggers[(rank, chunk)] 48 | else: 49 | return None 50 | 51 | def has_triggers(self): 52 | return len(self._triggers) > 0 53 | 54 | def chunk_up(self, div): 55 | if div < 1: 56 | raise ValueError('Divisor must be greater or equal to one (and one is a no-op).') 57 | if div == 1: 58 | return self 59 | 60 | def remap(addr, i): 61 | return addr * div + i 62 | 63 | new_chunks = [] 64 | for chunk in self._chunks: 65 | for i in range(div): 66 | new_chunks.append(Chunk(chunk.precondition, chunk.postcondition, remap(chunk.address, i))) 67 | 68 | name = f'{self.name},chunks={div}' 69 | return Collective(name, self.num_nodes, new_chunks) 70 | 71 | def build_collective(name, num_nodes, 
num_chunks, precondition, postcondition, address = lambda c: c, trigger = lambda r, c: None, runtime_name = 'custom'): 72 | chunks = [] 73 | for chunk in range(num_chunks): 74 | chunk_precondition = set(rank for rank in range(num_nodes) if precondition(rank, chunk)) 75 | chunk_postcondition = set(rank for rank in range(num_nodes) if postcondition(rank, chunk)) 76 | chunk_address = address(chunk) 77 | chunks.append(Chunk(chunk_precondition, chunk_postcondition, chunk_address)) 78 | triggers = {(rank, chunk): trigger(rank, chunk) for rank in range(num_nodes) for chunk in range(num_chunks) if trigger(rank, chunk) != None} 79 | return Collective(name, num_nodes, chunks, triggers, runtime_name) 80 | 81 | # Common pre- and postconditions 82 | def _scattered(num_nodes, chunks = 1): 83 | def cond(rank, chunk): 84 | return rank == (chunk // chunks) % num_nodes 85 | return cond 86 | 87 | def _transpose(num_nodes): 88 | def cond(rank, chunk): 89 | return rank == chunk // num_nodes 90 | return cond 91 | 92 | def _all(rank, chunk): 93 | return True 94 | 95 | def _root(root): 96 | def cond(rank, chunk): 97 | return rank == root 98 | return cond 99 | 100 | # Non-combining collectives 101 | 102 | def broadcast(num_nodes, root): 103 | return build_collective(f'Broadcast(n={num_nodes},root={root})', num_nodes, 1, _root(root), _all) 104 | 105 | def scatter(num_nodes, root): 106 | return build_collective(f'Scatter(n={num_nodes},root={root})', num_nodes, num_nodes, _root(root), _scattered(num_nodes)) 107 | 108 | def gather(num_nodes, root): 109 | return build_collective(f'Gather(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root)) 110 | 111 | def allgather(num_nodes): 112 | return build_collective(f'Allgather(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, runtime_name='allgather') 113 | 114 | def alltoall(num_nodes): 115 | return build_collective(f'Alltoall(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), runtime_name='alltoall') 116 | 117 | # Combining collectives 118 | 119 | # Represents a single buffer to reduce 120 | def _single_scattered(num_nodes): 121 | def address(chunk): 122 | return chunk // num_nodes 123 | return address 124 | 125 | def reduce(num_nodes, root): 126 | return build_collective(f'Reduce(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root), _single_scattered(num_nodes)) 127 | 128 | def allreduce(num_nodes): 129 | return build_collective(f'Allreduce(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, _single_scattered(num_nodes), runtime_name='allreduce') 130 | 131 | def reduce_scatter(num_nodes): 132 | return build_collective(f'ReduceScatter(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), _single_scattered(num_nodes), runtime_name='reduce_scatter') 133 | 134 | def scan(num_nodes): 135 | def postcondition(rank, chunk): 136 | origin = chunk % num_nodes 137 | return rank >= origin 138 | return build_collective(f'Scan(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), postcondition, _single_scattered(num_nodes)) 139 | 140 | # Multi-root generalizations of MPI rooted collectives 141 | # TODO: Add one for reduce. That needs a new addressing function. 
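# Illustrative sketch of the multi-root pattern below: _roots assigns chunk c
# to roots[c % len(roots)], so for example multiroot_broadcast(4, [0, 2]) has
# chunk 0 start on rank 0 and chunk 1 on rank 2, with both chunks required on
# every rank:
#
#   coll = multiroot_broadcast(4, [0, 2])
#   assert coll.precondition(0, 0) and coll.precondition(2, 1)
#   assert all(coll.postcondition(r, c) for r in range(4) for c in range(2))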
142 | 143 | def _roots(roots): 144 | def cond(rank, chunk): 145 | return rank == roots[chunk % len(roots)] 146 | return cond 147 | 148 | def multiroot_broadcast(num_nodes, roots): 149 | return build_collective(f'MultirootBroadcast(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, len(roots), _roots(roots), _all) 150 | 151 | def multiroot_scatter(num_nodes, roots): 152 | return build_collective(f'MultirootScatter(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, num_nodes * len(roots), _roots(roots), _scattered(num_nodes, len(roots))) 153 | 154 | def multiroot_gather(num_nodes, roots): 155 | return build_collective(f'MultirootGather(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, num_nodes * len(roots), _scattered(num_nodes, len(roots)), _roots(roots)) 156 | -------------------------------------------------------------------------------- /msccl/composers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.collectives import allreduce 5 | from msccl.algorithm import * 6 | from msccl.instance import * 7 | 8 | def compose_allreduce(reducescatter_algo, allgather_algo, logging=False): 9 | if reducescatter_algo.is_pipelined() or allgather_algo.is_pipelined(): 10 | raise ValueError('Pipelining is not supported.') 11 | 12 | if reducescatter_algo.instance.chunks != allgather_algo.instance.chunks: 13 | raise ValueError(f'ReduceScatter and Allgather must have the same chunks (got {reducescatter_algo.instance.chunks} and {allgather_algo.instance.chunks})') 14 | 15 | if reducescatter_algo.topology.name != allgather_algo.topology.name: 16 | # TODO: improve this check to check actual structure, not just name 17 | raise ValueError(f'ReduceScatter and Allgather must have the same topology (got {reducescatter_algo.topology.name} and {allgather_algo.topology.name})') 18 | topo = reducescatter_algo.topology 19 | 20 | coll = allreduce(topo.num_nodes()) 21 | 22 | steps = reducescatter_algo.steps + allgather_algo.steps 23 | instance = Instance(len(steps), 24 | extra_rounds=reducescatter_algo.instance.extra_rounds+allgather_algo.instance.extra_rounds, 25 | chunks=reducescatter_algo.instance.chunks) 26 | return Algorithm.make_implementation(coll, topo, instance, steps) -------------------------------------------------------------------------------- /msccl/distributors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .greedy_alltoall import * 5 | from .gather_scatter_alltoall import * 6 | from .alltoall_subproblem import * 7 | -------------------------------------------------------------------------------- /msccl/instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
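# Illustrative usage (a sketch, not part of the original module): Instance is
# a frozen dataclass, so set() returns an updated copy instead of mutating.
#
#   base = Instance(steps=3, extra_rounds=1, chunks=2)
#   assert base.rounds() == 4            # steps + extra_rounds
#   relaxed = base.set(extra_rounds=2)   # new Instance; base is unchanged
#   assert (base.extra_rounds, relaxed.rounds()) == (1, 5)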
3 | 4 | from dataclasses import dataclass 5 | 6 | @dataclass(frozen=True) 7 | class Instance: 8 | steps: int 9 | extra_rounds: int = 0 10 | chunks: int = 1 11 | pipeline: int = None 12 | extra_memory: int = None 13 | allow_exchange: bool = False 14 | 15 | def rounds(self): 16 | return self.steps + self.extra_rounds 17 | 18 | def set(self, steps = None, extra_rounds = None, chunks = None, pipeline = None, extra_memory = None, allow_exchange = None): 19 | return Instance( 20 | steps if steps != None else self.steps, 21 | extra_rounds if extra_rounds != None else self.extra_rounds, 22 | chunks if chunks != None else self.chunks, 23 | pipeline if pipeline != None else self.pipeline, 24 | extra_memory if extra_memory != None else self.extra_memory, 25 | allow_exchange if allow_exchange != None else self.allow_exchange) 26 | 27 | def __str__(self): 28 | s = f'steps={self.steps}' 29 | if self.extra_rounds > 0: 30 | s += f',rounds={self.steps + self.extra_rounds}' 31 | if self.chunks > 1: 32 | s += f',chunks={self.chunks}' 33 | if self.pipeline != None: 34 | s += f',pipeline={self.pipeline}' 35 | if self.extra_memory != None: 36 | s += f',extra_memory={self.extra_memory}' 37 | if self.allow_exchange: 38 | s += f',allow_exchange' 39 | return s 40 | -------------------------------------------------------------------------------- /msccl/isomorphisms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from z3 import * 5 | from dataclasses import dataclass 6 | 7 | @dataclass 8 | class Permutation: 9 | nodes: list 10 | 11 | def __str__(self): 12 | return f'Permutation(nodes={self.nodes})' 13 | 14 | def _pn(node): 15 | return Int(f'perm_node_{node}') 16 | 17 | def _select_node_permutation(s, topology): 18 | # Select a permutation of nodes 19 | for node in topology.nodes(): 20 | s.add(_pn(node) >= 0) 21 | s.add(_pn(node) < topology.num_nodes()) 22 | for prev in range(node): 23 | s.add(_pn(node) != _pn(prev)) 24 | 25 | def _links_constraint(topology, target_topology): 26 | nodes = range(topology.num_nodes()) 27 | 28 | def links_isomorphic(perm_src, perm_dst, link): 29 | # Return a condition on whether the permuted ranks are isomorphic from src to dst wrt. the given link 30 | for src in nodes: 31 | for dst in nodes: 32 | if target_topology.link(src, dst) != link: 33 | yield Not(And(perm_src == src, perm_dst == dst)) 34 | # Require all pairs of nodes to be isomorphic to their permuted counterparts 35 | conditions = [] 36 | for src in nodes: 37 | for dst in nodes: 38 | link = topology.link(src, dst) 39 | conditions.extend(links_isomorphic(_pn(src), _pn(dst), link)) 40 | return And(conditions) 41 | 42 | def _decode_permutation(model, topology): 43 | node_permutation = [model.eval(_pn(node)).as_long() for node in topology.nodes()] 44 | return Permutation(node_permutation) 45 | 46 | def find_isomorphisms(topology, target_topology, limit=None, logging=False): 47 | ''' 48 | Finds all isomorphisms from one topology to a target topology. Returns a list of permutations. 49 | ''' 50 | if len(topology.switches) > 0: 51 | print('MSCCL Warning: Topologies with switches are not supported. 
No isomorphisms will be returned.')
52 |         return []
53 | 
54 |     if limit != None and limit <= 0:
55 |         raise ValueError('MSCCL error: limit was set improperly.')
56 | 
57 |     if topology.num_nodes() != target_topology.num_nodes():
58 |         raise ValueError('MSCCL error: target topology does not match with the given topology.')
59 | 
60 |     if logging:
61 |         print(f'Encoding {topology.name} - {target_topology.name} isomorphisms to Z3')
62 | 
63 |     s = Solver()
64 | 
65 |     _select_node_permutation(s, topology)
66 |     s.add(_links_constraint(topology, target_topology))
67 | 
68 |     if logging:
69 |         print(f'Solving isomorphisms incrementally...')
70 | 
71 |     isomorphisms = []
72 |     while s.check() == sat:
73 |         isomorphism = _decode_permutation(s.model(), topology)
74 |         isomorphisms.append(isomorphism)
75 | 
76 |         if logging:
77 |             print(isomorphism)
78 | 
79 |         if limit != None and len(isomorphisms) >= limit:
80 |             break
81 | 
82 |         # Block this permutation
83 |         assignment = [_pn(node) == perm for node, perm in enumerate(isomorphism.nodes)]
84 |         s.add(Not(And(assignment)))
85 | 
86 |     if logging:
87 |         print(f'{len(isomorphisms)} isomorphisms found.')
88 |     return isomorphisms
89 | 
-------------------------------------------------------------------------------- /msccl/language/buffer.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | # Scratch buffer slice with manual indexing
5 | class BufferSlice:
6 |     def __init__(self, buf, name):
7 |         self.name = name
8 |         self.buf = buf
9 |         self.offset = -1 # Offset into the global scratch buffer
10 |         self.chunks = []
11 | 
12 |     # Returns the global index into the scratch buffer
13 |     def get_global_index(self, index):
14 |         assert (self.offset > -1), 'set_offset needs to be called first'
15 |         return self.offset + index
16 | 
17 |     def get_buffer(self):
18 |         return self.buf
19 | 
20 |     def instance_size(self):
21 |         return len(self.chunks)
22 | 
23 |     def set_offset(self, offset):
24 |         self.offset = offset
25 | 
26 |     def __getitem__(self, index):
27 |         return self.chunks[index]
28 | 
29 |     def __setitem__(self, index, value):
30 |         current_size = len(self.chunks)
31 |         while index > current_size:
32 |             self.chunks.append(None)
33 |             current_size = len(self.chunks)
34 |         if index == current_size:
35 |             self.chunks.append(value)
36 |         else:
37 |             self.chunks[index] = value
38 | 
39 |     def __len__(self):
40 |         return len(self.chunks)
-------------------------------------------------------------------------------- /msccl/language/chunk.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
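# Illustrative sketch of the chunk algebra defined below: reducing two Chunks
# yields a ReduceChunk whose equality ignores the order of reduction.
#
#   a = Chunk(origin_rank=0, origin_index=0)
#   b = Chunk(origin_rank=1, origin_index=0)
#   ab = a.reduce(2, b)   # ReduceChunk created on rank 2
#   ba = b.reduce(2, a)
#   assert ab == ba       # __eq__ sorts the underlying chunk lists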
3 | 
4 | 
5 | from dataclasses import dataclass
6 | from msccl.language.ir import *
7 | 
8 | @dataclass
9 | class Chunk:
10 |     origin_rank: int # Rank the chunk initially started at
11 |     origin_index: int # Index the chunk initially started at
12 |     dst_rank: int = -1
13 |     dst_index: int = -1
14 | 
15 |     def reduce(self, dst, chunk):
16 |         if type(chunk) is ReduceChunk:
17 |             return chunk.reduce(dst, self)
18 |         elif type(chunk) is Chunk:
19 |             chunks = [self, chunk]
20 |             return ReduceChunk(dst, chunks)
21 |         else:
22 |             assert False, "Trying to reduce with chunk of None"
23 |             return None
24 | 
25 |     def __hash__(self):
26 |         return hash((self.origin_rank, self.origin_index))
27 | 
28 |     def __eq__(self, other):
29 |         return type(other) is Chunk and self.origin_rank == other.origin_rank and self.origin_index == other.origin_index
30 | 
31 |     def __lt__(self, other):
32 |         return self.origin_rank < other.origin_rank or \
33 |                (self.origin_rank == other.origin_rank and self.origin_index < other.origin_index)
34 | 
35 | @dataclass
36 | class ReduceChunk:
37 |     creation_rank: int # Rank the ReduceChunk is created on. Necessary since the same ReduceChunk can be created on multiple ranks independently
38 |     chunks: list # List of chunks reduced
39 | 
40 |     def reduce(self, dst, chunk):
41 |         if type(chunk) is ReduceChunk:
42 |             chunks = self.chunks + chunk.chunks
43 |         elif type(chunk) is Chunk:
44 |             chunks = self.chunks + [chunk]
45 |         else:
46 |             assert False, "Trying to reduce with chunk of None"
47 |         return ReduceChunk(self.creation_rank, chunks)
48 | 
49 |     def sort(self):
50 |         self.chunks.sort()
51 | 
52 |     def __hash__(self):
53 |         self.sort()
54 |         return hash((self.creation_rank,) + tuple(self.chunks))
55 | 
56 |     # Two reduce chunks are equal if they contain the same list of
57 |     # chunks being reduced
58 |     def __eq__(self, other):
59 |         self.sort()
60 |         other.sort()
61 |         return self.chunks == other.chunks
62 | 
-------------------------------------------------------------------------------- /msccl/language/passes.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | import sys
5 | from msccl.language.ir import *
6 | 
7 | # Check that there are no cyclic dependencies within a Rank
8 | def check_dependency_cycles(tbs):
9 |     for rank, rank_tbs in enumerate(tbs):
10 |         for tbid, tb in rank_tbs.items():
11 |             for op in tb.ops:
12 |                 deps = op.depends
13 |                 chain = [op]
14 |                 # DFS to check for cycles
15 |                 while len(deps) > 0:
16 |                     dep = deps[0]
17 |                     if dep in chain:
18 |                         print(f"Cyclic dependency in rank {rank} threadblock {tbid} at {op}")
19 |                         for op in chain:
20 |                             print(" ", op)
21 |                         sys.exit(1)
22 |                     next_depends = dep.depends
23 |                     if len(next_depends) > 0:
24 |                         chain.append(dep)
25 |                     else:
26 |                         chain = [op]
27 |                     deps = next_depends + deps[1:]
28 | 
29 | 
30 | # Check there are no ordering violations between threadblocks across ranks
31 | def check_threadblock_ordering(rank_dag):
32 |     for rank in range(rank_dag.num_ranks):
33 |         for tb in rank_dag.tbs[rank].values():
34 |             prev_steps = {} # tbid -> step of last recv from tbid
35 |             # Check that sends and their corresponding receives between two threadblocks
36 |             # happen in the same order.
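            # For example, if threadblock A on rank 0 issues send(x) and then
            # send(y) to threadblock B on rank 1, then B must receive x at an
            # earlier step than y: a matched receive whose step is <= the last
            # matched step from the same threadblock is reported below as an
            # ordering violation.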
37 |             for op_step, op in enumerate(tb.ops):
38 |                 if op.is_send():
39 |                     match = op.recv_match
40 |                     if match.is_recv():
41 |                         assert op.dst.rank == match.rank, f"Bug in MSCCLang: Sends don't match receives"
42 | 
43 |                     other_tbid = match.tb
44 |                     if other_tbid in prev_steps:
45 |                         if match.step <= prev_steps[other_tbid].step:
46 |                             print("Offending Steps", match.step, prev_steps[other_tbid].step)
47 |                             print("Sending tb")
48 |                             for op in tb.ops:
49 |                                 print(f'{op.step}: Recv step: {op.recv_match.step if op.is_send() else -1} {op} priority:{(op.chunk_step, op.priority, op.dst.index)}')
50 |                             print("Receiving tb")
51 |                             for op in rank_dag.tbs[match.rank][other_tbid].ops:
52 |                                 print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority, op.dst.index)}')
53 |                         assert match.step > prev_steps[other_tbid].step, f"Rank {op.rank} sends op1 then op2 but {match.rank} receives op2 then op1"
54 | 
55 |                     prev_steps[other_tbid] = match
56 | 
-------------------------------------------------------------------------------- /msccl/language/routines.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | from msccl.language import *
5 | from msccl.topologies import *
6 | from msccl.language.collectives import *
7 | 
8 | def allgather_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0):
9 |     for rank in range(gpu_offset, gpu_offset+gpus):
10 |         index = index_offset + rank - gpu_offset
11 |         c = chunk(rank, Buffer.input, 0)
12 |         for r_next in range(1, gpus):
13 |             next_rank = (rank + r_next) % gpus + gpu_offset
14 |             c = c.copy(next_rank, Buffer.output, index, ch=ch)
15 | 
16 | def allreduce_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0):
17 |     for rank in range(gpu_offset, gpu_offset+gpus):
18 |         index = index_offset + rank - gpu_offset
19 |         c = chunk(rank, Buffer.input, index)
20 |         # Reduce ring
21 |         for r_next in range(1, gpus):
22 |             next_rank = (rank + r_next) % gpus + gpu_offset
23 |             c = chunk(next_rank, Buffer.input, index).reduce(c, ch=ch)
24 |         # Propagate ring
25 |         for r_next in range(0, gpus-1):
26 |             next_rank = (rank + r_next) % gpus + gpu_offset
27 |             c = c.copy(next_rank, Buffer.input, index, ch=ch)
28 | 
-------------------------------------------------------------------------------- /msccl/language/visualize.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | import igraph as ig
5 | from msccl.language.ir import *
6 | from msccl.language.rank_dag import *
7 | 
8 | def visualize_chunk_dag(chunk_paths): # pragma: no cover
9 |     frontier = []
10 |     nnodes = 0
11 |     vertex_label = []
12 |     vertex_colors = []
13 |     edges = []
14 |     visited = set()
15 | 
16 |     def add_node(op, nnodes, vertex_label, vertex_colors):
17 |         if op.num == -1:
18 |             op.num = nnodes
19 |             nnodes += 1
20 |             if op.inst == ChunkInstruction.start:
21 |                 vertex_label.append(f'Start at {op.dst.rank}, {op.dst.index}.')
22 |                 vertex_colors.append('yellow')
23 |             elif op.inst == ChunkInstruction.send:
24 |                 vertex_label.append(f'Send to Rank {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}')
25 |                 vertex_colors.append('blue')
26 |             elif op.inst == ChunkInstruction.reduce:
27 |                 vertex_label.append(f'Reduce with {op.dst.rank} {op.dst.index}.
{op.steps_to_end}, {op.steps_from_start}') 28 | vertex_colors.append('green') 29 | return nnodes 30 | 31 | for chunk, op in chunk_paths.items(): 32 | if len(op.prev) == 0: 33 | frontier.append(op) 34 | 35 | while len(frontier) > 0: 36 | op = frontier[0] 37 | if op in visited: 38 | frontier = frontier[1:] 39 | else: 40 | nnodes = add_node(op, nnodes, vertex_label, vertex_colors) 41 | for next_op in op.next: 42 | nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) 43 | edges.append([op.num, next_op.num]) 44 | frontier = frontier[1:] + op.next 45 | visited.add(op) 46 | 47 | g = ig.Graph(nnodes, edges, directed=True) 48 | layout = g.layout(layout=ig.Graph.layout_grid) 49 | ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='auto') 50 | 51 | def visualize_rank_dag(operations): # pragma: no cover 52 | frontier = [] 53 | nnodes = 0 54 | vertex_label = [] 55 | vertex_colors = [] 56 | edges = [] 57 | visited = set() 58 | colors = ['red', 'green', 'blue', 'yellow', 'teal', 'pink', 'purple', 'orange'] 59 | 60 | def add_node(op, nnodes, vertex_label, vertex_colors): 61 | if op.num == -1: 62 | op.num = nnodes 63 | nnodes += 1 64 | # Add new node to graph 65 | if op.inst == Instruction.start: 66 | vertex_label.append(f'Chunk {op.src.index} Rank {op.src.rank}') 67 | elif op.inst == Instruction.send: 68 | vertex_label.append(f'S to Rank {op.dst.rank}') 69 | elif op.inst == Instruction.recv: 70 | vertex_label.append(f'R from {op.src.rank}') 71 | elif op.inst == Instruction.recv_reduce_copy: 72 | vertex_label.append(f'RRC from {op.src.rank}') 73 | else: 74 | vertex_label.append(f'{op.inst}') 75 | 76 | # Add colors 77 | if op.inst == Instruction.start: 78 | vertex_colors.append('gray') 79 | else: 80 | vertex_colors.append(colors[op.tb % len(colors)]) 81 | return nnodes 82 | 83 | for slot, op in operations.items(): 84 | if len(op.prev) == 0: 85 | frontier.append(op) 86 | 87 | while len(frontier) > 0: 88 | op = frontier[0] 89 | 90 | if op in visited: 91 | frontier = frontier[1:] 92 | else: 93 | nnodes = add_node(op, nnodes, vertex_label, vertex_colors) 94 | 95 | for next_op in op.next: 96 | nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) 97 | edges.append([op.num, next_op.num]) 98 | frontier = frontier[1:] + list(op.next) 99 | visited.add(op) 100 | 101 | g = ig.Graph(nnodes, edges, directed=True) 102 | layout = g.layout(layout=ig.Graph.layout_grid) 103 | ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='rt') -------------------------------------------------------------------------------- /msccl/ncd_reduction.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
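# Illustrative sketch of the reduction implemented below: the non-combining
# dual swaps pre- and postconditions per address and is solved on the reverse
# topology. For example, Reduce(n, root) has each rank holding one chunk, all
# chunks sharing address 0, and the root holding the combined result; its
# dual is therefore a broadcast-like collective rooted at `root`:
#
#   from msccl.collectives import reduce
#   dual = non_combining_dual(reduce(4, 0))
#   assert dual.precondition(0, 0)                          # root sources address 0
#   assert all(dual.postcondition(r, 0) for r in range(4))  # everyone must get it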
3 | 4 | from msccl.collectives import * 5 | from msccl.topologies import reverse_topology 6 | from msccl.algorithm import Algorithm, Step 7 | from collections import defaultdict 8 | 9 | class ReductionNotApplicableError(ValueError): 10 | pass 11 | 12 | def non_combining_dual(primal): 13 | if not primal.is_combining: 14 | raise ReductionNotApplicableError('The collective is already non-combining.') 15 | 16 | if primal.has_triggers(): 17 | raise ReductionNotApplicableError('The collective has triggers.') 18 | 19 | dual_precondition = defaultdict(set) 20 | dual_postcondition = defaultdict(set) 21 | 22 | addresses = set() 23 | for chunk in primal.chunks(): 24 | addr = primal.address(chunk) 25 | addresses.add(addr) 26 | for rank in primal.ranks(): 27 | if primal.postcondition(rank, chunk): 28 | dual_precondition[addr].add(rank) 29 | if primal.precondition(rank, chunk): 30 | dual_postcondition[addr].add(rank) 31 | for addr in dual_precondition: 32 | if len(dual_precondition[addr]) > 1: 33 | raise ReductionNotApplicableError('The non-combining reduction is only applicable to collectives with a unique root per address.') 34 | 35 | return build_collective(f'Dual{primal.name}', primal.num_nodes, len(addresses), 36 | lambda r, c: r in dual_precondition[c], 37 | lambda r, c: r in dual_postcondition[c]) 38 | 39 | def recover_primal_algorithm(dual_algorithm, primal, original_topology, instance): 40 | primal_steps = [] 41 | for step in reversed(dual_algorithm.steps): 42 | primal_sends = [(chunk, dst, src) for chunk, src, dst in step.sends] 43 | primal_steps.append(Step(step.rounds, primal_sends)) 44 | return Algorithm.make_implementation(primal, original_topology, instance, primal_steps) 45 | 46 | def wrap_try_ncd_reduction(solver_cls): 47 | class NonCombiningReductionWrapper(solver_cls): 48 | def __init__(self, topology, collective): 49 | self.primal = collective 50 | try: 51 | # Create the dual collective 52 | self.dual = non_combining_dual(collective) 53 | collective = self.dual 54 | 55 | # Solve the dual in the reverse topology 56 | self.original_topology = topology 57 | topology = reverse_topology(topology) 58 | except ReductionNotApplicableError: 59 | self.dual = None 60 | super().__init__(topology, collective) 61 | 62 | def solve(self, instance): 63 | algo = super().solve(instance) 64 | if self.dual != None and algo != None: 65 | return recover_primal_algorithm(algo, self.primal, self.original_topology, instance) 66 | else: 67 | return algo 68 | 69 | return NonCombiningReductionWrapper 70 | -------------------------------------------------------------------------------- /msccl/programs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /msccl/programs/allreduce_a100_ring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.language import * 5 | 6 | # Ring all reduce for A100s 7 | # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
8 | # channels=1 is standard ring, all chunks are assigned to the same tb/channel
9 | # channels=8 devotes 1 tb/channel to handling 1 chunk of the data
10 | def allreduce_ring(size, channels):
11 |     # Reduce ring
12 |     for step in range(0, size-1):
13 |         for index in range(0, size):
14 |             rank = (index + step) % size
15 |             next_rank = (index + step + 1) % size
16 |             channel = index % channels
17 |             c = chunk(next_rank, Buffer.input, index)
18 |             c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel)
19 |     # Propagate ring
20 |     for step in range(-1, size-2):
21 |         for index in range(0, size):
22 |             rank = (index + step) % size
23 |             c = chunk(rank, Buffer.input, index)
24 |             next_rank = (index + step + 1) % size
25 |             channel = index % channels
26 |             c = c.copy(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel)
-------------------------------------------------------------------------------- /msccl/programs/allreduce_allpairs.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | from msccl.language import *
5 | 
6 | def allreduce_allpairs(size):
7 |     # Each rank sends the nth chunk to the nth rank into scratch space
8 |     for r1 in range(size):
9 |         for r2 in range(size):
10 |             if r1 != r2:
11 |                 index = r2 * size
12 |                 c = chunk(r1, Buffer.input, index, size=size)
13 |                 c.copy(r2, 'scratch', sendtb=r2, recvtb=r1)
14 | 
15 |     # Each rank performs a local reduction on the nth chunk
16 |     # Utilize 8 threadblocks for this reduction for better parallelism
17 |     for r in range(size):
18 |         for index in range(0, size * (size-1)):
19 |             c = chunk(r, Buffer.input, r*size + (index % size))
20 |             c.reduce(chunk(r, 'scratch', index), sendtb=(index % size))
21 | 
22 |     # Each rank sends the fully reduced nth chunk to all other gpus
23 |     for r1 in range(size):
24 |         for r2 in range(size):
25 |             if r1 != r2:
26 |                 index = r1 * size
27 |                 c = chunk(r1, Buffer.input, index, size)
28 |                 c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1)
29 | 
30 | 
-------------------------------------------------------------------------------- /msccl/programs/alltoall_a100_8kp1.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | from msccl.language import *
5 | 
6 | def alltoall_three_step(num_nodes, gpus_per_node, instances=1, ib_connections=1):
7 |     num_ranks = num_nodes * gpus_per_node
8 | 
9 |     # (node, local gpu) to rank
10 |     # (n, g) => r
11 |     def RankFromNodeGpuPair(n, g):
12 |         return n*gpus_per_node + g
13 | 
14 |     # For cross node traffic from node n1 to node n2, returns the ranks of the
15 |     # gpus on n1 and n2 that handle that traffic.
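    # For instance (illustrative): with gpus_per_node=8, CrossNodeGpus(0, 1)
    # returns (0, 8), i.e. gpu 0 on node 0 gathers the traffic bound for
    # node 1 and gpu 0 on node 1 scatters it; symmetrically, CrossNodeGpus(1, 0)
    # returns (8, 0).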
16 |     def CrossNodeGpus(n1, n2):
17 |         def LocalRank(n1, n2):
18 |             return (n2 if n1 > n2 else n2-1) % gpus_per_node
19 |         r1 = RankFromNodeGpuPair(n1, LocalRank(n1, n2))
20 |         r2 = RankFromNodeGpuPair(n2, LocalRank(n2, n1))
21 |         return (r1, r2)
22 | 
23 |     # Groups chunk references into one large chunk reference (used for IB)
24 |     # Save them under a key in the dictionary ib_chunks
25 |     def AddChunk(ib_chunks, key, c):
26 |         if key in ib_chunks:
27 |             ib_chunks[key] = ib_chunks[key].group(c)
28 |         else:
29 |             ib_chunks[key] = c
30 | 
31 |     ib_chunks = {} # Keeps track of chunks going over IB: buffer name -> chunk
32 |     for n1 in range(num_nodes):
33 |         for g1 in range(gpus_per_node):
34 |             for ch in range(instances):
35 |                 for n2 in range(num_nodes):
36 |                     r1 = RankFromNodeGpuPair(n1, g1)
37 |                     if (n1 != n2):
38 |                         # Send over all chunks destined for that node to the peer gpu that handles chunks to that node
39 |                         c = chunk(r1, Buffer.input, n2 * gpus_per_node * instances + ch * gpus_per_node, gpus_per_node)
40 |                         # Gather chunks destined for cross node ranks in scratch to route through IB
41 |                         gather_rank, _ = CrossNodeGpus(n1, n2)
42 |                         buffer_key = (n1, n2)
43 |                         # Send chunk to the gather_rank. Send returns a chunk reference to the
44 |                         # receiver's chunk
45 |                         c = c.copy(gather_rank, buffer=buffer_key, ch=ch*2)
46 |                         # Group the chunks using a particular IB pair into one large chunk reference
47 |                         AddChunk(ib_chunks, buffer_key, c)
48 |                     else:
49 |                         # Within a node - directly copy the chunks over nvlink to the output buffer.
50 |                         # Use a different channel to ensure that we don't get in the way of copies/receives above,
51 |                         # which are on the critical path.
52 |                         for g2 in range(gpus_per_node):
53 |                             r2 = RankFromNodeGpuPair(n2, g2)
54 |                             c = chunk(r1, Buffer.input, r2 * instances + ch)
55 |                             c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2)
56 | 
57 | 
58 | 
59 |     # IB Send and local scatters
60 |     for buffer_key, ib_chunk in ib_chunks.items():
61 |         (n1, n2) = buffer_key
62 |         _, scatter_rank = CrossNodeGpus(n1, n2)
63 |         # IB copy divided across multiple parallel channels
64 |         chunks = ib_chunk.split(ib_connections)
65 |         for ch, c in enumerate(chunks):
66 |             # Note: If we are only going to use 1 IB connection for each IB copy,
67 |             # alternate between channels 0 and 1 to utilize both IB links.
68 |             if ib_connections == 1:
69 |                 ib_channel = c.rank % 2
70 |             else:
71 |                 ib_channel = ch
72 |             c = c.copy(scatter_rank, buffer=buffer_key, ch=ib_channel)
73 |             # Local scatter
74 |             cs = c.split(gpus_per_node * gpus_per_node)
75 |             for i, c in enumerate(cs):
76 |                 # Access the chunk's destination rank and index to route it to its final place
77 |                 final_rank = c.get_dst_rank()
78 |                 index = c.get_dst_index()
79 |                 c.copy(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1)
-------------------------------------------------------------------------------- /msccl/programs/alltoall_a100_yifan.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
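# Illustrative sketch of the addressing used below: ranks are laid out as
# rank = node * gpus_per_node + gpu, and for alltoall the input chunk at
# index n2 * gpus_per_node + g2 is destined for gpu g2 on node n2 (i.e. the
# chunk index equals the destination rank). With num_nodes=2 and
# gpus_per_node=2, chunk 3 on rank 0 ends up on rank 3 (node 1, gpu 1).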
3 | 4 | from msccl.language import * 5 | 6 | def alltoall_hierarchical(num_nodes, gpus_per_node): 7 | num_ranks = num_nodes * gpus_per_node 8 | 9 | for n1 in range(num_nodes): 10 | for r in range(1,num_nodes): 11 | n2 = (n1 + r) % num_nodes 12 | 13 | # Gather all local chunks for the node neighbor 14 | for g1 in range(gpus_per_node): 15 | rank1 = n1 * gpus_per_node + g1 16 | 17 | for g2 in range(gpus_per_node): 18 | rank2 = n1 * gpus_per_node + g2 19 | # chunk to copy: g2 on n2 20 | index = n2 * gpus_per_node + g2 21 | c = chunk(rank1, Buffer.input, index) 22 | c = c.copy(rank2, f'copy_{n2}') 23 | 24 | for r in range(1,num_nodes): 25 | n2 = (n1 + r) % num_nodes 26 | # IB copy 27 | for g1 in range(gpus_per_node): 28 | rank = n1 * gpus_per_node + g1 29 | ib_peer = n2 * gpus_per_node + g1 30 | c = chunk(rank, f'copy_{n2}', 0, gpus_per_node) 31 | c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) 32 | 33 | 34 | # Handle local chunks within a node 35 | for rank in range(num_ranks): 36 | for g in range(gpus_per_node): 37 | index = (rank // gpus_per_node) * gpus_per_node + g 38 | c = chunk(rank, Buffer.input, index) 39 | c.copy(c.get_dst_rank(), Buffer.output, c.get_dst_index()) 40 | -------------------------------------------------------------------------------- /msccl/rounds_bound.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.ncd_reduction import non_combining_dual 5 | from msccl.topologies import reverse_topology 6 | from z3 import * 7 | from fractions import Fraction 8 | 9 | def _flow(chunk, src, dst): 10 | return Real(f'flow_{chunk}_from_{src}_to_{dst}') 11 | 12 | def lower_bound_rounds(topology, collective, logging=False): 13 | ''' 14 | Solves for a lower bound on the number of rounds required by any algorithm. Uses a multi-commodity flow feasibility inspired encoding in Z3.
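    A minimal usage sketch (assuming the z3-solver dependency is installed):

        from msccl.topologies import ring
        from msccl.collectives import allgather
        rounds_lb = lower_bound_rounds(ring(4), allgather(4), logging=True)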
15 | ''' 16 | 17 | opt = Optimize() 18 | 19 | # Remember names before possible non-combining dual reduction 20 | collective_name = collective.name 21 | topology_name = topology.name 22 | 23 | # Use non-combining dual if necessary 24 | if collective.is_combining: 25 | collective = non_combining_dual(collective) 26 | topology = reverse_topology(topology) 27 | 28 | chunks = collective.chunks() 29 | ranks = collective.ranks() 30 | 31 | for chunk in chunks: 32 | for rank in ranks: 33 | # All flows are between 0 and 1 34 | for dst in topology.destinations(rank): 35 | opt.add(_flow(chunk,rank,dst) >= 0) 36 | opt.add(_flow(chunk,rank,dst) <= 1) 37 | total_in = sum(_flow(chunk,src,rank) for src in topology.sources(rank)) 38 | if not collective.precondition(rank, chunk): 39 | # Ranks not in the precondition need to justify outflows 40 | for dst in topology.destinations(rank): 41 | opt.add(_flow(chunk,rank,dst) <= total_in) 42 | # Ranks in the postcondition, but not in the precondition need the whole chunk 43 | if collective.postcondition(rank, chunk): 44 | opt.add(total_in == 1) 45 | 46 | # Represents how many rounds all the steps of the algorithm would use 47 | rounds = Real(f'rounds') 48 | 49 | for srcs, dsts, bw, _ in topology.bandwidth_constraints(): 50 | # Sum of all flows relevant to this constraint 51 | sum_flow = sum(_flow(chunk,src,dst) for src in srcs for dst in dsts for chunk in chunks) 52 | # Total flow must be less than the limit, taking rounds into consideration 53 | opt.add(sum_flow <= bw * rounds) 54 | 55 | # Minimize the number of rounds 56 | min_rounds = opt.minimize(rounds) 57 | result = opt.check() 58 | if result == sat: 59 | bound_ref = opt.lower(min_rounds) 60 | if isinstance(bound_ref, IntNumRef): 61 | rounds_lb = Fraction(bound_ref.as_long(), 1) 62 | elif isinstance(bound_ref, RatNumRef): 63 | rounds_lb = bound_ref.as_fraction() 64 | else: 65 | raise RuntimeError(f'Unhandled Z3 numeral type: {type(bound_ref)}') 66 | if logging: 67 | print(f'{collective_name} algorithms need at least {rounds_lb} rounds in {topology_name} topology.') 68 | return rounds_lb 69 | else: 70 | if logging: 71 | if result == unsat: 72 | print(f'Unsat. {collective_name} is not implementable in {topology_name} topology.') 73 | else: 74 | assert result == unknown, 'Unhandled Z3 result' 75 | print('Unknown. Z3 was not able to solve the lower bound.') 76 | return None 77 | -------------------------------------------------------------------------------- /msccl/serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
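A round-trip sketch for the save_msccl_object/load_msccl_object helpers defined at the bottom of this file, mirroring tests/test_serialization.py (the collective and filename here are illustrative):

    from msccl.algorithm import Algorithm, Step
    from msccl.collectives import build_collective
    from msccl.topologies import fully_connected
    from msccl.instance import Instance
    from msccl.serialization import save_msccl_object, load_msccl_object

    # A trivial collective: every rank starts with the chunk, nothing is required
    coll = build_collective('Null(n=2)', 2, 1, lambda r, c: True, lambda r, c: False)
    algo = Algorithm('example', coll, fully_connected(2), Instance(3), [Step(1, [(0, 0, 1)])])
    save_msccl_object(algo, 'example.msccl.json')
    assert load_msccl_object('example.msccl.json').name == 'example'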
3 | 4 | from msccl.algorithm import Algorithm, Step 5 | from msccl.topologies import Topology 6 | from msccl.instance import Instance 7 | from msccl.collectives import Collective, Chunk 8 | 9 | import json 10 | import warnings 11 | 12 | def _msccl_object_hook(o): 13 | if not 'msccl_type' in o: 14 | return o 15 | if o['msccl_type'] == 'algorithm': 16 | input_map = { int(k): set(v) for k, v in o['input_map'].items() } 17 | output_map = { int(k): set(v) for k, v in o['output_map'].items() } 18 | return Algorithm(o['name'], o['collective'], o['topology'], o['instance'], o['steps'], input_map, output_map) 19 | if o['msccl_type'] == 'step': 20 | sends = [(addr, src, dst) for addr, src, dst in o['sends']] 21 | return Step(o['rounds'], sends) 22 | if o['msccl_type'] == 'collective': 23 | triggers = { (int(r), int(c)): v for r, rmap in o['triggers'].items() for c, v in rmap.items() } 24 | return Collective(o['name'], o['nodes'], o['chunks'], triggers, o['runtime_name']) 25 | if o['msccl_type'] == 'chunk': 26 | pre = set(o['pre']) 27 | post = set(o['post']) 28 | return Chunk(pre, post, o['addr']) 29 | if o['msccl_type'] == 'topology': 30 | return Topology(o['name'], o['links'], o['switches']) 31 | if o['msccl_type'] == 'instance': 32 | return Instance(o['steps'], o['extra_rounds'], o['chunks'], o['pipeline'], o['extra_memory'], o['allow_exchange']) 33 | warnings.warn('Unhandled msccl_type in JSON') 34 | 35 | def MSCCLDecoder(): 36 | return json.JSONDecoder(object_hook=_msccl_object_hook) 37 | 38 | class MSCCLEncoder(json.JSONEncoder): 39 | def __init__(self): 40 | super().__init__() 41 | 42 | def default(self, o): 43 | if isinstance(o, Algorithm): 44 | input_map = { k: list(v) for k, v in o.input_map.items() } 45 | output_map = { k: list(v) for k, v in o.output_map.items() } 46 | return { 47 | 'msccl_type': 'algorithm', 48 | 'name': o.name, 49 | 'instance': o.instance, 50 | 'input_map': input_map, 51 | 'output_map': output_map, 52 | 'steps': o.steps, 53 | 'collective': o.collective, 54 | 'topology': o.topology, 55 | } 56 | if isinstance(o, Step): 57 | return { 58 | 'msccl_type': 'step', 59 | 'rounds': o.rounds, 60 | 'sends': o.sends, 61 | } 62 | if isinstance(o, Collective): 63 | triggers = {} 64 | for (r, c), v in o._triggers.items(): 65 | if not r in triggers: 66 | triggers[r] = {} 67 | triggers[r][c] = v 68 | return { 69 | 'msccl_type': 'collective', 70 | 'name': o.name, 71 | 'nodes': o.num_nodes, 72 | 'chunks': o._chunks, 73 | 'triggers': triggers, 74 | 'runtime_name': o.runtime_name, 75 | } 76 | if isinstance(o, Chunk): 77 | return { 78 | 'msccl_type': 'chunk', 79 | 'pre': list(o.precondition), 80 | 'post': list(o.postcondition), 81 | 'addr': o.address, 82 | } 83 | if isinstance(o, Topology): 84 | return { 85 | 'msccl_type': 'topology', 86 | 'name': o.name, 87 | 'switches': o.switches, 88 | 'links': o.links, 89 | } 90 | if isinstance(o, Instance): 91 | return { 92 | 'msccl_type': 'instance', 93 | 'steps': o.steps, 94 | 'extra_rounds': o.extra_rounds, 95 | 'chunks': o.chunks, 96 | 'pipeline': o.pipeline, 97 | 'extra_memory': o.extra_memory, 98 | 'allow_exchange': o.allow_exchange, 99 | } 100 | return json.JSONEncoder.default(self, o) 101 | 102 | def save_msccl_object(obj, filename): 103 | with open(filename, 'w') as f: 104 | f.write(MSCCLEncoder().encode(obj)) 105 | 106 | def load_msccl_object(filename): 107 | with open(filename) as f: 108 | return MSCCLDecoder().decode(f.read()) 109 | -------------------------------------------------------------------------------- /msccl/steps_bound.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import math 5 | 6 | def _distances(topology): 7 | # Floyd–Warshall algorithm for all-pairs shortest paths 8 | nodes = range(topology.num_nodes()) 9 | dist = [[math.inf for _ in nodes] for _ in nodes] 10 | for dst in nodes: 11 | for src in topology.sources(dst): 12 | dist[src][dst] = 1 13 | for node in nodes: 14 | dist[node][node] = 0 15 | for k in nodes: 16 | for i in nodes: 17 | for j in nodes: 18 | if dist[i][j] > dist[i][k] + dist[k][j]: 19 | dist[i][j] = dist[i][k] + dist[k][j] 20 | return dist 21 | 22 | def lower_bound_steps(topology, collective): 23 | ''' Finds a lower bound for the steps required as the maximum, over all chunks and their destinations, of the shortest distance from a rank holding the chunk in the precondition. ''' 24 | 25 | dist = _distances(topology) 26 | 27 | # Find the maximum of the least steps required for each chunk 28 | least_steps = 0 29 | for chunk in collective.chunks(): 30 | for dst in collective.ranks(): 31 | if collective.postcondition(dst, chunk): 32 | # Find the shortest distance from some rank in the precondition 33 | least_distance = math.inf 34 | for src in collective.ranks(): 35 | if collective.precondition(src, chunk): 36 | least_distance = min(least_distance, dist[src][dst]) 37 | # Update the least steps required if even the nearest precondition rank is farther away 38 | least_steps = max(least_steps, least_distance) 39 | 40 | if least_steps == math.inf: 41 | # Return None if the collective is unimplementable with any number of steps 42 | return None 43 | else: 44 | return least_steps 45 | -------------------------------------------------------------------------------- /msccl/strategies.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.instance import Instance 5 | from msccl.path_encoding import PathEncoding 6 | from msccl.rounds_bound import lower_bound_rounds 7 | from msccl.steps_bound import lower_bound_steps 8 | 9 | import time 10 | import math 11 | from fractions import Fraction 12 | import itertools 13 | from collections import defaultdict 14 | 15 | def _solve_and_log(encoding, instance, logging): 16 | if logging: 17 | print(f'Solving instance {instance}... ', end='', flush=True) 18 | 19 | start_time = time.time() 20 | result = encoding.solve(instance) 21 | duration = time.time() - start_time 22 | 23 | if logging: 24 | if result != None: 25 | print(f'synthesized! ({duration:.1f}s)') 26 | else: 27 | print(f'unsatisfiable.
({duration:.1f}s)') 28 | 29 | return result 30 | 31 | def solve_instance(topology, collective, instance, logging = False): 32 | encoding = PathEncoding(topology, collective) 33 | return _solve_and_log(encoding, instance, logging) 34 | 35 | def solve_least_steps(topology, collective, initial_steps = 1, base_instance = Instance(None), logging = False): 36 | if initial_steps < 1: 37 | raise ValueError('initial_steps must be strictly positive') 38 | 39 | encoding = PathEncoding(topology, collective) 40 | 41 | # Lower bound the number of steps required 42 | steps_lb = lower_bound_steps(topology, collective) 43 | if steps_lb == None: 44 | # No number of steps suffices, so fail regardless of the logging setting 45 | raise ValueError('The collective is unimplementable in this topology.') 46 | if logging: 47 | print(f'Algorithms need at least {steps_lb} steps.') 48 | 49 | num_steps = max(initial_steps, steps_lb) 50 | if num_steps > steps_lb: 51 | result = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) 52 | if result != None: 53 | if logging: 54 | print('Synthesized on initial guess. Checking for fewer steps.') 55 | while num_steps > steps_lb: 56 | num_steps -= 1 57 | maybe_better = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) 58 | if maybe_better != None: 59 | result = maybe_better 60 | else: 61 | break 62 | return result 63 | else: 64 | num_steps += 1 65 | 66 | while True: 67 | result = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) 68 | if result != None: 69 | return result 70 | else: 71 | num_steps += 1 72 | 73 | def solve_all_latency_bandwidth_tradeoffs(topology, collective, min_chunks = 1, max_chunks = None, assume_rounds_per_chunk_lb = None, assume_monotonic_feasibility = False, base_instance = Instance(None), logging = False): 74 | if min_chunks < 1: 75 | raise ValueError('min_chunks must be strictly positive.') 76 | if max_chunks != None and max_chunks < min_chunks: 77 | raise ValueError('max_chunks must be greater than or equal to min_chunks.') 78 | if assume_rounds_per_chunk_lb != None and assume_rounds_per_chunk_lb < 0: 79 | raise ValueError('assume_rounds_per_chunk_lb must be non-negative.') 80 | 81 | # Lower bound the number of steps required 82 | steps_lb = lower_bound_steps(topology, collective) 83 | if logging: 84 | print(f'Algorithms need at least {steps_lb} steps.') 85 | 86 | # Lower bound the number of rounds per unit of chunkiness required 87 | if assume_rounds_per_chunk_lb != None: 88 | rounds_per_chunk_lb = assume_rounds_per_chunk_lb 89 | if logging: 90 | print(f'Assuming algorithms need at least {rounds_per_chunk_lb} rounds per chunk.') 91 | else: 92 | rounds_per_chunk_lb = lower_bound_rounds(topology, collective) 93 | if logging: 94 | print(f'Algorithms need at least {rounds_per_chunk_lb} rounds per chunk.') 95 | 96 | # Remember for which rounds per chunk fraction a given number of steps will be unsat 97 | step_rpc_lb = defaultdict(lambda: Fraction(0)) 98 | 99 | chunks_iter = range(min_chunks, max_chunks+1) if max_chunks != None else itertools.count(min_chunks) 100 | 101 | # Algorithms are yielded lazily as they are found 102 | for chunks in chunks_iter: 103 | encoding = PathEncoding(topology, collective) 104 | rounds_lb = math.ceil(rounds_per_chunk_lb * chunks) 105 | 106 | rounds = rounds_lb - 1 107 | found = False 108 | while not found: 109 | rounds += 1 110 | rpc = Fraction(rounds, chunks) 111 | # Skip this fraction if a lower number of chunks will have already considered it 112 | if math.gcd(chunks, rounds) != 1: 113 | continue 114 | for steps in range(steps_lb, rounds+1): 115 | # Skip this number of
steps if a previous instance with stricter rounds per chunk already failed 116 | if assume_monotonic_feasibility and rpc < step_rpc_lb[steps]: 117 | continue 118 | instance = base_instance.set(steps=steps, extra_rounds=rounds - steps, chunks=chunks) 119 | result = _solve_and_log(encoding, instance, logging=logging) 120 | if result != None: 121 | assert rpc >= step_rpc_lb[steps], 'Monotonic feasibility assumption would have been violated.' 122 | found = True 123 | yield result 124 | break 125 | else: 126 | # Update the rounds per chunk for which this number of steps is not sufficient 127 | step_rpc_lb[steps] = max(step_rpc_lb[steps], rpc) 128 | if logging and assume_monotonic_feasibility: 129 | print(f'Assuming {steps} step algorithms need at least {rpc} rounds per chunk.') 130 | # Check if a bandwidth optimal algorithm has been found 131 | if found and rpc <= rounds_per_chunk_lb: 132 | assert rpc == rounds_per_chunk_lb, 'Rounds per chunk lower bound did not hold.' 133 | if logging: 134 | print(f'Bandwidth optimal algorithm found!') 135 | break 136 | else: 137 | if logging: 138 | print(f'Reached the limit for chunks.') 139 | 140 | def _steps(algo): 141 | return len(algo.steps) 142 | 143 | def _rpc(algo): 144 | return Fraction(_steps(algo) + algo.extra_rounds(), algo.instance.chunks) 145 | 146 | def prune_pareto_optimal(algorithms): 147 | efficient_algorithms = [] 148 | for i, algo in enumerate(algorithms): 149 | is_efficient = True 150 | for j, other in enumerate(algorithms): 151 | either_worse = _steps(algo) > _steps(other) or _rpc(algo) > _rpc(other) 152 | neither_better = _steps(algo) >= _steps(other) and _rpc(algo) >= _rpc(other) 153 | if either_worse and neither_better: 154 | is_efficient = False 155 | break 156 | if is_efficient: 157 | efficient_algorithms.append(algo) 158 | 159 | return efficient_algorithms 160 | -------------------------------------------------------------------------------- /msccl/topologies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .generic import * 5 | from .transformers import * 6 | from .amd import * 7 | from .nvidia import * 8 | from .distributed import * 9 | -------------------------------------------------------------------------------- /msccl/topologies/amd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .topology import Topology 5 | 6 | def amd4(): 7 | links = [ 8 | [0, 1, 1, 0], 9 | [1, 0, 0, 1], 10 | [1, 0, 0, 1], 11 | [0, 1, 1, 0] 12 | ] 13 | return Topology('AMD4', links) 14 | 15 | def amd8(): 16 | links = [ 17 | [0, 5, 6, 6, 5, 6, 5, 5], 18 | [5, 0, 5, 5, 6, 5, 6, 6], 19 | [6, 5, 0, 6, 5, 6, 5, 5], 20 | [6, 5, 6, 0, 5, 6, 5, 5], 21 | [5, 6, 5, 5, 0, 5, 6, 6], 22 | [6, 5, 6, 6, 5, 0, 5, 5], 23 | [5, 6, 5, 5, 6, 5, 0, 6], 24 | [5, 6, 5, 5, 6, 5, 6, 0] 25 | ] 26 | return Topology('AMD8', links) 27 | -------------------------------------------------------------------------------- /msccl/topologies/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
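A sketch of consuming the tradeoff sweep from msccl/strategies.py above: enumerate algorithms up to a chunk limit, then keep only the Pareto frontier (assumes z3-solver is installed; the sweep can take a while on larger topologies):

    from msccl.topologies import fully_connected
    from msccl.collectives import allgather
    from msccl.strategies import solve_all_latency_bandwidth_tradeoffs, prune_pareto_optimal

    topo = fully_connected(4)
    algos = list(solve_all_latency_bandwidth_tradeoffs(topo, allgather(4), max_chunks=4, logging=True))
    for algo in prune_pareto_optimal(algos):
        print(algo.name)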
3 | 4 | from .topology import Topology 5 | 6 | def _copy_links(remote_bw, num_local, num_dist, local_links): 7 | return [[remote_bw if src // num_local != dst // num_local else local_links[dst % num_local][src % num_local] 8 | for src in range(num_dist)] for dst in range(num_dist)] 9 | 10 | def _copy_switches(num_local, num_copies, local_switches): 11 | switches = [] 12 | for srcs, dsts, bw, name in local_switches: 13 | for i in range(num_copies): 14 | dist_srcs = [src + i * num_local for src in srcs] 15 | dist_dsts = [dst + i * num_local for dst in dsts] 16 | switches.append((dist_srcs, dist_dsts, bw, f'copy_{i}_{name}_local')) 17 | return switches 18 | 19 | def distributed_fully_connected(local_topology, num_copies, remote_bw): 20 | num_local = local_topology.num_nodes() 21 | num_dist = num_local * num_copies 22 | 23 | links = _copy_links(remote_bw, num_local, num_dist, local_topology.links) 24 | switches = _copy_switches(num_local, num_copies, local_topology.switches) 25 | 26 | return Topology(f'DistributedFullyConnected(local={local_topology.name},copies={num_copies},bw={remote_bw})', links, switches) 27 | 28 | def distributed_hub_and_spoke(local_topology, num_copies, remote_bw): 29 | num_local = local_topology.num_nodes() 30 | num_dist = num_local * num_copies 31 | 32 | links = _copy_links(remote_bw, num_local, num_dist, local_topology.links) 33 | switches = _copy_switches(num_local, num_copies, local_topology.switches) 34 | 35 | for i in range(num_copies): 36 | local_ranks = [j + i * num_local for j in range(num_local)] 37 | remote_ranks = [k for k in range(num_dist) if k // num_local != i] 38 | switches.append((local_ranks, remote_ranks, remote_bw, f'copy_{i}_out_remote')) 39 | switches.append((remote_ranks, local_ranks, remote_bw, f'copy_{i}_in_remote')) 40 | 41 | return Topology(f'DistributedHubAndSpoke(local={local_topology.name},copies={num_copies},bw={remote_bw})', links, switches) 42 | -------------------------------------------------------------------------------- /msccl/topologies/generic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
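A small sketch of the distributed constructors above, as exercised in tests/test_topologies.py:

    from msccl.topologies import ring, distributed_fully_connected, distributed_hub_and_spoke

    local = ring(4)
    dist = distributed_fully_connected(local, num_copies=2, remote_bw=1)
    assert dist.num_nodes() == 8
    # Hub-and-spoke adds two switch constraints (in and out) per copy for remote traffic
    hub = distributed_hub_and_spoke(local, 2, 1)
    assert len(hub.switches) == len(dist.switches) + 4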
3 | 4 | from .topology import Topology 5 | 6 | def hub_and_spoke(num_nodes): 7 | links = [[0 if x==y else 1 for y in range(num_nodes)] for x in range(num_nodes)] 8 | switches = [] 9 | for node in range(num_nodes): 10 | others = [other for other in range(num_nodes) if other != node] 11 | switches.append(([node],others,1,f'node_{node}_out')) 12 | switches.append((others,[node],1,f'node_{node}_in')) 13 | return Topology(f'HubAndSpoke(n={num_nodes})', links, switches) 14 | 15 | def fully_connected(num_nodes): 16 | links = [] 17 | for i in range(num_nodes): 18 | row = [1] * num_nodes 19 | row[i] = 0 20 | links.append(row) 21 | return Topology(f'FullyConnected(n={num_nodes})', links) 22 | 23 | def ring(num_nodes): 24 | links = [] 25 | for i in range(num_nodes): 26 | row = [0] * num_nodes 27 | row[(i+1) % num_nodes] = 1 28 | row[(i-1) % num_nodes] = 1 29 | links.append(row) 30 | return Topology(f'Ring(n={num_nodes})', links) 31 | 32 | def line(num_nodes): 33 | links = [] 34 | for i in range(num_nodes): 35 | row = [0] * num_nodes 36 | if i - 1 >= 0: 37 | row[i-1] = 1 38 | if i + 1 < num_nodes: 39 | row[i+1] = 1 40 | links.append(row) 41 | return Topology(f'Line(n={num_nodes})', links) 42 | 43 | def star(num_nodes, non_blocking=True): 44 | links = [[0 if i == 0 else 1 for i in range(num_nodes)]] 45 | for i in range(1, num_nodes): 46 | links.append([1 if j == 0 else 0 for j in range(num_nodes)]) 47 | switches = [] 48 | if not non_blocking: 49 | points = [i for i in range(num_nodes) if i != 0] 50 | switches.append(([0],points,1,f'to_points')) 51 | switches.append((points,[0],1,f'from_points')) 52 | return Topology(f'Star(n={num_nodes})', links, switches) 53 | -------------------------------------------------------------------------------- /msccl/topologies/nvidia.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .topology import Topology 5 | 6 | from fractions import Fraction 7 | import subprocess 8 | 9 | def dgx1(): 10 | # (0 1 2 3) (4 5 6 7) are two sockets 11 | # 0 1 3 2 is the high bandwidth chain in socket 1 12 | # 4 5 7 6 is the high bandwidth chain in socket 2 13 | # 0 4 and 2 6 are high bandwidth intersocket links 14 | 15 | links = [ 16 | #0 1 2 3 4 5 6 7 17 | [0, 2, 1, 1, 2, 0, 0, 0], 18 | [2, 0, 1, 2, 0, 1, 0, 0], 19 | [1, 1, 0, 2, 0, 0, 2, 0], 20 | [1, 2, 2, 0, 0, 0, 0, 1], 21 | [2, 0, 0, 0, 0, 2, 1, 1], 22 | [0, 1, 0, 0, 2, 0, 1, 2], 23 | [0, 0, 2, 0, 1, 1, 0, 2], 24 | [0, 0, 0, 1, 1, 2, 2, 0] 25 | ] 26 | 27 | # self.symmetries = [ 28 | # [0, 1, 2, 3, 4, 5, 6, 7], #0 goes to itself 29 | # [0, 1, 2, 3, 4, 5, 6, 7], #1 goes to itself 30 | # [2, 3, 0, 1, 6, 7, 4, 5], #2 goes to 0, 3 goes to 1, ... top - bottom symmetry 31 | # [2, 3, 0, 1, 6, 7, 4, 5], #3 goes to 1, 2 goes to 0, ... top - bottom symmetry 32 | # [4, 5, 6, 7, 0, 1, 2, 3], #4 goes to 0, 5 goes to 1, ... left - right symmetry 33 | # [4, 5, 6, 7, 0, 1, 2, 3], #5 goes to 1, 4 goes to 0, ... left - right symmetry 34 | # [6, 7, 4, 5, 2, 3, 0, 1], #6 goes to 0, 7 goes to 1, ... top-bottom + left-right 35 | # [6, 7, 4, 5, 2, 3, 0, 1] #7 goes to 1, 6 goes to 0, ... 
top-bottom + left-right 36 | # ] 37 | 38 | # self.beta_bound = Fraction(7,6) 39 | # self.diameter = 2 40 | 41 | return Topology('DGX1', links) 42 | 43 | def dgx_a100(): 44 | links = [[12]*8 for i in range(8)] 45 | for i in range(8): 46 | links[i][i] = 0 47 | 48 | return Topology('DGX_A100', links) 49 | 50 | def nvlink_only(nvidia_smi_topo=None): 51 | if nvidia_smi_topo == None: 52 | nvidia_smi_topo = _get_nvidia_smi_topo() 53 | links = _parse_nvidia_smi_topo(nvidia_smi_topo) 54 | return Topology('NVLinkOnly', links) 55 | 56 | def _get_nvidia_smi_topo(): 57 | output = subprocess.check_output("nvidia-smi topo -m".split()) 58 | return output.decode("utf-8") 59 | 60 | def _parse_nvidia_smi_topo(output): 61 | lines = output.splitlines() 62 | before_legend = [] 63 | for l in lines[1:]: 64 | if l and l.startswith("GPU"): 65 | # Only look at the rows for GPU 66 | before_legend.append(l) 67 | else: 68 | break 69 | devices = [x.split("\t")[0] for x in before_legend] 70 | gpus = [i for i in range(len(before_legend)) 71 | if before_legend[i].startswith("GPU")] 72 | matrix = [x.split("\t")[1:] for x in before_legend] 73 | nvlink_matrix = [[_nvlink_num(x[g]) for g in gpus] for x in matrix] 74 | return nvlink_matrix 75 | 76 | def _nvlink_num(x): 77 | x = x.strip() 78 | if x.startswith("NV"): 79 | return int(x[2:]) 80 | else: 81 | return 0 82 | -------------------------------------------------------------------------------- /msccl/topologies/topology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | class Topology(object): 5 | def __init__(self, name, links, switches=[]): 6 | self.name = name 7 | self.links = links 8 | self.switches = switches 9 | for srcs, dsts, bw, switch_name in switches: 10 | if bw == 0: 11 | raise ValueError(f'Switch {switch_name} has zero bandwidth, but switch bandwidths must be strictly positive. Please encode connectedness in links.') 12 | if bw < 0: 13 | raise ValueError(f'Switch {switch_name} has a negative bandwidth of {bw}. Bandwidth must be strictly positive.') 14 | 15 | def sources(self, dst): 16 | for src, bw in enumerate(self.links[dst]): 17 | if bw > 0: 18 | yield src 19 | 20 | def destinations(self, src): 21 | for dst, links in enumerate(self.links): 22 | bw = links[src] 23 | if bw > 0: 24 | yield dst 25 | 26 | def link(self, src, dst): 27 | return self.links[dst][src] 28 | 29 | def num_nodes(self): 30 | return len(self.links) 31 | 32 | def nodes(self): 33 | return range(self.num_nodes()) 34 | 35 | def bandwidth_constraints(self): 36 | for dst, dst_links in enumerate(self.links): 37 | for src, bw in enumerate(dst_links): 38 | if bw > 0: 39 | yield ([src], [dst], bw, f'{src}→{dst}') 40 | for srcs, dsts, bw, switch_name in self.switches: 41 | yield (srcs, dsts, bw, switch_name) 42 | -------------------------------------------------------------------------------- /msccl/topologies/transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .topology import Topology 5 | 6 | def reverse_topology(topology): 7 | ''' 8 | Reverses the direction of all links and switches in the topology. 
9 | ''' 10 | num_nodes = topology.num_nodes() 11 | # Transpose the links 12 | links = [[topology.links[src][dst] for src in range(num_nodes)] for dst in range(num_nodes)] 13 | # Reverse the switches 14 | switches = [(dsts, srcs, bw, f'{name}_reversed') for srcs, dsts, bw, name in topology.switches] 15 | return Topology(f'Reverse{topology.name}', links, switches) 16 | 17 | def binarize_topology(topology): 18 | ''' 19 | Makes all link bandwidths 1 and removes all switches. Essentially, the bandwidth modeling part of the topology 20 | is stripped out and only connectivity information is kept. 21 | ''' 22 | num_nodes = topology.num_nodes() 23 | links = [[1 if topology.links[src][dst] > 0 else 0 for src in range(num_nodes)] for dst in range(num_nodes)] 24 | return Topology(f'Binarized{topology.name}', links, []) 25 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --cov=msccl --cov-report term-missing:skip-covered --cov-fail-under 90 -n auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dataclasses; python_version < "3.7" 2 | z3-solver 3 | argcomplete 4 | lxml 5 | humanfriendly 6 | tabulate 7 | pytest 8 | pytest-cov 9 | pytest-xdist 10 | -e . 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name='msccl', 8 | version='2.3.0', 9 | packages=find_packages(), 10 | entry_points={ 11 | 'console_scripts': [ 12 | 'msccl = msccl.__main__:main', 13 | ], 14 | }, 15 | scripts = [ 16 | 'msccl/autosynth/msccl_ndv2_launcher.sh' 17 | ], 18 | install_requires=[ 19 | 'dataclasses; python_version < "3.7"', 20 | 'z3-solver', 21 | 'argcomplete', 22 | 'lxml', 23 | 'humanfriendly', 24 | 'tabulate', 25 | 'igraph' 26 | ], 27 | python_requires='>=3.6', 28 | ) 29 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.collectives import * 5 | 6 | def null_collective(num_nodes): 7 | return build_collective(f'Null(n={num_nodes})', num_nodes, 1, 8 | lambda r, c: True, lambda r, c: False) 9 | 10 | def impossible_collective(num_nodes): 11 | return build_collective(f'Impossible(n={num_nodes})', num_nodes, 1, 12 | lambda r, c: False, lambda r, c: True) 13 | -------------------------------------------------------------------------------- /tests/test_algorithm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
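The helpers in tests/common.py above show build_collective's (name, num_nodes, num_chunks, precondition, postcondition) shape; a sketch of a custom collective in the same style (broadcast_from_zero is a hypothetical name, not part of the library):

    from msccl.collectives import build_collective

    def broadcast_from_zero(num_nodes):
        # Chunk 0 starts on rank 0 and must end up on every rank
        return build_collective(f'BroadcastZero(n={num_nodes})', num_nodes, 1,
                                lambda r, c: r == 0, lambda r, c: True)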
3 | 4 | import pytest 5 | from .common import * 6 | from msccl.algorithm import Algorithm, Step 7 | from msccl.topologies import fully_connected 8 | from msccl.instance import Instance 9 | 10 | def test_invalid_empty(): 11 | with pytest.raises(RuntimeError): 12 | num_nodes = 2 13 | topo = fully_connected(num_nodes) 14 | algo = Algorithm.make_implementation(impossible_collective(num_nodes), topo, Instance(1), [Step(1,[])]) 15 | 16 | def test_valid_empty(): 17 | num_nodes = 2 18 | topo = fully_connected(num_nodes) 19 | algo = Algorithm.make_implementation(null_collective(num_nodes), topo, Instance(1), [Step(1,[])]) 20 | assert algo != None 21 | -------------------------------------------------------------------------------- /tests/test_analyses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | from msccl.topologies import Topology 6 | from msccl.collectives import build_collective 7 | from msccl.rounds_bound import * 8 | 9 | def test_rounds_bound_unimplementable(): 10 | topo = Topology('Unconnected', [[0,0],[0,0]]) 11 | coll = build_collective('Send', 2, 1, lambda r, c: r == 0, lambda r, c: r == 1) 12 | assert lower_bound_rounds(topo, coll) == None 13 | -------------------------------------------------------------------------------- /tests/test_autosynth.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import msccl 6 | import os 7 | from msccl.autosynth.registry import register_synthesis_plan 8 | 9 | 10 | def test_msccl_init(capsys): 11 | msccl.init('not_a_machine_type', 4, ('alltoall', 0)) 12 | out, err = capsys.readouterr() 13 | assert 'No plan found' in out 14 | assert not 'MSCCL_CONFIG' in os.environ 15 | assert 'NCCL_ALGO' not in os.environ 16 | 17 | msccl.init('ndv2', 2, ('alltoall', '1MB')) 18 | out, err = capsys.readouterr() 19 | assert 'synthesize_ndv2_relay_alltoall' in out 20 | assert 'MSCCL_CONFIG' in os.environ 21 | assert 'NCCL_IB_AR_THRESHOLD' not in os.environ 22 | assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'MSCCL,RING,TREE' 23 | 24 | os.environ['NCCL_ALGO'] = 'RING,FAKE_MSCCL' 25 | msccl.init('ndv4', 8, (msccl.Collective.alltoall, '2MB')) 26 | out, err = capsys.readouterr() 27 | assert 'ndv4_alltoall' in out 28 | assert 'NCCL_IB_AR_THRESHOLD' in os.environ 29 | assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'MSCCL,RING,FAKE_MSCCL' 30 | 31 | os.environ['NCCL_ALGO'] = 'HELLO,MSCCL,WORLD' 32 | msccl.init('ndv4', 16, (msccl.Collective.alltoall, '35MB')) 33 | out, err = capsys.readouterr() 34 | assert 'ndv4_alltoall' in out 35 | assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'HELLO,MSCCL,WORLD' 36 | 37 | 38 | def test_register_plan(): 39 | @register_synthesis_plan('allgather', 'fancy_machine', sizes=(0, '4MB')) 40 | def dummy_plan(m, s): 41 | pass 42 | 43 | @register_synthesis_plan('allgather', ['m1', 'm2'], sizes=[(0, '4MB'), ('1GiB', None)]) 44 | def dummy_plan(m, s): 45 | pass 46 | -------------------------------------------------------------------------------- /tests/test_distributors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
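The autosynth entry point exercised in tests/test_autosynth.py above can also be called directly; a sketch with the same arguments as the test:

    import msccl

    # Select a registered plan for the machine/collective/size combination and
    # point MSCCL_CONFIG plus the NCCL environment variables at it.
    msccl.init('ndv2', 2, ('alltoall', '1MB'))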
3 | 4 | from .common import * 5 | from msccl.topologies import fully_connected, ring, distributed_fully_connected 6 | from msccl.collectives import alltoall 7 | from msccl.instance import Instance 8 | from msccl.path_encoding import PathEncoding 9 | from msccl.distributors import * 10 | 11 | 12 | def test_greedy_alltoall(): 13 | num_nodes = 2 14 | num_copies = 2 15 | local_topo = fully_connected(num_nodes) 16 | encoding = PathEncoding(local_topo, alltoall(num_nodes)) 17 | local_algo = encoding.solve(Instance(1)) 18 | dist_topo = distributed_fully_connected(local_topo, num_copies, remote_bw=1) 19 | dist_algo = synthesize_greedy_distributed_alltoall(dist_topo, local_algo) 20 | dist_algo.check_implements(alltoall(num_nodes * num_copies)) 21 | 22 | def test_alltoall_subproblem(): 23 | num_nodes = 2 24 | num_copies = 2 25 | local_topo = ring(num_nodes) 26 | sub_coll, sub_topo = make_alltoall_subproblem_collective_and_topology(local_topo, num_copies, [0]) 27 | encoding = PathEncoding(sub_topo, sub_coll) 28 | sub_algo = encoding.solve(Instance(3, extra_rounds=1)) 29 | dist_algo = synthesize_alltoall_subproblem(sub_algo, num_copies) 30 | dist_algo.check_implements(alltoall(num_nodes * num_copies)) 31 | -------------------------------------------------------------------------------- /tests/test_path_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.path_encoding import PathEncoding 5 | from msccl.topologies import fully_connected, line, dgx1 6 | from msccl.collectives import * 7 | from msccl.instance import Instance 8 | 9 | def test_fc_noncombining(): 10 | num_nodes = 2 11 | enc = PathEncoding(fully_connected(num_nodes), allgather(num_nodes)) 12 | assert enc.solve(Instance(1, chunks=2)) == None 13 | assert enc.solve(Instance(2, chunks=2)) != None 14 | 15 | def test_fc_combining_reducible(): 16 | num_nodes = 2 17 | enc = PathEncoding(fully_connected(num_nodes), reduce_scatter(num_nodes)) 18 | assert enc.solve(Instance(1, chunks=2)) == None 19 | assert enc.solve(Instance(2, chunks=2)) != None 20 | 21 | def test_fc_combining_nonreducible(): 22 | num_nodes = 2 23 | enc = PathEncoding(fully_connected(num_nodes), allreduce(num_nodes)) 24 | assert enc.solve(Instance(1, chunks=2)) == None 25 | assert enc.solve(Instance(2, chunks=2)) != None 26 | 27 | def test_dgx1_noncombining(): 28 | topo = dgx1() 29 | enc = PathEncoding(topo, allgather(topo.num_nodes())) 30 | assert enc.solve(Instance(1)) == None 31 | assert enc.solve(Instance(2)) != None 32 | 33 | def test_dgx1_combining_reducible(): 34 | topo = dgx1() 35 | enc = PathEncoding(topo, reduce_scatter(topo.num_nodes())) 36 | assert enc.solve(Instance(1)) == None 37 | assert enc.solve(Instance(2)) != None 38 | 39 | def test_dgx1_combining_nonreducible(): 40 | topo = dgx1() 41 | enc = PathEncoding(topo, allreduce(topo.num_nodes())) 42 | assert enc.solve(Instance(1)) == None 43 | assert enc.solve(Instance(2)) != None 44 | 45 | def test_memory_constraint(): 46 | topo = line(3) 47 | enc = PathEncoding(topo, alltoall(topo.num_nodes())) 48 | assert enc.solve(Instance(2, extra_memory=0)) == None 49 | assert enc.solve(Instance(2, extra_memory=1)) != None 50 | -------------------------------------------------------------------------------- /tests/test_programs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
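The PathEncoding pattern from tests/test_path_encoding.py above, as a standalone sketch (assumes z3-solver is installed):

    from msccl.path_encoding import PathEncoding
    from msccl.topologies import dgx1
    from msccl.collectives import allgather
    from msccl.instance import Instance

    topo = dgx1()
    enc = PathEncoding(topo, allgather(topo.num_nodes()))
    assert enc.solve(Instance(1)) == None  # one step is provably too few on DGX-1
    assert enc.solve(Instance(2)) != None  # two steps suffice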
3 | 4 | import msccl 5 | from msccl.topologies import fully_connected 6 | from msccl.language.collectives import * 7 | import os 8 | import pytest 9 | 10 | def test_registered_alltoall_yifan(): 11 | from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical 12 | 13 | num_nodes = 4 14 | gpus_per_node = 8 15 | num_ranks = num_nodes * gpus_per_node 16 | topology = fully_connected(num_ranks) 17 | collective = AllToAll(num_ranks, 1, inplace=False) 18 | with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): 19 | alltoall_hierarchical(num_nodes, gpus_per_node) 20 | assert Check() 21 | 22 | def test_registered_alltoall_8kp1(): 23 | from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step 24 | 25 | num_nodes = 9 26 | gpus_per_node = 8 27 | num_ranks = num_nodes * gpus_per_node 28 | topology = fully_connected(num_ranks) 29 | collective = AllToAll(num_ranks, 1, inplace=False) 30 | with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): 31 | alltoall_three_step(num_nodes, gpus_per_node) 32 | assert Check() 33 | XML() 34 | 35 | def test_registered_allreduce_ring(): 36 | from msccl.programs.allreduce_a100_ring import allreduce_ring 37 | 38 | num_ranks = 8 39 | instances = 4 40 | topology = fully_connected(num_ranks) 41 | collective = AllReduce(num_ranks, num_ranks, inplace=True) 42 | with MSCCLProgram(f"allreduce_ring", topology, collective, instances, 43 | protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): 44 | allreduce_ring(num_ranks, num_ranks) 45 | assert Check() 46 | XML() 47 | 48 | def test_registered_allreduce_allpairs(): 49 | from msccl.programs.allreduce_allpairs import allreduce_allpairs 50 | 51 | num_ranks = 8 52 | instances = 2 53 | topology = fully_connected(num_ranks) 54 | collective = AllReduce(num_ranks, num_ranks*num_ranks, inplace=True) 55 | with MSCCLProgram(f"allreduce_allpairs", topology, collective, instances, 56 | protocol="LL", threadblock_policy=ThreadblockPolicy.manual): 57 | allreduce_allpairs(num_ranks) 58 | assert Check() 59 | XML() 60 | 61 | def test_registered_ndv4_allreduce(capsys): 62 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (512, 1024))) 63 | out, err = capsys.readouterr() 64 | assert 'ndv4_allpairs_allreduce_config1 with LL protocol' in out 65 | 66 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (82944, 458752))) 67 | out, err = capsys.readouterr() 68 | assert 'ndv4_allpairs_allreduce_config2 with LL protocol' in out 69 | 70 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (458752, 2129920))) 71 | out, err = capsys.readouterr() 72 | assert 'ndv4_ring_allreduce_config1 with LL protocol' in out 73 | 74 | msccl.init('ndv4', 1, (msccl.Collective.allreduce, (2129920, 22806528))) 75 | out, err = capsys.readouterr() 76 | assert 'ndv4_ring_allreduce_config2 with LL128 protocol' in out 77 | 78 | 79 | def test_registered_ndv4_alltoall(capsys): 80 | msccl.init('ndv4', 8, (msccl.Collective.alltoall, ('1MB', '32MB'))) 81 | out, err = capsys.readouterr() 82 | assert 'ndv4_alltoall_hierarchical_config1 with LL128 protocol' in out 83 | 84 | msccl.init('ndv4', 8, (msccl.Collective.alltoall, ('32MB', '64MB'))) 85 | out, err = capsys.readouterr() 86 | assert 'ndv4_alltoall_hierarchical_config2 with Simple protocol' in out 87 | 88 | # msccl.init('ndv4', 64, (msccl.Collective.alltoall, ('32MB', '64MB'))) 89 | # out, err = capsys.readouterr() 90 | # assert 'ndv4_alltoall_three_step with Simple protocol' in out 91 | -------------------------------------------------------------------------------- 
/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .common import * 5 | from msccl.serialization import MSCCLEncoder, MSCCLDecoder 6 | from msccl.algorithm import Algorithm, Step 7 | from msccl.topologies import fully_connected 8 | from msccl.instance import Instance 9 | 10 | def test_algorithm_roundtrip(): 11 | name = 'test_algorithm' 12 | num_nodes = 2 13 | collective = null_collective(num_nodes) 14 | topo = fully_connected(num_nodes) 15 | steps = [Step(1,[(0,0,1)]),Step(2,[(1,1,0),(1,0,1)]),Step(1,[(0,1,0)])] 16 | instance = Instance(3, pipeline=2) 17 | algo1 = Algorithm(name, collective, topo, instance, steps) 18 | json = MSCCLEncoder().encode(algo1) 19 | assert json != None 20 | 21 | algo2 = MSCCLDecoder().decode(json) 22 | assert algo2.name == name 23 | assert algo2.instance == instance 24 | assert algo2.steps == steps 25 | -------------------------------------------------------------------------------- /tests/test_topologies.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from msccl.topologies import * 5 | 6 | def test_local_topologies(): 7 | assert hub_and_spoke(4) != None 8 | assert fully_connected(6) != None 9 | assert ring(3) != None 10 | assert line(5) != None 11 | assert star(6) != None 12 | assert dgx1() != None 13 | assert amd4() != None 14 | assert amd8() != None 15 | 16 | def test_distributed_topologies(): 17 | assert distributed_fully_connected(ring(4), 2, 1) != None 18 | assert distributed_hub_and_spoke(star(6), 4, 2) != None 19 | 20 | def test_transformers(): 21 | assert binarize_topology(dgx1()) != None 22 | assert reverse_topology(dgx1()) != None 23 | 24 | def test_nvlink_only(): 25 | dgx1_topo = ''' GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_2 mlx5_1 mlx5_3 CPU Affinity 26 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX SYS PHB SYS 0-19,40-59 27 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX SYS PHB SYS 0-19,40-59 28 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB SYS PIX SYS 0-19,40-59 29 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB SYS PIX SYS 0-19,40-59 30 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS PIX SYS PHB 20-39,60-79 31 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS PIX SYS PHB 20-39,60-79 32 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS PHB SYS PIX 20-39,60-79 33 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS PHB SYS PIX 20-39,60-79 34 | mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X SYS PHB SYS 35 | mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS X SYS PHB 36 | mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB SYS X SYS 37 | mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS PHB SYS X 38 | 39 | Legend: 40 | 41 | X = Self 42 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 43 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 44 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 45 | PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) 46 | PIX = Connection traversing a single PCIe switch 47 | NV# = Connection traversing a bonded set of # NVLinks''' 48 | topo = nvlink_only(dgx1_topo) 49 | assert topo != None 50 | assert topo.num_nodes() == 8 51 | --------------------------------------------------------------------------------
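Putting the pieces together, a minimal end-to-end sketch that synthesizes a step-optimal allgather for DGX-1 and serializes it (assumes z3-solver is installed; the output filename is illustrative):

    from msccl.topologies import dgx1
    from msccl.collectives import allgather
    from msccl.strategies import solve_least_steps
    from msccl.serialization import save_msccl_object

    topo = dgx1()
    algo = solve_least_steps(topo, allgather(topo.num_nodes()), logging=True)
    save_msccl_object(algo, 'allgather_dgx1.msccl.json')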