├── Dockerfile
├── LICENSE
├── README.md
├── nccl-tests
│   ├── .gitignore
│   ├── LICENSE.txt
│   ├── Makefile
│   ├── README.md
│   ├── build
│   │   ├── all_gather_perf
│   │   ├── all_reduce_perf
│   │   ├── alltoall_perf
│   │   ├── broadcast_perf
│   │   ├── gather_perf
│   │   ├── hypercube_perf
│   │   ├── reduce_perf
│   │   ├── reduce_scatter_perf
│   │   ├── scatter_perf
│   │   ├── sendrecv_perf
│   │   ├── timer.o
│   │   └── verifiable
│   │       └── verifiable.o
│   ├── doc
│   │   └── PERFORMANCE.md
│   ├── src
│   │   ├── Makefile
│   │   ├── all_gather.cu
│   │   ├── all_reduce.cu
│   │   ├── alltoall.cu
│   │   ├── broadcast.cu
│   │   ├── common.cu
│   │   ├── common.h
│   │   ├── gather.cu
│   │   ├── hypercube.cu
│   │   ├── nccl1_compat.h
│   │   ├── reduce.cu
│   │   ├── reduce_scatter.cu
│   │   ├── scatter.cu
│   │   ├── sendrecv.cu
│   │   ├── timer.cc
│   │   └── timer.h
│   └── verifiable
│       ├── Makefile
│       ├── inexact_regress.cu
│       ├── verifiable.cu
│       ├── verifiable.h
│       └── verifiable.mk
├── sources.list
├── sshd_config
└── start.sh

/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:23.11-py3
2 | 
3 | WORKDIR /
4 | 
5 | # Use a China-local apt mirror (sources.list); comment this out if you are outside China
6 | COPY sources.list /etc/apt/sources.list
7 | RUN apt-get update && \
8 |     apt-get install -y openssh-server vim curl inetutils-ping net-tools telnet lsof
9 | 
10 | COPY start.sh /start.sh
11 | COPY sshd_config /etc/ssh/sshd_config
12 | COPY nccl-tests /nccl-tests
13 | 
14 | CMD ["/bin/bash", "start.sh"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Build-NCCL-Tests-With-PyTorch 2 | 3 | ![license](https://img.shields.io/hexpm/l/plug.svg) 4 | [![docker](https://img.shields.io/docker/pulls/mayooot/nccl-tests-with-pytorch.svg)](https://hub.docker.com/r/mayooot/nccl-tests-with-pytorch) 5 | 6 | # Overview 7 | 8 | Build [NCCL-Tests](https://github.com/NVIDIA/nccl-tests) and configure SSHD in PyTorch container to help you test NCCL 9 | faster! 
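Once the container is running (see Usage below), a quick single-node smoke test confirms the bundled binaries work. A minimal sketch, assuming the container is named `foo` and 8 GPUs are visible:

~~~shell
docker exec -it foo /nccl-tests/build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
~~~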
10 | 
11 | NGC PyTorch image version: 23.11
12 | 
13 | # Quick Start
14 | 
15 | ~~~shell
16 | docker pull mayooot/nccl-tests-with-pytorch:v0.0.1
17 | ~~~
18 | 
19 | # Build From Source
20 | 
21 | ~~~shell
22 | git clone https://github.com/mayooot/build-nccl-tests-with-pytorch
23 | cd build-nccl-tests-with-pytorch
24 | 
25 | docker build -t nccl-tests-with-pytorch:latest .
26 | ~~~
27 | 
28 | # Usage
29 | 
30 | `PORT` and `PASS` both default to 12345; you can override them with `-e`.
31 | 
32 | You also need to mount the host's `id_rsa` and `id_rsa.pub` into the container.
33 | 
34 | ~~~shell
35 | docker run --name foo \
36 |     -d -it \
37 |     --network=host \
38 |     -e PORT=1998 -e PASS=P@88w0rd \
39 |     -v /tmp/id_rsa:/root/.ssh/id_rsa \
40 |     -v /tmp/id_rsa.pub:/root/.ssh/id_rsa.pub \
41 |     --gpus all --shm-size=1g \
42 |     --cap-add=IPC_LOCK --device=/dev/infiniband \
43 |     mayooot/nccl-tests-with-pytorch:v0.0.1
44 | ~~~
45 | 
46 | The NCCL-Tests code and executables are located in `/nccl-tests`. The following shows how to use them,
47 | taking `all_reduce_perf` as an example.
48 | 
49 | Before running `all_reduce_perf`, you need to set up passwordless SSH between all nodes.
50 | 
51 | ~~~shell
52 | ssh-copy-id -p 1998 root@all_cluster_ip
53 | ~~~
54 | 
55 | Replace `--host cluster_ip1,cluster_ip2,...` with the real cluster IP addresses.
56 | 
57 | ~~~shell
58 | docker exec -it foo bash
59 | 
60 | cd /nccl-tests
61 | 
62 | mpirun --allow-run-as-root \
63 |     -mca plm_rsh_args "-p 1998" \
64 |     -x NCCL_DEBUG=INFO \
65 |     -x NCCL_IB_HCA=mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 \
66 |     --host cluster_ip1,cluster_ip2,... \
67 |     ./build/all_reduce_perf \
68 |     -b 1G -e 4G -f 2 -g 8
69 | ~~~
70 | 
71 | # Contribute
72 | 
73 | Feel free to open issues and pull requests. Any feedback is highly appreciated!
--------------------------------------------------------------------------------
/nccl-tests/.gitignore:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # See LICENSE.txt for license information
4 | /build
5 | 
--------------------------------------------------------------------------------
/nccl-tests/LICENSE.txt:
--------------------------------------------------------------------------------
1 | 
2 | Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 |  * Redistributions of source code must retain the above copyright
8 |    notice, this list of conditions and the following disclaimer.
9 |  * Redistributions in binary form must reproduce the above copyright
10 |    notice, this list of conditions and the following disclaimer in the
11 |    documentation and/or other materials provided with the distribution.
12 |  * Neither the name of NVIDIA CORPORATION, nor the names of their
13 |    contributors may be used to endorse or promote products derived
14 |    from this software without specific prior written permission.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | 
--------------------------------------------------------------------------------
/nccl-tests/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # See LICENSE.txt for license information
5 | #
6 | 
7 | BUILDDIR ?= build
8 | override BUILDDIR := $(abspath $(BUILDDIR))
9 | 
10 | .PHONY: all clean
11 | 
12 | default: src.build
13 | 
14 | TARGETS=src
15 | 
16 | all:   ${TARGETS:%=%.build}
17 | clean: ${TARGETS:%=%.clean}
18 | 
19 | %.build:
20 | 	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
21 | 
22 | %.clean:
23 | 	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
24 | 
--------------------------------------------------------------------------------
/nccl-tests/README.md:
--------------------------------------------------------------------------------
1 | # NCCL Tests
2 | 
3 | These tests check both the performance and the correctness of [NCCL](http://github.com/nvidia/nccl) operations.
4 | 
5 | ## Build
6 | 
7 | To build the tests, just type `make`.
8 | 
9 | If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME.
10 | 
11 | ```shell
12 | $ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
13 | ```
14 | 
15 | NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
16 | 
17 | ```shell
18 | $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
19 | ```
20 | 
21 | ## Usage
22 | 
23 | NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
24 | 
25 | ### Quick examples
26 | 
27 | Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
28 | ```shell
29 | $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
30 | ```
31 | 
32 | Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
33 | ```shell
34 | $ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
35 | ```
36 | 
37 | ### Performance
38 | 
39 | See the [Performance](doc/PERFORMANCE.md) page for an explanation of the numbers, and in particular the "busbw" column.
40 | 
41 | ### Arguments
42 | 
43 | All tests support the same set of arguments :
44 | 
45 | * Number of GPUs
46 |   * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
47 |   * `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
48 | * Sizes to scan
49 |   * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
50 |   * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
51 |   * Increments can be either fixed or a multiplication factor. 
Only one of those should be used.
52 |   * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
53 |   * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
54 | * NCCL operations arguments
55 |   * `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
56 |   * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
57 |   * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
58 | * Performance
59 |   * `-n,--iters <iteration count>` number of iterations. Default : 20.
60 |   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
61 |   * `-m,--agg_iters <aggregated iteration count>` number of operations to aggregate together in each iteration. Default : 1.
62 |   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
63 | * Test operation
64 |   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
65 |   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
66 |   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
67 |   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay the specified number of times. Default : 0.
68 | 
69 | ## Copyright
70 | 
71 | NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
72 | 
73 | 
--------------------------------------------------------------------------------
/nccl-tests/build/all_gather_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/all_gather_perf
--------------------------------------------------------------------------------
/nccl-tests/build/all_reduce_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/all_reduce_perf
--------------------------------------------------------------------------------
/nccl-tests/build/alltoall_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/alltoall_perf
--------------------------------------------------------------------------------
/nccl-tests/build/broadcast_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/broadcast_perf
--------------------------------------------------------------------------------
/nccl-tests/build/gather_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/gather_perf
--------------------------------------------------------------------------------
/nccl-tests/build/hypercube_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/hypercube_perf
--------------------------------------------------------------------------------
/nccl-tests/build/reduce_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/reduce_perf
--------------------------------------------------------------------------------
/nccl-tests/build/reduce_scatter_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/reduce_scatter_perf
--------------------------------------------------------------------------------
/nccl-tests/build/scatter_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/scatter_perf
--------------------------------------------------------------------------------
/nccl-tests/build/sendrecv_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/sendrecv_perf
--------------------------------------------------------------------------------
/nccl-tests/build/timer.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/timer.o
--------------------------------------------------------------------------------
/nccl-tests/build/verifiable/verifiable.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/verifiable/verifiable.o
--------------------------------------------------------------------------------
/nccl-tests/doc/PERFORMANCE.md:
--------------------------------------------------------------------------------
1 | # Performance reported by NCCL tests
2 | 
3 | NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
4 | 
5 | # Time
6 | 
7 | Time is useful with small sizes, to measure the constant overhead (or latency) associated with operations.
8 | 
9 | On large sizes, the time becomes linear with the size (since it is roughly equal to overhead + size / bw), so it no longer measures just the latency but
10 | mostly the size divided by the bandwidth.
11 | 
12 | Therefore, on large sizes, it makes more sense to look at the bandwidth.
13 | 
14 | # Bandwidth
15 | 
16 | ## Algorithm bandwidth
17 | 
18 | Algorithm bandwidth uses the most common formula for bandwidth : size (_S_) / time (_t_). It is useful for computing how much time any large operation would take: simply divide the size of the operation by the algorithm bandwidth.
19 | 
20 | `algbw = S/t`
21 | 
22 | ## Bus bandwidth
23 | 
24 | While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful for measuring the speed of collective operations, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth and usually depends on the number of ranks.
25 | Most benchmarks only provide time measurements, which are hard to interpret for large sizes. Others also provide algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
26 | 
27 | To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
28 | This number is obtained by applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
29 | This bus bandwidth can be compared with the hardware peak bandwidth, independently of the number of ranks used.
30 | 
31 | The formula depends on the collective operation.
32 | 
33 | ### AllReduce
34 | 
35 | An allreduce operation, for each element of the N arrays (input i_X and output o_X, each situated on rank X), performs the following operation :
36 | 
37 | `o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
38 | 
39 | **Note : this is independent of the algorithm used (ring, tree, or other) as long as it uses point-to-point operations (send/receive).**
40 | 
41 | A ring would do that operation in an order which follows the ring :
42 | 
43 | `i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
44 | 
45 | A tree would do it hierarchically :
46 | 
47 | `(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
48 | 
49 | In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output),
50 | we need 2(n-1) data transfers (times the number of elements) to perform an allReduce operation.
51 | 
52 | Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
53 | 
54 | `t = (S*2*(n-1)) / (n*B)`
55 | 
56 | Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
57 | Reordering the equation, we find that
58 | 
59 | `t = (S/B) * (2*(n-1)/n)`
60 | 
61 | Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute :
62 | 
63 | `B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
(For example, with n = 8 ranks the correction factor is 2*(8-1)/8 = 1.75, so busbw = 1.75 * algbw.)
64 | 
65 | ### ReduceScatter
66 | 
67 | The ReduceScatter operation only requires performing the addition part of the allReduce operation :
68 | 
69 | `o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
70 | 
71 | With K being the rank which gets the final result (K = offset/recvsize).
72 | 
73 | The perfect reduceScatter time with a rank bandwidth of B would therefore be :
74 | 
75 | `t = S*(n-1) / (B*n)`
76 | 
77 | And the Bus Bandwidth is therefore computed as :
78 | 
79 | `B = S/t * (n-1)/n = algbw * (n-1)/n`
80 | 
81 | Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
82 | 
83 | ### AllGather
84 | 
85 | The AllGather operation only requires performing the assignment part of the allReduce operation :
86 | 
87 | `o_0 = o_1 = o_2 = ... 
= o_{n-1} = i_K`
88 | 
89 | With K being the rank where the data originates from (K = offset/sendsize).
90 | 
91 | The perfect allGather time with a rank bandwidth of B would therefore be :
92 | 
93 | `t = S*(n-1) / (B*n)`
94 | 
95 | And the Bus Bandwidth is therefore computed as :
96 | 
97 | `B = S/t * (n-1)/n = algbw * (n-1)/n`
98 | 
99 | Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
100 | 
101 | ### Broadcast
102 | 
103 | The broadcast operation representation is similar to allGather :
104 | 
105 | `o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
106 | 
107 | R being the root of the operation.
108 | 
109 | However, in this case, since the i_R input is not evenly distributed on the ranks, we cannot use all N links to perform the transfer operations.
110 | Indeed, *all* data has to get out of the root rank, hence the bottleneck is on the root rank which only has B as capacity to get data out :
111 | 
112 | `t = S/B`
113 | 
114 | And :
115 | 
116 | `B = S/t`
117 | 
118 | ### Reduce
119 | 
120 | The reduce operation performs :
121 | 
122 | `o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
123 | 
124 | R being the root of the operation.
125 | 
126 | Similarly to broadcast, all data needs to be sent to the root, hence :
127 | 
128 | `t = S/B`
129 | 
130 | And :
131 | 
132 | `B = S/t`
133 | 
134 | ### Summary
135 | 
136 | To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth :
137 | 
138 | * AllReduce : 2*(_n_-1)/_n_
139 | * ReduceScatter : (_n_-1)/_n_
140 | * AllGather : (_n_-1)/_n_
141 | * Broadcast : 1
142 | * Reduce : 1
143 | 
144 | The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
145 | 
--------------------------------------------------------------------------------
/nccl-tests/src/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # See LICENSE.txt for license information
5 | #
6 | 
7 | CUDA_HOME ?= /usr/local/cuda
8 | PREFIX ?= /usr/local
9 | VERBOSE ?= 0
10 | DEBUG ?= 0
11 | 
12 | CUDA_LIB ?= $(CUDA_HOME)/lib64
13 | CUDA_INC ?= $(CUDA_HOME)/include
14 | NVCC ?= $(CUDA_HOME)/bin/nvcc
15 | CUDARTLIB ?= cudart
16 | 
17 | CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
18 | CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
19 | 
20 | # Better define NVCC_GENCODE in your environment to the minimal set
21 | # of archs to reduce compile time.
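# For example (an illustrative invocation, not a project default), to build
# only for sm_80 (A100-class) GPUs:
#   make NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"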
22 | ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) 23 | NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ 24 | -gencode=arch=compute_61,code=sm_61 \ 25 | -gencode=arch=compute_70,code=sm_70 \ 26 | -gencode=arch=compute_80,code=sm_80 \ 27 | -gencode=arch=compute_80,code=compute_80 28 | else 29 | NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ 30 | -gencode=arch=compute_50,code=sm_50 \ 31 | -gencode=arch=compute_60,code=sm_60 \ 32 | -gencode=arch=compute_61,code=sm_61 \ 33 | -gencode=arch=compute_70,code=sm_70 \ 34 | -gencode=arch=compute_70,code=compute_70 35 | endif 36 | 37 | NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 38 | CXXFLAGS := -std=c++11 39 | 40 | LDFLAGS := -L${CUDA_LIB} -lcudart -lrt 41 | NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt 42 | 43 | ifeq ($(DEBUG), 0) 44 | NVCUFLAGS += -O3 -g 45 | CXXFLAGS += -O3 -g 46 | else 47 | NVCUFLAGS += -O0 -G -g 48 | CXXFLAGS += -O0 -g -ggdb3 49 | endif 50 | 51 | ifneq ($(VERBOSE), 0) 52 | NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter 53 | else 54 | .SILENT: 55 | endif 56 | 57 | .PHONY: build clean 58 | 59 | BUILDDIR ?= ../build 60 | ifneq ($(NCCL_HOME), "") 61 | NVCUFLAGS += -I$(NCCL_HOME)/include/ 62 | NVLDFLAGS += -L$(NCCL_HOME)/lib 63 | endif 64 | 65 | ifeq ($(MPI), 1) 66 | NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include 67 | NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi 68 | endif 69 | ifeq ($(MPI_IBM),1) 70 | NVCUFLAGS += -DMPI_SUPPORT 71 | NVLDFLAGS += -lmpi_ibm 72 | endif 73 | LIBRARIES += nccl 74 | NVLDFLAGS += $(LIBRARIES:%=-l%) 75 | 76 | DST_DIR := $(BUILDDIR) 77 | SRC_FILES := $(wildcard *.cu) 78 | OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) 79 | BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube 80 | BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) 81 | 82 | build: ${BIN_FILES} 83 | 84 | clean: 85 | rm -rf ${DST_DIR} 86 | 87 | TEST_VERIFIABLE_SRCDIR := ../verifiable 88 | TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable 89 | include ../verifiable/verifiable.mk 90 | 91 | ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) 92 | @printf "Compiling %-35s > %s\n" $< $@ 93 | @mkdir -p ${DST_DIR} 94 | $(NVCC) -o $@ $(NVCUFLAGS) -c $< 95 | 96 | ${DST_DIR}/timer.o: timer.cc timer.h 97 | @printf "Compiling %-35s > %s\n" $< $@ 98 | @mkdir -p ${DST_DIR} 99 | $(CXX) $(CXXFLAGS) -o $@ -c timer.cc 100 | 101 | ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) 102 | @printf "Linking %-35s > %s\n" $< $@ 103 | @mkdir -p ${DST_DIR} 104 | $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} 105 | 106 | -------------------------------------------------------------------------------- /nccl-tests/src/all_gather.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "cuda_runtime.h" 8 | #include "common.h" 9 | 10 | #define ALIGN 4 11 | 12 | void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { 13 | size_t base = (count/(ALIGN*nranks))*ALIGN; 14 | *sendcount = base; 15 | *recvcount = base*nranks; 16 | *sendInplaceOffset = base; 17 | *recvInplaceOffset = 0; 18 | *paramcount = base; 19 | } 20 | 21 | testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { 22 | size_t sendcount = args->sendBytes / wordSize(type); 23 | size_t recvcount = args->expectedBytes / wordSize(type); 24 | int nranks = args->nProcs*args->nThreads*args->nGpus; 25 | 26 | for (int i=0; inGpus; i++) { 27 | CUDACHECK(cudaSetDevice(args->gpus[i])); 28 | int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); 29 | CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); 30 | void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; 31 | TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); 32 | for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); 34 | } 35 | CUDACHECK(cudaDeviceSynchronize()); 36 | } 37 | return testSuccess; 38 | } 39 | 40 | void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { 41 | double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; 42 | 43 | *algBw = baseBw; 44 | double factor = ((double)(nranks - 1))/((double)nranks); 45 | *busBw = baseBw * factor; 46 | } 47 | 48 | testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 49 | NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); 50 | return testSuccess; 51 | } 52 | 53 | struct testColl allGatherTest = { 54 | "AllGather", 55 | AllGatherGetCollByteCount, 56 | AllGatherInitData, 57 | AllGatherGetBw, 58 | AllGatherRunColl 59 | }; 60 | 61 | void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { 62 | size_t paramcount, sendInplaceOffset, recvInplaceOffset; 63 | AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); 64 | } 65 | 66 | testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { 67 | args->collTest = &allGatherTest; 68 | ncclDataType_t *run_types; 69 | const char **run_typenames; 70 | int type_count; 71 | 72 | if ((int)type != -1) { 73 | type_count = 1; 74 | run_types = &type; 75 | run_typenames = &typeName; 76 | } else { 77 | type_count = test_typenum; 78 | run_types = test_types; 79 | run_typenames = test_typenames; 80 | } 81 | 82 | for (int i=0; isendBytes / wordSize(type); 20 | size_t recvcount = args->expectedBytes / wordSize(type); 21 | int nranks = args->nProcs*args->nThreads*args->nGpus; 22 | 23 | for (int i=0; inGpus; i++) { 24 | CUDACHECK(cudaSetDevice(args->gpus[i])); 25 | int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); 26 | CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); 27 | void* data = in_place ? 
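/* in-place allreduce: the input is staged directly in recvbuff */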
args->recvbuffs[i] : args->sendbuffs[i]; 28 | TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); 29 | TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); 30 | CUDACHECK(cudaDeviceSynchronize()); 31 | } 32 | return testSuccess; 33 | } 34 | 35 | void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { 36 | double baseBw = (double)(count * typesize) / 1.0E9 / sec; 37 | 38 | *algBw = baseBw; 39 | double factor = ((double)(2*(nranks - 1)))/((double)nranks); 40 | *busBw = baseBw * factor; 41 | } 42 | 43 | testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 44 | NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); 45 | return testSuccess; 46 | } 47 | 48 | struct testColl allReduceTest = { 49 | "AllReduce", 50 | AllReduceGetCollByteCount, 51 | AllReduceInitData, 52 | AllReduceGetBw, 53 | AllReduceRunColl 54 | }; 55 | 56 | void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { 57 | size_t paramcount, sendInplaceOffset, recvInplaceOffset; 58 | AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); 59 | } 60 | 61 | testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { 62 | args->collTest = &allReduceTest; 63 | ncclDataType_t *run_types; 64 | ncclRedOp_t *run_ops; 65 | const char **run_typenames, **run_opnames; 66 | int type_count, op_count; 67 | 68 | if ((int)type != -1) { 69 | type_count = 1; 70 | run_types = &type; 71 | run_typenames = &typeName; 72 | } else { 73 | type_count = test_typenum; 74 | run_types = test_types; 75 | run_typenames = test_typenames; 76 | } 77 | 78 | if ((int)op != -1) { 79 | op_count = 1; 80 | run_ops = &op; 81 | run_opnames = &opName; 82 | } else { 83 | op_count = test_opnum; 84 | run_ops = test_ops; 85 | run_opnames = test_opnames; 86 | } 87 | 88 | for (int i=0; isendBytes / wordSize(type); 20 | size_t recvcount = args->expectedBytes / wordSize(type); 21 | int nranks = args->nProcs*args->nThreads*args->nGpus; 22 | 23 | for (int i=0; inGpus; i++) { 24 | CUDACHECK(cudaSetDevice(args->gpus[i])); 25 | int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); 26 | CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); 27 | void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; 28 | TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); 29 | for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); 32 | } 33 | CUDACHECK(cudaDeviceSynchronize()); 34 | } 35 | // We don't support in-place alltoall 36 | args->reportErrors = in_place ? 
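/* in-place alltoall is unsupported (see comment above), so result checking is disabled */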
0 : 1; 37 | return testSuccess; 38 | } 39 | 40 | void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { 41 | double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; 42 | 43 | *algBw = baseBw; 44 | double factor = ((double)(nranks-1))/((double)(nranks)); 45 | *busBw = baseBw * factor; 46 | } 47 | 48 | testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 49 | int nRanks; 50 | NCCLCHECK(ncclCommCount(comm, &nRanks)); 51 | size_t rankOffset = count * wordSize(type); 52 | 53 | #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 54 | printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); 55 | return testNcclError; 56 | #else 57 | NCCLCHECK(ncclGroupStart()); 58 | for (int r=0; rcollTest = &alltoAllTest; 82 | ncclDataType_t *run_types; 83 | const char **run_typenames; 84 | int type_count; 85 | 86 | if ((int)type != -1) { 87 | type_count = 1; 88 | run_types = &type; 89 | run_typenames = &typeName; 90 | } else { 91 | type_count = test_typenum; 92 | run_types = test_types; 93 | run_typenames = test_typenames; 94 | } 95 | 96 | for (int i=0; isendBytes / wordSize(type); 20 | size_t recvcount = args->expectedBytes / wordSize(type); 21 | 22 | for (int i=0; inGpus; i++) { 23 | CUDACHECK(cudaSetDevice(args->gpus[i])); 24 | int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); 25 | CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); 26 | void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; 27 | if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); 28 | TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0)); 29 | CUDACHECK(cudaDeviceSynchronize()); 30 | } 31 | return testSuccess; 32 | } 33 | 34 | void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { 35 | double baseBw = (double)(count * typesize) / 1.0E9 / sec; 36 | 37 | *algBw = baseBw; 38 | double factor = 1; 39 | *busBw = baseBw * factor; 40 | } 41 | 42 | testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 43 | int rank; 44 | NCCLCHECK(ncclCommUserRank(comm, &rank)); 45 | #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 46 | NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream)); 47 | #else 48 | if (rank == root) { 49 | NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream)); 50 | } else { 51 | NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream)); 52 | } 53 | #endif 54 | return testSuccess; 55 | } 56 | 57 | struct testColl broadcastTest = { 58 | "Broadcast", 59 | BroadcastGetCollByteCount, 60 | BroadcastInitData, 61 | BroadcastGetBw, 62 | BroadcastRunColl 63 | }; 64 | 65 | void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { 66 | size_t paramcount, sendInplaceOffset, recvInplaceOffset; 67 | BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); 68 | } 69 | 70 | testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { 71 | args->collTest = &broadcastTest; 72 | ncclDataType_t *run_types; 73 | const char **run_typenames; 74 | int type_count; 75 | int begin_root, end_root; 76 | 
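  // If root is -1, the code below sweeps every rank as the broadcast root;
  // otherwise only the requested root is tested.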
77 | if ((int)type != -1) { 78 | type_count = 1; 79 | run_types = &type; 80 | run_typenames = &typeName; 81 | } else { 82 | type_count = test_typenum; 83 | run_types = test_types; 84 | run_typenames = test_typenames; 85 | } 86 | 87 | if (root != -1) { 88 | begin_root = end_root = root; 89 | } else { 90 | begin_root = 0; 91 | end_root = args->nProcs*args->nThreads*args->nGpus-1; 92 | } 93 | 94 | for (int i=0; i 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "cuda.h" 14 | 15 | #include "../verifiable/verifiable.h" 16 | 17 | int test_ncclVersion = 0; // init'd with ncclGetVersion() 18 | 19 | #if NCCL_MAJOR >= 2 20 | ncclDataType_t test_types[ncclNumTypes] = { 21 | ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble 22 | #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) 23 | , ncclBfloat16 24 | #endif 25 | }; 26 | const char *test_typenames[ncclNumTypes] = { 27 | "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" 28 | #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) 29 | , "bfloat16" 30 | #endif 31 | }; 32 | int test_typenum = -1; 33 | 34 | const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; 35 | ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin 36 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) 37 | , ncclAvg 38 | #endif 39 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) 40 | , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand 41 | #endif 42 | }; 43 | int test_opnum = -1; 44 | #else 45 | ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; 46 | const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; 47 | int test_typenum = 7; 48 | const char *test_opnames[] = {"sum", "prod", "max", "min"}; 49 | ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; 50 | int test_opnum = 4; 51 | #endif 52 | 53 | // For libnccl's < 2.13 54 | extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) { 55 | return ""; 56 | } 57 | 58 | int is_main_proc = 0; 59 | thread_local int is_main_thread = 0; 60 | 61 | // Command line parameter defaults 62 | static int nThreads = 1; 63 | static int nGpus = 1; 64 | static size_t minBytes = 32*1024*1024; 65 | static size_t maxBytes = 32*1024*1024; 66 | static size_t stepBytes = 1*1024*1024; 67 | static size_t stepFactor = 1; 68 | static int datacheck = 1; 69 | static int warmup_iters = 5; 70 | static int iters = 20; 71 | static int agg_iters = 1; 72 | static int ncclop = ncclSum; 73 | static int nccltype = ncclFloat; 74 | static int ncclroot = 0; 75 | static int parallel_init = 0; 76 | static int blocking_coll = 0; 77 | static int streamnull = 0; 78 | static int timeout = 0; 79 | static int cudaGraphLaunches = 0; 80 | static int report_cputime = 0; 81 | // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) 82 | static int average = 1; 83 | 84 | #define NUM_BLOCKS 32 85 | 86 | static double parsesize(const char *value) { 87 | long long int units; 88 | double size; 89 | char size_lit; 90 | 91 | int count = sscanf(value, "%lf %1s", &size, &size_lit); 92 | 93 | switch (count) { 94 | case 2: 95 | switch (size_lit) { 96 | case 'G': 97 | case 'g': 98 | units = 1024*1024*1024; 99 | break; 100 | case 'M': 101 | case 'm': 102 | units = 1024*1024; 103 | break; 104 | case 'K': 105 | case 'k': 106 | units = 1024; 
107 | break; 108 | default: 109 | return -1.0; 110 | }; 111 | break; 112 | case 1: 113 | units = 1; 114 | break; 115 | default: 116 | return -1.0; 117 | } 118 | 119 | return size * units; 120 | } 121 | 122 | testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { 123 | ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); 124 | CUDACHECK(cudaDeviceSynchronize()); 125 | return testSuccess; 126 | } 127 | 128 | testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { 129 | ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); 130 | return testSuccess; 131 | } 132 | 133 | testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { 134 | ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); 135 | return testSuccess; 136 | } 137 | 138 | void Barrier(struct threadArgs *args) { 139 | thread_local int epoch = 0; 140 | static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; 141 | static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; 142 | static int counter[2] = {0, 0}; 143 | 144 | pthread_mutex_lock(&lock[epoch]); 145 | if(++counter[epoch] == args->nThreads) 146 | pthread_cond_broadcast(&cond[epoch]); 147 | 148 | if(args->thread+1 == args->nThreads) { 149 | while(counter[epoch] != args->nThreads) 150 | pthread_cond_wait(&cond[epoch], &lock[epoch]); 151 | #ifdef MPI_SUPPORT 152 | MPI_Barrier(MPI_COMM_WORLD); 153 | #endif 154 | counter[epoch] = 0; 155 | pthread_cond_broadcast(&cond[epoch]); 156 | } 157 | else { 158 | while(counter[epoch] != 0) 159 | pthread_cond_wait(&cond[epoch], &lock[epoch]); 160 | } 161 | pthread_mutex_unlock(&lock[epoch]); 162 | epoch ^= 1; 163 | } 164 | 165 | // Inter-thread/process barrier+allreduce. The quality of the return value 166 | // for average=0 (which means broadcast from rank=0) is dubious. The returned 167 | // value will actually be the result of process-local broadcast from the local thread=0. 
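// Illustrative use (variable name assumed): double v = elapsedSec;
// Allreduce(args, &v, /*average=*/1); afterwards every thread and process
// sees the mean of v across all ranks.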
168 | template <typename T>
169 | void Allreduce(struct threadArgs* args, T* value, int average) {
170 |   thread_local int epoch = 0;
171 |   static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
172 |   static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
173 |   static T accumulator[2];
174 |   static int counter[2] = {0, 0};
175 | 
176 |   pthread_mutex_lock(&lock[epoch]);
177 |   if(counter[epoch] == 0) {
178 |     if(average != 0 || args->thread == 0) accumulator[epoch] = *value;
179 |   } else {
180 |     switch(average) {
181 |     case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break;
182 |     case /*avg*/1: accumulator[epoch] += *value; break;
183 |     case /*min*/2: accumulator[epoch] = std::min(accumulator[epoch], *value); break;
184 |     case /*max*/3: accumulator[epoch] = std::max(accumulator[epoch], *value); break;
185 |     case /*sum*/4: accumulator[epoch] += *value; break;
186 |     }
187 |   }
188 | 
189 |   if(++counter[epoch] == args->nThreads)
190 |     pthread_cond_broadcast(&cond[epoch]);
191 | 
192 |   if(args->thread+1 == args->nThreads) {
193 |     while(counter[epoch] != args->nThreads)
194 |       pthread_cond_wait(&cond[epoch], &lock[epoch]);
195 | 
196 |     #ifdef MPI_SUPPORT
197 |     if(average != 0) {
198 |       static_assert(std::is_same<T, long long>::value || std::is_same<T, double>::value, "Allreduce<T> only for T in {long long, double}");
199 |       MPI_Datatype ty = std::is_same<T, long long>::value ? MPI_LONG_LONG :
200 |                         std::is_same<T, double>::value ? MPI_DOUBLE :
201 |                         MPI_Datatype();
202 |       MPI_Op op = average == 1 ? MPI_SUM :
203 |                   average == 2 ? MPI_MIN :
204 |                   average == 3 ? MPI_MAX :
205 |                   average == 4 ? MPI_SUM : MPI_Op();
206 |       MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD);
207 |     }
208 |     #endif
209 | 
210 |     if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads;
211 |     counter[epoch] = 0;
212 |     pthread_cond_broadcast(&cond[epoch]);
213 |   }
214 |   else {
215 |     while(counter[epoch] != 0)
216 |       pthread_cond_wait(&cond[epoch], &lock[epoch]);
217 |   }
218 |   pthread_mutex_unlock(&lock[epoch]);
219 | 
220 |   *value = accumulator[epoch];
221 |   epoch ^= 1;
222 | }
223 | 
224 | testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) {
225 |   int nranks = args->nProcs*args->nGpus*args->nThreads;
226 |   size_t count = args->expectedBytes/wordSize(type);
227 | 
228 |   int64_t *wrongPerGpu = nullptr;
229 |   CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
230 | 
231 |   for (int i=0; i<args->nGpus; i++) {
232 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
233 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
234 |     void *data = in_place ? 
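/* for in-place collectives this rank's results live at its inplace offset within recvbuff */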
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; 235 | 236 | TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); 237 | 238 | #if 1 && DEBUG_PRINT 239 | if (args->reportErrors && wrongPerGpu[i] != 0) { 240 | printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]); 241 | char *expectedHost = (char*)malloc(args->expectedBytes); 242 | char *dataHost = (char*)malloc(args->expectedBytes); 243 | int eltsz = wordSize(type); 244 | cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost); 245 | cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); 246 | 247 | for(int j=0; jexpectedBytes/eltsz; j++) { 248 | unsigned long long want, got; 249 | want = 0; 250 | memcpy(&want, expectedHost + j*eltsz, eltsz); 251 | got = 0; 252 | memcpy(&got, dataHost + j*eltsz, eltsz); 253 | if(want != got) { 254 | printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got); 255 | } 256 | } 257 | free(expectedHost); 258 | free(dataHost); 259 | } 260 | #endif 261 | } 262 | 263 | *wrongElts = 0; 264 | for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i]; 265 | cudaFreeHost(wrongPerGpu); 266 | 267 | if (args->reportErrors && *wrongElts) args->errors[0]++; 268 | return testSuccess; 269 | } 270 | 271 | testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { 272 | cudaError_t cudaErr; 273 | int remaining = ngpus; 274 | int* done = (int*)malloc(sizeof(int)*ngpus); 275 | memset(done, 0, sizeof(int)*ngpus); 276 | timer tim; 277 | 278 | while (remaining) { 279 | int idle = 1; 280 | for (int i=0; i= NCCL_VERSION(2,4,0) 294 | if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { 295 | ncclResult_t ncclAsyncErr; 296 | NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); 297 | if (ncclAsyncErr != ncclSuccess) { 298 | // An asynchronous error happened. Stop the operation and destroy 299 | // the communicator 300 | for (int i=0; i timeout && timeout > 0) { 308 | for (int i=0; inbytes / wordSize(type); 331 | 332 | // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange 333 | size_t totalnbytes = max(args->sendBytes, args->expectedBytes); 334 | size_t steps = totalnbytes ? 
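/* number of distinct shifted positions that fit in the allocated maxbytes */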
testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) {
  size_t count = args->nbytes / wordSize(type);

  // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
  size_t shift = totalnbytes * (iter % steps);

  if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
  for (int i = 0; i < args->nGpus; i++) {
#ifndef NCCL_MAJOR
    CUDACHECK(cudaSetDevice(args->gpus[i]));
#endif
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
    char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
    char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
    ncclRedOp_t op;

    if(opIndex < ncclNumOps) {
      op = opIndex;
    }
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
    else {
      union {
        int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
        half f16; float f32; double f64;
#if defined(__CUDA_BF16_TYPES_EXIST__)
        __nv_bfloat16 bf16;
#endif
      };
      switch(type) {
      case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
      case ncclUint8: u8 = ncclVerifiablePremulScalar<uint8_t>(rank); break;
      case ncclInt32: i32 = ncclVerifiablePremulScalar<int32_t>(rank); break;
      case ncclUint32: u32 = ncclVerifiablePremulScalar<uint32_t>(rank); break;
      case ncclInt64: i64 = ncclVerifiablePremulScalar<int64_t>(rank); break;
      case ncclUint64: u64 = ncclVerifiablePremulScalar<uint64_t>(rank); break;
      case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
      case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
      case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
#if defined(__CUDA_BF16_TYPES_EXIST__)
      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
#endif
      }
      NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
    }
#endif

    TESTCHECK(args->collTest->runColl(
          (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
          (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
          count, type, op, root, args->comms[i], args->streams[i]));

#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
    if(opIndex >= ncclNumOps) {
      NCCLCHECK(ncclRedOpDestroy(op, args->comms[i]));
    }
#endif
  }
  if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd());

  if (blocking_coll) {
    // Complete op before returning
    TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
  }
  if (blocking_coll) Barrier(args);
  return testSuccess;
}
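// ---------------------------------------------------------------------------
// Illustrative sketch, not part of common.cu: the ncclRedOpCreatePreMulSum()
// path taken in startColl() above (NCCL >= 2.11), shown standalone. The
// scalar 0.5f and the pre-initialized comm/stream/buffers are hypothetical.
#include <cuda_runtime.h>
#include <nccl.h>

#define CHECK_NCCL(cmd) do { ncclResult_t r = (cmd); if (r != ncclSuccess) return r; } while (0)

static ncclResult_t premulAllReduce(const float* sendbuff, float* recvbuff, size_t count,
                                    ncclComm_t comm, cudaStream_t stream) {
  float scalar = 0.5f;          // every contribution is multiplied by this before summing
  ncclRedOp_t op;
  // ncclScalarHostImmediate: the scalar is captured from host memory when the
  // collective is issued, so it need not outlive this call.
  CHECK_NCCL(ncclRedOpCreatePreMulSum(&op, &scalar, ncclFloat32,
                                      ncclScalarHostImmediate, comm));
  CHECK_NCCL(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, op, comm, stream));
  // Custom ops are a per-communicator resource; startColl() likewise destroys
  // the op right after enqueueing the collective.
  CHECK_NCCL(ncclRedOpDestroy(op, comm));
  return ncclSuccess;
}
// ---------------------------------------------------------------------------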
testResult_t completeColl(struct threadArgs* args) {
  if (blocking_coll) return testSuccess;

  TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
  return testSuccess;
}
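// ---------------------------------------------------------------------------
// Illustrative sketch, not part of common.cu: the capture/instantiate/replay
// sequence BenchTime() below uses when -G/--cudagraph is set. The kernel,
// buffer, and launch shape are hypothetical; the five-argument
// cudaGraphInstantiate() call mirrors the one in BenchTime().
#include <cuda_runtime.h>

__global__ void bumpKernel(float* p) { p[threadIdx.x] += 1.0f; }

static void replayNTimes(cudaStream_t stream, float* devBuf /* >= 32 floats */, int launches) {
  cudaGraph_t graph;
  cudaGraphExec_t graphExec;

  // Thread-local capture, as in BenchTime(), so captures running concurrently
  // on other host threads do not interfere with this one.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  bumpKernel<<<1, 32, 0, stream>>>(devBuf);     // recorded into the graph, not executed
  cudaStreamEndCapture(stream, &graph);

  cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
  for (int l = 0; l < launches; l++)
    cudaGraphLaunch(graphExec, stream);         // replay with minimal launch overhead
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graphExec);
  cudaGraphDestroy(graph);
}
// ---------------------------------------------------------------------------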
testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
  size_t count = args->nbytes / wordSize(type);
  if (datacheck) {
    // Initialize sendbuffs, recvbuffs and expected
    TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
  }

  // Sync
  TESTCHECK(startColl(args, type, op, root, in_place, 0));
  TESTCHECK(completeColl(args));

  Barrier(args);

#if CUDART_VERSION >= 11030
  cudaGraph_t graphs[args->nGpus];
  cudaGraphExec_t graphExec[args->nGpus];
  if (cudaGraphLaunches >= 1) {
    // Begin cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
      // Thread local mode is needed for:
      // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
      // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
    }
  }
#endif

  // Performance Benchmark
  timer tim;
  for (int iter = 0; iter < iters; iter++) {
    if (agg_iters>1) NCCLCHECK(ncclGroupStart());
    for (int aiter = 0; aiter < agg_iters; aiter++) {
      TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter));
    }
    if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
  }

#if CUDART_VERSION >= 11030
  if (cudaGraphLaunches >= 1) {
    // End cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
    }
    // Instantiate cuda graph
    for (int i=0; i<args->nGpus; i++) {
      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
    }
    // Resync CPU, restart timing, launch cuda graph
    Barrier(args);
    tim.reset();
    for (int l=0; l<cudaGraphLaunches; l++) {
      for (int i=0; i<args->nGpus; i++) {
        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
      }
    }
  }
#endif

  double cputimeSec = tim.elapsed()/(iters*agg_iters);
  TESTCHECK(completeColl(args));

  double deltaSec = tim.elapsed();
  deltaSec = deltaSec/(iters*agg_iters);
  if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
  Allreduce(args, &deltaSec, average);

#if CUDART_VERSION >= 11030
  if (cudaGraphLaunches >= 1) {
    // Destroy cuda graph
    for (int i=0; i<args->nGpus; i++) {
      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
      CUDACHECK(cudaGraphDestroy(graphs[i]));
    }
  }
#endif

  double algBw, busBw;
  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);

  Barrier(args);

  int64_t wrongElts = 0;
  static __thread int rep = 0;
  rep++;
  for (int c = 0; c < datacheck; c++) {
    // Initialize sendbuffs, recvbuffs and expected
    TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));

#if CUDART_VERSION >= 11030
    if (cudaGraphLaunches >= 1) {
      // Begin cuda graph capture for data check
      for (int i=0; i<args->nGpus; i++) {
        CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
      }
    }
#endif

    // Test validation in a single iteration; should ideally be included into the multi-iteration run
    TESTCHECK(startColl(args, type, op, root, in_place, 0));

#if CUDART_VERSION >= 11030
    if (cudaGraphLaunches >= 1) {
      // End cuda graph capture
      for (int i=0; i<args->nGpus; i++) {
        CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
      }
      // Instantiate cuda graph
      for (int i=0; i<args->nGpus; i++) {
        CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
      }
      // Launch cuda graph
      for (int i=0; i<args->nGpus; i++) {
        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
      }
    }
#endif

    TESTCHECK(completeColl(args));

#if CUDART_VERSION >= 11030
    if (cudaGraphLaunches >= 1) {
      // Destroy cuda graph
      for (int i=0; i<args->nGpus; i++) {
        CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
        CUDACHECK(cudaGraphDestroy(graphs[i]));
      }
    }
#endif

    TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts));

    // Aggregate delta from all threads and procs
    long long wrongElts1 = wrongElts;
    //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts);
    Allreduce(args, &wrongElts1, /*sum*/4);
    wrongElts = wrongElts1;
    if (wrongElts) break;
  }

  double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
  char timeStr[100];
  if (timeUsec >= 10000.0) {
    sprintf(timeStr, "%7.0f", timeUsec);
  } else if (timeUsec >= 100.0) {
    sprintf(timeStr, "%7.1f", timeUsec);
  } else {
    sprintf(timeStr, "%7.2f", timeUsec);
  }
  if (args->reportErrors) {
    PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts);
  } else {
    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
  }

  args->bw[0] += busBw;
  args->bw_count[0]++;
  return testSuccess;
}
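// ---------------------------------------------------------------------------
// Illustrative sketch, not part of common.cu: getBw() above is supplied per
// collective (see src/*.cu and doc/PERFORMANCE.md). For all_reduce with n
// total ranks, algorithm bandwidth is bytes moved per second, and bus
// bandwidth rescales it by 2*(n-1)/n to reflect each rank's send+receive
// volume, giving a figure comparable across collectives and to link speed.
#include <stddef.h>

static void allReduceGetBwSketch(size_t count, int typesize, double sec,
                                 double* algBw, double* busBw, int nranks) {
  double baseBw = (double)(count * typesize) / 1.0E9 / sec;   // GB/s over the user buffer
  *algBw = baseBw;
  double factor = ((double)(2 * (nranks - 1))) / ((double)nranks);
  *busBw = baseBw * factor;                                   // hardware-comparable figure
}
// ---------------------------------------------------------------------------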
void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
  int nranks = args->nProcs*args->nGpus*args->nThreads;
  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;

  count = size / wordSize(type);
  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);

  args->nbytes = paramCount * wordSize(type);
  args->sendBytes = sendCount * wordSize(type);
  args->expectedBytes = recvCount * wordSize(type);
  args->sendInplaceOffset = sendInplaceOffset * wordSize(type);
  args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
}

testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
  // Sync to avoid first-call timeout
  Barrier(args);

  // Warm-up for large size
  setupArgs(args->maxbytes, type, args);
  for (int iter = 0; iter < warmup_iters; iter++) {
    TESTCHECK(startColl(args, type, op, root, 0, iter));
  }
  TESTCHECK(completeColl(args));

  // Warm-up for small size
  setupArgs(args->minbytes, type, args);
  for (int iter = 0; iter < warmup_iters; iter++) {
    TESTCHECK(startColl(args, type, op, root, 0, iter));
  }
  TESTCHECK(completeColl(args));

  // Benchmark
  for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
    setupArgs(size, type, args);
    char rootName[100];
    sprintf(rootName, "%6i", root);
    PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
    TESTCHECK(BenchTime(args, type, op, root, 0));
    TESTCHECK(BenchTime(args, type, op, root, 1));
    PRINT("\n");
  }
  return testSuccess;
}

testResult_t threadRunTests(struct threadArgs* args) {
  // Set device to the first of our GPUs. If we don't do that, some operations
  // will be done on the current GPU (by default : 0) and if the GPUs are in
  // exclusive mode those operations will fail.
  CUDACHECK(cudaSetDevice(args->gpus[0]));
  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
  return testSuccess;
}
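// ---------------------------------------------------------------------------
// Illustrative sketch, not part of common.cu: the size sweep TimeTest() above
// performs. With the hypothetical flags -b 8 -e 128 -f 2 the loop visits
// 8, 16, 32, 64, 128 bytes; with a step factor <= 1 it instead advances
// linearly by -i/--stepbytes.
#include <stdio.h>
#include <stddef.h>

int main() {
  size_t minbytes = 8, maxbytes = 128, stepbytes = 1, stepfactor = 2;
  for (size_t size = minbytes; size <= maxbytes;
       size = (stepfactor > 1) ? size * stepfactor : size + stepbytes) {
    printf("%zu\n", size);   // each size becomes one row of the report
  }
  return 0;
}
// ---------------------------------------------------------------------------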
testResult_t threadInit(struct threadArgs* args) {
  char hostname[1024];
  getHostName(hostname, 1024);
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  // Set main thread again
  is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;

  NCCLCHECK(ncclGroupStart());
  for (int i=0; i<args->nGpus; i++) {
    int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
    CUDACHECK(cudaSetDevice(args->gpus[i]));
    NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
  }
  NCCLCHECK(ncclGroupEnd());

  TESTCHECK(threadRunTests(args));

  for (int i=0; i<args->nGpus; i++) {
    NCCLCHECK(ncclCommDestroy(args->comms[i]));
  }
  return testSuccess;
}

void* threadLauncher(void* thread_) {
  struct testThread* thread = (struct testThread*)thread_;
  thread->ret = thread->func(&thread->args);
  return NULL;
}
testResult_t threadLaunch(struct testThread* thread) {
  pthread_create(&thread->thread, NULL, threadLauncher, thread);
  return testSuccess;
}

testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
  CUDACHECK(cudaMalloc(sendbuff, nbytes));
  CUDACHECK(cudaMalloc(recvbuff, nbytes));
  if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
  return testSuccess;
}

testResult_t run(); // Main function
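// ---------------------------------------------------------------------------
// Illustrative sketch, not part of common.cu: where the ncclUniqueId consumed
// by ncclCommInitRank() in threadInit() comes from in a multi-process run.
// Assumes MPI and one GPU per process; the harness instead folds threads and
// GPUs into the rank as proc*nThreads*nGpus + thread*nGpus + i.
#include <mpi.h>
#include <nccl.h>
#include <cuda_runtime.h>

int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);
  int rank, nranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);       // one id per communicator
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  int devCount;
  cudaGetDeviceCount(&devCount);
  cudaSetDevice(rank % devCount);            // map ranks onto local GPUs

  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, rank); // collective: every rank must call

  /* ... issue collectives on a stream ... */

  ncclCommDestroy(comm);
  MPI_Finalize();
  return 0;
}
// ---------------------------------------------------------------------------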
int main(int argc, char* argv[]) {
  // Make sure every line is flushed so that we see the progress of the test
  setlinebuf(stdout);

#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
  ncclGetVersion(&test_ncclVersion);
#else
  test_ncclVersion = NCCL_VERSION_CODE;
#endif
  //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0)
  test_opnum = 4;
  test_typenum = 9;
  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
    test_opnum++; // ncclAvg
#if defined(__CUDA_BF16_TYPES_EXIST__)
    test_typenum++; // bfloat16
#endif
  }
  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
    test_opnum++; // PreMulSum
  }
#endif

  // Parse args
  double parsed;
  int longindex;
  static struct option longopts[] = {
    {"nthreads", required_argument, 0, 't'},
    {"ngpus", required_argument, 0, 'g'},
    {"minbytes", required_argument, 0, 'b'},
    {"maxbytes", required_argument, 0, 'e'},
    {"stepbytes", required_argument, 0, 'i'},
    {"stepfactor", required_argument, 0, 'f'},
    {"iters", required_argument, 0, 'n'},
    {"agg_iters", required_argument, 0, 'm'},
    {"warmup_iters", required_argument, 0, 'w'},
    {"parallel_init", required_argument, 0, 'p'},
    {"check", required_argument, 0, 'c'},
    {"op", required_argument, 0, 'o'},
    {"datatype", required_argument, 0, 'd'},
    {"root", required_argument, 0, 'r'},
    {"blocking", required_argument, 0, 'z'},
    {"stream_null", required_argument, 0, 'y'},
    {"timeout", required_argument, 0, 'T'},
    {"cudagraph", required_argument, 0, 'G'},
    {"report_cputime", required_argument, 0, 'C'},
    {"average", required_argument, 0, 'a'},
    {"help", no_argument, 0, 'h'},
    {}
  };

  while(1) {
    int c;
    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);

    if (c == -1)
      break;

    switch(c) {
      case 't':
        nThreads = strtol(optarg, NULL, 0);
        break;
      case 'g':
        nGpus = strtol(optarg, NULL, 0);
        break;
      case 'b':
        parsed = parsesize(optarg);
        if (parsed < 0) {
          fprintf(stderr, "invalid size specified for 'minbytes'\n");
          return -1;
        }
        minBytes = (size_t)parsed;
        break;
      case 'e':
        parsed = parsesize(optarg);
        if (parsed < 0) {
          fprintf(stderr, "invalid size specified for 'maxbytes'\n");
          return -1;
        }
        maxBytes = (size_t)parsed;
        break;
      case 'i':
        stepBytes = strtol(optarg, NULL, 0);
        break;
      case 'f':
        stepFactor = strtol(optarg, NULL, 0);
        break;
      case 'n':
        iters = (int)strtol(optarg, NULL, 0);
        break;
      case 'm':
#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
        agg_iters = (int)strtol(optarg, NULL, 0);
#else
        fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
#endif
        break;
      case 'w':
        warmup_iters = (int)strtol(optarg, NULL, 0);
        break;
      case 'c':
        datacheck = (int)strtol(optarg, NULL, 0);
        break;
      case 'p':
        parallel_init = (int)strtol(optarg, NULL, 0);
        break;
      case 'o':
        ncclop = ncclstringtoop(optarg);
        break;
      case 'd':
        nccltype = ncclstringtotype(optarg);
        break;
      case 'r':
        ncclroot = strtol(optarg, NULL, 0);
        break;
      case 'z':
        blocking_coll = strtol(optarg, NULL, 0);
        break;
      case 'y':
        streamnull = strtol(optarg, NULL, 0);
        break;
      case 'T':
        timeout = strtol(optarg, NULL, 0);
        break;
      case 'G':
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
        cudaGraphLaunches = strtol(optarg, NULL, 0);
#else
        printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
#endif
        break;
      case 'C':
        report_cputime = strtol(optarg, NULL, 0);
        break;
      case 'a':
        average = (int)strtol(optarg, NULL, 0);
        break;
      case 'h':
      default:
        if (c != 'h') printf("invalid option '%c'\n", c);
        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <check iteration count>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
            "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
            "[-y,--stream_null <0/1>] \n\t"
            "[-T,--timeout