├── Dockerfile
├── LICENSE
├── README.md
├── nccl-tests
    ├── .gitignore
    ├── LICENSE.txt
    ├── Makefile
    ├── README.md
    ├── build
    │   ├── all_gather_perf
    │   ├── all_reduce_perf
    │   ├── alltoall_perf
    │   ├── broadcast_perf
    │   ├── gather_perf
    │   ├── hypercube_perf
    │   ├── reduce_perf
    │   ├── reduce_scatter_perf
    │   ├── scatter_perf
    │   ├── sendrecv_perf
    │   ├── timer.o
    │   └── verifiable
    │   │   └── verifiable.o
    ├── doc
    │   └── PERFORMANCE.md
    ├── src
    │   ├── Makefile
    │   ├── all_gather.cu
    │   ├── all_reduce.cu
    │   ├── alltoall.cu
    │   ├── broadcast.cu
    │   ├── common.cu
    │   ├── common.h
    │   ├── gather.cu
    │   ├── hypercube.cu
    │   ├── nccl1_compat.h
    │   ├── reduce.cu
    │   ├── reduce_scatter.cu
    │   ├── scatter.cu
    │   ├── sendrecv.cu
    │   ├── timer.cc
    │   └── timer.h
    └── verifiable
    │   ├── Makefile
    │   ├── inexact_regress.cu
    │   ├── verifiable.cu
    │   ├── verifiable.h
    │   └── verifiable.mk
├── sources.list
├── sshd_config
└── start.sh


/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvcr.io/nvidia/pytorch:23.11-py3
 2 | 
 3 | WORKDIR /
 4 | 
 5 | # change the download source of apt, comment it out if you are abroad
 6 | COPY sources.list /etc/apt/sources.list
 7 | RUN apt-get update && \
 8 |     apt-get install -y openssh-server vim curl inetutils-ping net-tools telnet lsof
 9 | 
10 | COPY start.sh /start.sh
11 | COPY sshd_config /etc/ssh/sshd_config
12 | COPY nccl-tests /nccl-tests
13 | 
14 | CMD ["/bin/bash", "start.sh"]


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Build-NCCL-Tests-With-PyTorch
 2 | 
 3 | ![license](https://img.shields.io/hexpm/l/plug.svg)
 4 | [![docker](https://img.shields.io/docker/pulls/mayooot/nccl-tests-with-pytorch.svg)](https://hub.docker.com/r/mayooot/nccl-tests-with-pytorch)
 5 | 
 6 | # Overview
 7 | 
 8 | Build [NCCL-Tests](https://github.com/NVIDIA/nccl-tests) and configure SSHD in PyTorch container to help you test NCCL
 9 | faster!
10 | 
11 | PyTorch Version: 23.11
12 | 
13 | # Quick Start
14 | 
15 | ~~~shell
16 | docker pull mayooot/nccl-tests-with-pytorch:v0.0.1
17 | ~~~
18 | 
19 | # Build From Source
20 | 
21 | ~~~shell
22 | git clone https://github.com/mayooot/build-nccl-tests-with-pytorch
23 | cd build-nccl-tests-with-pytorch
24 | 
25 | docker build -t nccl-tests-with-pytorch:latest .
26 | ~~~
27 | 
28 | # Usage
29 | 
30 | The default values for `PORT` and `PASS` are 12345, you can replace them with `-e`.
31 | 
32 | In addition, you need to mount the host's `id_rsa` and `id_rsa.pub` to the container.
33 | 
34 | ~~~shell
35 | docker run --name foo \
36 |   -d -it \
37 |   --network=host \
38 |   -e PORT=1998 -e PASS=P@88w0rd \
39 |   -v /tmp/id_rsa:/root/.ssh/id_rsa \
40 |   -v /tmp/id_rsa.pub:/root/.ssh/id_rsa.pub \
41 |   --gpus all --shm-size=1g \
42 |   --cap-add=IPC_LOCK --device=/dev/infiniband \
43 |   mayooot/nccl-tests-with-pytorch:v0.0.1 
44 | ~~~
45 | 
46 | The code and executable for NCCL-Tests is located in `/nccl-tests`, so let me show you how to use it,
47 | using `all_reduce_perf` as an example.
48 | 
49 | Before using `all_reduce_perf`, you need to configure SSH intercommunication.
50 | 
51 | ~~~shell
52 | ssh-copy-id -p 1998 root@all_cluster_ip
53 | ~~~
54 | 
55 | Please replace `--host cluster_ip1,cluster_ip2,...` to the real cluster's IP address.
56 | 
57 | ~~~shell
58 | docker exec -it foo bash
59 | 
60 | cd /nccl-tests
61 | 
62 | mpirun --allow-run-as-root \
63 |   -mca plm_rsh_args "-p 1998" \
64 |   -x NCCL_DEBUG=INFO \
65 |   -x NCCL_IB_HCA=mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 \
66 |   --host cluster_ip1,cluster_ip2,... \
67 |   ./build/all_reduce_perf \
68 |   -b 1G -e 4G -f 2 -g 8
69 | ~~~
70 | 
71 | # Contribute
72 | 
73 | Feel free to open issues and pull requests. Any feedback is highly appreciated!


--------------------------------------------------------------------------------
/nccl-tests/.gitignore:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # See LICENCE.txt for license information
4 | /build
5 | 


--------------------------------------------------------------------------------
/nccl-tests/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 |  Copyright (c) 2016-2017, NVIDIA CORPORATION.  All rights reserved.
 3 | 
 4 |  Redistribution and use in source and binary forms, with or without
 5 |  modification, are permitted provided that the following conditions
 6 |  are met:
 7 |   * Redistributions of source code must retain the above copyright
 8 |     notice, this list of conditions and the following disclaimer.
 9 |   * Redistributions in binary form must reproduce the above copyright
10 |     notice, this list of conditions and the following disclaimer in the
11 |     documentation and/or other materials provided with the distribution.
12 |   * Neither the name of NVIDIA CORPORATION, nor the names of their
13 |     contributors may be used to endorse or promote products derived
14 |     from this software without specific prior written permission.
15 | 
16 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 |  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 |  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 |  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 |  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 |  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 |  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 |  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | 


--------------------------------------------------------------------------------
/nccl-tests/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENCE.txt for license information
 5 | #
 6 | 
 7 | BUILDDIR ?= build
 8 | override BUILDDIR := $(abspath $(BUILDDIR))
 9 | 
10 | .PHONY: all clean
11 | 
12 | default: src.build
13 | 
14 | TARGETS=src
15 | 
16 | all:   ${TARGETS:%=%.build}
17 | clean: ${TARGETS:%=%.clean}
18 | 
19 | %.build:
20 | 	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
21 | 
22 | %.clean:
23 | 	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
24 | 


--------------------------------------------------------------------------------
/nccl-tests/README.md:
--------------------------------------------------------------------------------
 1 | # NCCL Tests
 2 | 
 3 | These tests check both the performance and the correctness of [NCCL](http://github.com/nvidia/nccl) operations.
 4 | 
 5 | ## Build
 6 | 
 7 | To build the tests, just type `make`.
 8 | 
 9 | If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME.
10 | 
11 | ```shell
12 | $ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
13 | ```
14 | 
15 | NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
16 | 
17 | ```shell
18 | $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
19 | ```
20 | 
21 | ## Usage
22 | 
23 | NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
24 | 
25 | ### Quick examples
26 | 
27 | Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
28 | ```shell
29 | $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
30 | ```
31 | 
32 | Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
33 | ```shell
34 | $ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
35 | ```
36 | 
37 | ### Performance
38 | 
39 | See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column.
40 | 
41 | ### Arguments
42 | 
43 | All tests support the same set of arguments :
44 | 
45 | * Number of GPUs
46 |   * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
47 |   * `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
48 | * Sizes to scan
49 |   * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
50 |   * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
51 |   * Increments can be either fixed or a multiplication factor. Only one of those should be used
52 |     * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
53 |     * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
54 | * NCCL operations arguments
55 |   * `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
56 |   * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
57 |   * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
58 | * Performance
59 |   * `-n,--iters <iteration count>` number of iterations. Default : 20.
60 |   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
61 |   * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
62 |   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
63 | * Test operation
64 |   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
65 |   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
66 |   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
67 |   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
68 | 
69 | ## Copyright
70 | 
71 | NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
72 | 
73 | 


--------------------------------------------------------------------------------
/nccl-tests/build/all_gather_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/all_gather_perf


--------------------------------------------------------------------------------
/nccl-tests/build/all_reduce_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/all_reduce_perf


--------------------------------------------------------------------------------
/nccl-tests/build/alltoall_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/alltoall_perf


--------------------------------------------------------------------------------
/nccl-tests/build/broadcast_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/broadcast_perf


--------------------------------------------------------------------------------
/nccl-tests/build/gather_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/gather_perf


--------------------------------------------------------------------------------
/nccl-tests/build/hypercube_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/hypercube_perf


--------------------------------------------------------------------------------
/nccl-tests/build/reduce_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/reduce_perf


--------------------------------------------------------------------------------
/nccl-tests/build/reduce_scatter_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/reduce_scatter_perf


--------------------------------------------------------------------------------
/nccl-tests/build/scatter_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/scatter_perf


--------------------------------------------------------------------------------
/nccl-tests/build/sendrecv_perf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/sendrecv_perf


--------------------------------------------------------------------------------
/nccl-tests/build/timer.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/timer.o


--------------------------------------------------------------------------------
/nccl-tests/build/verifiable/verifiable.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mayooot/build-nccl-tests-with-pytorch/79d1e0995488937530e73e08572f44fd06a5fac5/nccl-tests/build/verifiable/verifiable.o


--------------------------------------------------------------------------------
/nccl-tests/doc/PERFORMANCE.md:
--------------------------------------------------------------------------------
  1 | # Performance reported by NCCL tests
  2 | 
  3 | NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
  4 | 
  5 | # Time
  6 | 
  7 | Time is useful with small sizes, to measure the constant overhead (or latency) associated with operations.
  8 | 
  9 | On large sizes, the time becomes linear with the size (since it is roughly equal to overhead + size / bw) and is no longer measuring the latency but
 10 | also the bandwidth multiplied by the size.
 11 | 
 12 | Therefore, on large sizes, it makes more sense to look at the bandwidth.
 13 | 
 14 | # Bandwidth
 15 | 
 16 | ## Algorithm bandwidth
 17 | 
 18 | Algorithm bandwidth is using the most commonly used formula for bandwidth : size (_S_) / time (_t_). It is useful to compute how much time any large operation would take by simply dividing the size of the operation by the algorithm bandwidth.
 19 | 
 20 | `algbw = S/t`
 21 | 
 22 | ## Bus bandwidth
 23 | 
 24 | While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful to measure collective operations speed, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth, usually depending on the number of ranks.
 25 | Most benchmarks only provide time measurements, which is hard to interpret for large sizes. Some others also provide algorithms bandwidth, but see that depending on the number of ranks, that bandwidth varies (and decreases as the number of ranks increase).
 26 | 
 27 | To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
 28 | This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
 29 | Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used.
 30 | 
 31 | The formula depends on the collective operation.
 32 | 
 33 | ### AllReduce
 34 | 
 35 | An allreduce operation, for each element of the N arrays (input i_X and output o_X, each situated on rank X), is performing the following operation :
 36 | 
 37 | `o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
 38 | 
 39 | **Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).**
 40 | 
 41 | A ring would do that operation in an order which follows the ring :
 42 | 
 43 | `i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
 44 | 
 45 | A tree would do it hierarchically :
 46 | 
 47 | `(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
 48 | 
 49 | In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output),
 50 | we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation.
 51 | 
 52 | Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
 53 | 
 54 |  `t = (S*2*(n-1)) / (n*B)`
 55 | 
 56 | Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
 57 | Reordering the equation, we find that
 58 | 
 59 |  `t = (S/B) * (2*(n-1)/n)`
 60 | 
 61 | Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute :
 62 | 
 63 |  `B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
 64 | 
 65 | ### ReduceScatter
 66 | 
 67 | The ReduceScatter operation requires only to perform the addition part of the allReduce operation :
 68 | 
 69 |  `o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
 70 | 
 71 | With K being the rank which is getting the final result(K=offset/recvsize).
 72 | 
 73 | The perfect reduceScatter time with a rank bandwidth of B would therefore be :
 74 | 
 75 |  `t = S*(n-1) / (B*n)`
 76 | 
 77 | And the Bus Bandwidth is therefore computed as :
 78 | 
 79 |  `B = S/t * (n-1)/n = algbw * (n-1)/n`
 80 | 
 81 | Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
 82 | 
 83 | ### AllGather
 84 | 
 85 | The AllGather operation requires only to perform the assignment part of the allReduce operation :
 86 | 
 87 |  `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
 88 | 
 89 | With K being the rank where the data originates from (K=offset*sendsize).
 90 | 
 91 | The perfect allGather time with a rank bandwidth of B would therefore be :
 92 | 
 93 |  `t = S*(n-1) / (B*n)`
 94 | 
 95 | And the Bus Bandwidth is therefore computed as :
 96 | 
 97 |  `B = S/t * (n-1)/n = algbw * (n-1)/n`
 98 | 
 99 | Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
100 | 
101 | ### Broadcast
102 | 
103 | The broadcast operation representation is similar to allGather :
104 | 
105 |  `o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
106 | 
107 | R being the root of the operation.
108 | 
109 | However, in this case, since the i_R input is not evenly distributed on the ranks, we cannot use all N links to perform the transfer operations.
110 | Indeed, *all* data has to get out of the root rank, hence the bottleneck is on the root rank which only has B as capacity to get data out :
111 | 
112 |  `t = S/B`
113 | 
114 | And :
115 | 
116 |  `B = S/t`
117 | 
118 | ### Reduce
119 | 
120 | The reduce operation performs :
121 | 
122 |  `o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
123 | 
124 | R being the root of the operation.
125 | 
126 | Similarly to broadcast, all data need to be sent to the root, hence :
127 | 
128 |  `t = S/B`
129 | 
130 | And :
131 | 
132 |  `B = S/t`
133 | 
134 | ### Summary
135 | 
136 | To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth :
137 | 
138 | * AllReduce : 2*(_n_-1)/_n_
139 | * ReduceScatter : (_n_-1)/_n_
140 | * AllGather : (_n_-1)/_n_
141 | * Broadcast : 1
142 | * Reduce : 1
143 | 
144 | The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
145 | 


--------------------------------------------------------------------------------
/nccl-tests/src/Makefile:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # See LICENSE.txt for license information
  5 | #
  6 | 
  7 | CUDA_HOME ?= /usr/local/cuda
  8 | PREFIX ?= /usr/local
  9 | VERBOSE ?= 0
 10 | DEBUG ?= 0
 11 | 
 12 | CUDA_LIB ?= $(CUDA_HOME)/lib64
 13 | CUDA_INC ?= $(CUDA_HOME)/include
 14 | NVCC ?= $(CUDA_HOME)/bin/nvcc
 15 | CUDARTLIB ?= cudart
 16 | 
 17 | CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 18 | CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 19 | 
 20 | # Better define NVCC_GENCODE in your environment to the minimal set
 21 | # of archs to reduce compile time.
 22 | ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
 23 | NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
 24 |                 -gencode=arch=compute_61,code=sm_61 \
 25 |                 -gencode=arch=compute_70,code=sm_70 \
 26 |                 -gencode=arch=compute_80,code=sm_80 \
 27 |                 -gencode=arch=compute_80,code=compute_80
 28 | else
 29 | NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
 30 |                 -gencode=arch=compute_50,code=sm_50 \
 31 |                 -gencode=arch=compute_60,code=sm_60 \
 32 |                 -gencode=arch=compute_61,code=sm_61 \
 33 |                 -gencode=arch=compute_70,code=sm_70 \
 34 |                 -gencode=arch=compute_70,code=compute_70
 35 | endif
 36 | 
 37 | NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 38 | CXXFLAGS   := -std=c++11
 39 | 
 40 | LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
 41 | NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
 42 | 
 43 | ifeq ($(DEBUG), 0)
 44 | NVCUFLAGS += -O3 -g
 45 | CXXFLAGS  += -O3 -g
 46 | else
 47 | NVCUFLAGS += -O0 -G -g
 48 | CXXFLAGS  += -O0 -g -ggdb3
 49 | endif
 50 | 
 51 | ifneq ($(VERBOSE), 0)
 52 | NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 53 | else
 54 | .SILENT:
 55 | endif
 56 | 
 57 | .PHONY: build clean
 58 | 
 59 | BUILDDIR ?= ../build
 60 | ifneq ($(NCCL_HOME), "")
 61 | NVCUFLAGS += -I$(NCCL_HOME)/include/
 62 | NVLDFLAGS += -L$(NCCL_HOME)/lib
 63 | endif
 64 | 
 65 | ifeq ($(MPI), 1)
 66 | NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
 67 | NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi
 68 | endif
 69 | ifeq ($(MPI_IBM),1)
 70 | NVCUFLAGS += -DMPI_SUPPORT
 71 | NVLDFLAGS += -lmpi_ibm
 72 | endif
 73 | LIBRARIES += nccl
 74 | NVLDFLAGS += $(LIBRARIES:%=-l%)
 75 | 
 76 | DST_DIR := $(BUILDDIR)
 77 | SRC_FILES := $(wildcard *.cu)
 78 | OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
 79 | BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
 80 | BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 81 | 
 82 | build: ${BIN_FILES}
 83 | 
 84 | clean:
 85 | 	rm -rf ${DST_DIR}
 86 | 
 87 | TEST_VERIFIABLE_SRCDIR := ../verifiable
 88 | TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 89 | include ../verifiable/verifiable.mk
 90 | 
 91 | ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
 92 | 	@printf "Compiling  %-35s > %s\n" $< $@
 93 | 	@mkdir -p ${DST_DIR}
 94 | 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
 95 | 
 96 | ${DST_DIR}/timer.o: timer.cc timer.h
 97 | 	@printf "Compiling  %-35s > %s\n" $< $@
 98 | 	@mkdir -p ${DST_DIR}
 99 | 	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
100 | 
101 | ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
102 | 	@printf "Linking  %-35s > %s\n" $< $@
103 | 	@mkdir -p ${DST_DIR}
104 | 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
105 | 
106 | 


--------------------------------------------------------------------------------
/nccl-tests/src/all_gather.cu:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #include "cuda_runtime.h"
 8 | #include "common.h"
 9 | 
10 | #define ALIGN 4
11 | 
12 | void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
13 |   size_t base = (count/(ALIGN*nranks))*ALIGN;
14 |   *sendcount = base;
15 |   *recvcount = base*nranks;
16 |   *sendInplaceOffset = base;
17 |   *recvInplaceOffset = 0;
18 |   *paramcount = base;
19 | }
20 | 
21 | testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
22 |   size_t sendcount = args->sendBytes / wordSize(type);
23 |   size_t recvcount = args->expectedBytes / wordSize(type);
24 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
25 | 
26 |   for (int i=0; i<args->nGpus; i++) {
27 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
28 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
29 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
30 |     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
31 |     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
32 |     for (int j=0; j<nranks; j++) {
33 |       TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
34 |     }
35 |     CUDACHECK(cudaDeviceSynchronize());
36 |   }
37 |   return testSuccess;
38 | }
39 | 
40 | void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
41 |   double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
42 | 
43 |   *algBw = baseBw;
44 |   double factor = ((double)(nranks - 1))/((double)nranks);
45 |   *busBw = baseBw * factor;
46 | }
47 | 
48 | testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
49 |   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
50 |   return testSuccess;
51 | }
52 | 
53 | struct testColl allGatherTest = {
54 |   "AllGather",
55 |   AllGatherGetCollByteCount,
56 |   AllGatherInitData,
57 |   AllGatherGetBw,
58 |   AllGatherRunColl
59 | };
60 | 
61 | void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
62 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
63 |   AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
64 | }
65 | 
66 | testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
67 |   args->collTest = &allGatherTest;
68 |   ncclDataType_t *run_types;
69 |   const char **run_typenames;
70 |   int type_count;
71 | 
72 |   if ((int)type != -1) {
73 |     type_count = 1;
74 |     run_types = &type;
75 |     run_typenames = &typeName;
76 |   } else {
77 |     type_count = test_typenum;
78 |     run_types = test_types;
79 |     run_typenames = test_typenames;
80 |   }
81 | 
82 |   for (int i=0; i<type_count; i++) {
83 |     TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
84 |   }
85 |   return testSuccess;
86 | }
87 | 
88 | struct testEngine allGatherEngine = {
89 |   AllGatherGetBuffSize,
90 |   AllGatherRunTest
91 | };
92 | 
93 | #pragma weak ncclTestEngine=allGatherEngine
94 | 


--------------------------------------------------------------------------------
/nccl-tests/src/all_reduce.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = count;
 12 |   *recvcount = count;
 13 |   *sendInplaceOffset = 0;
 14 |   *recvInplaceOffset = 0;
 15 |   *paramcount = *sendcount;
 16 | }
 17 | 
 18 | testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 22 | 
 23 |   for (int i=0; i<args->nGpus; i++) {
 24 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 25 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 26 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 27 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 28 |     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
 29 |     TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
 30 |     CUDACHECK(cudaDeviceSynchronize());
 31 |   }
 32 |   return testSuccess;
 33 | }
 34 | 
 35 | void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 36 |   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 37 | 
 38 |   *algBw = baseBw;
 39 |   double factor = ((double)(2*(nranks - 1)))/((double)nranks);
 40 |   *busBw = baseBw * factor;
 41 | }
 42 | 
 43 | testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 44 |   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
 45 |   return testSuccess;
 46 | }
 47 | 
 48 | struct testColl allReduceTest = {
 49 |   "AllReduce",
 50 |   AllReduceGetCollByteCount,
 51 |   AllReduceInitData,
 52 |   AllReduceGetBw,
 53 |   AllReduceRunColl
 54 | };
 55 | 
 56 | void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 57 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 58 |   AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 59 | }
 60 | 
 61 | testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 62 |   args->collTest = &allReduceTest;
 63 |   ncclDataType_t *run_types;
 64 |   ncclRedOp_t *run_ops;
 65 |   const char **run_typenames, **run_opnames;
 66 |   int type_count, op_count;
 67 | 
 68 |   if ((int)type != -1) {
 69 |     type_count = 1;
 70 |     run_types = &type;
 71 |     run_typenames = &typeName;
 72 |   } else {
 73 |     type_count = test_typenum;
 74 |     run_types = test_types;
 75 |     run_typenames = test_typenames;
 76 |   }
 77 | 
 78 |   if ((int)op != -1) {
 79 |     op_count = 1;
 80 |     run_ops = &op;
 81 |     run_opnames = &opName;
 82 |   } else {
 83 |     op_count = test_opnum;
 84 |     run_ops = test_ops;
 85 |     run_opnames = test_opnames;
 86 |   }
 87 | 
 88 |   for (int i=0; i<type_count; i++) {
 89 |     for (int j=0; j<op_count; j++) {
 90 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
 91 |     }
 92 |   }
 93 |   return testSuccess;
 94 | }
 95 | 
 96 | struct testEngine allReduceEngine = {
 97 |   AllReduceGetBuffSize,
 98 |   AllReduceRunTest
 99 | };
100 | 
101 | #pragma weak ncclTestEngine=allReduceEngine
102 | 


--------------------------------------------------------------------------------
/nccl-tests/src/alltoall.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = (count/nranks)*nranks;
 12 |   *recvcount = (count/nranks)*nranks;
 13 |   *sendInplaceOffset = 0;
 14 |   *recvInplaceOffset = 0;
 15 |   *paramcount = count/nranks;
 16 | }
 17 | 
 18 | testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 22 | 
 23 |   for (int i=0; i<args->nGpus; i++) {
 24 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 25 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 26 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 27 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 28 |     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
 29 |     for (int j=0; j<nranks; j++) {
 30 |       size_t partcount = sendcount/nranks;
 31 |       TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
 32 |     }
 33 |     CUDACHECK(cudaDeviceSynchronize());
 34 |   }
 35 |   // We don't support in-place alltoall
 36 |   args->reportErrors = in_place ? 0 : 1;
 37 |   return testSuccess;
 38 | }
 39 | 
 40 | void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 41 |   double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
 42 | 
 43 |   *algBw = baseBw;
 44 |   double factor = ((double)(nranks-1))/((double)(nranks));
 45 |   *busBw = baseBw * factor;
 46 | }
 47 | 
 48 | testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 49 |   int nRanks;
 50 |   NCCLCHECK(ncclCommCount(comm, &nRanks));
 51 |   size_t rankOffset = count * wordSize(type);
 52 | 
 53 | #if NCCL_MAJOR < 2 || NCCL_MINOR < 7
 54 |   printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
 55 |   return testNcclError;
 56 | #else
 57 |   NCCLCHECK(ncclGroupStart());
 58 |   for (int r=0; r<nRanks; r++) {
 59 |     NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
 60 |     NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
 61 |   }
 62 |   NCCLCHECK(ncclGroupEnd());
 63 |   return testSuccess;
 64 | #endif
 65 | }
 66 | 
 67 | struct testColl alltoAllTest = {
 68 |   "AlltoAll",
 69 |   AlltoAllGetCollByteCount,
 70 |   AlltoAllInitData,
 71 |   AlltoAllGetBw,
 72 |   AlltoAllRunColl
 73 | };
 74 | 
 75 | void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 76 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 77 |   AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 78 | }
 79 | 
 80 | testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 81 |   args->collTest = &alltoAllTest;
 82 |   ncclDataType_t *run_types;
 83 |   const char **run_typenames;
 84 |   int type_count;
 85 | 
 86 |   if ((int)type != -1) {
 87 |     type_count = 1;
 88 |     run_types = &type;
 89 |     run_typenames = &typeName;
 90 |   } else {
 91 |     type_count = test_typenum;
 92 |     run_types = test_types;
 93 |     run_typenames = test_typenames;
 94 |   }
 95 | 
 96 |   for (int i=0; i<type_count; i++) {
 97 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
 98 |   }
 99 |   return testSuccess;
100 | }
101 | 
102 | struct testEngine alltoAllEngine = {
103 |   AlltoAllGetBuffSize,
104 |   AlltoAllRunTest
105 | };
106 | 
107 | #pragma weak ncclTestEngine=alltoAllEngine
108 | 


--------------------------------------------------------------------------------
/nccl-tests/src/broadcast.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = count;
 12 |   *recvcount = count;
 13 |   *sendInplaceOffset = 0;
 14 |   *recvInplaceOffset = 0;
 15 |   *paramcount = *sendcount;
 16 | }
 17 | 
 18 | testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 | 
 22 |   for (int i=0; i<args->nGpus; i++) {
 23 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 24 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 25 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 26 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 27 |     if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
 28 |     TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
 29 |     CUDACHECK(cudaDeviceSynchronize());
 30 |   }
 31 |   return testSuccess;
 32 | }
 33 | 
 34 | void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 35 |   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 36 | 
 37 |   *algBw = baseBw;
 38 |   double factor = 1;
 39 |   *busBw = baseBw * factor;
 40 | }
 41 | 
 42 | testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 43 |   int rank;
 44 |   NCCLCHECK(ncclCommUserRank(comm, &rank));
 45 | #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
 46 |   NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
 47 | #else
 48 |   if (rank == root) {
 49 |       NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
 50 |   } else {
 51 |       NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
 52 |   }
 53 | #endif
 54 |   return testSuccess;
 55 | }
 56 | 
 57 | struct testColl broadcastTest = {
 58 |   "Broadcast",
 59 |   BroadcastGetCollByteCount,
 60 |   BroadcastInitData,
 61 |   BroadcastGetBw,
 62 |   BroadcastRunColl
 63 | };
 64 | 
 65 | void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 66 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 67 |   BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 68 | }
 69 | 
 70 | testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 71 |   args->collTest = &broadcastTest;
 72 |   ncclDataType_t *run_types;
 73 |   const char **run_typenames;
 74 |   int type_count;
 75 |   int begin_root, end_root;
 76 | 
 77 |   if ((int)type != -1) {
 78 |     type_count = 1;
 79 |     run_types = &type;
 80 |     run_typenames = &typeName;
 81 |   } else {
 82 |     type_count = test_typenum;
 83 |     run_types = test_types;
 84 |     run_typenames = test_typenames;
 85 |   }
 86 | 
 87 |   if (root != -1) {
 88 |     begin_root = end_root = root;
 89 |   } else {
 90 |     begin_root = 0;
 91 |     end_root = args->nProcs*args->nThreads*args->nGpus-1;
 92 |   }
 93 | 
 94 |   for (int i=0; i<type_count; i++) {
 95 |     for (int j=begin_root; j<=end_root; j++) {
 96 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
 97 |     }
 98 |   }
 99 |   return testSuccess;
100 | }
101 | 
102 | struct testEngine broadcastEngine = {
103 |   BroadcastGetBuffSize,
104 |   BroadcastRunTest
105 | };
106 | 
107 | #pragma weak ncclTestEngine=broadcastEngine
108 | 


--------------------------------------------------------------------------------
/nccl-tests/src/common.cu:
--------------------------------------------------------------------------------
   1 | /*************************************************************************
   2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
   3 |  *
   4 |  * See LICENSE.txt for license information
   5 |  ************************************************************************/
   6 | 
   7 | #include "common.h"
   8 | #include <pthread.h>
   9 | #include <cstdio>
  10 | #include <type_traits>
  11 | #include <getopt.h>
  12 | #include <libgen.h>
  13 | #include "cuda.h"
  14 | 
  15 | #include "../verifiable/verifiable.h"
  16 | 
  17 | int test_ncclVersion = 0; // init'd with ncclGetVersion()
  18 | 
  19 | #if NCCL_MAJOR >= 2
  20 |   ncclDataType_t test_types[ncclNumTypes] = {
  21 |     ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
  22 |   #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
  23 |     , ncclBfloat16
  24 |   #endif
  25 |   };
  26 |   const char *test_typenames[ncclNumTypes] = {
  27 |     "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
  28 |   #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
  29 |     , "bfloat16"
  30 |   #endif
  31 |   };
  32 |   int test_typenum = -1;
  33 | 
  34 |   const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"};
  35 |   ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin
  36 |   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
  37 |     , ncclAvg
  38 |   #endif
  39 |   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
  40 |     , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand
  41 |   #endif
  42 |   };
  43 |   int test_opnum = -1;
  44 | #else
  45 |   ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
  46 |   const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
  47 |   int test_typenum = 7;
  48 |   const char *test_opnames[] = {"sum", "prod", "max", "min"};
  49 |   ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin};
  50 |   int test_opnum = 4;
  51 | #endif
  52 | 
  53 | // For libnccl's < 2.13
  54 | extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) {
  55 |   return "";
  56 | }
  57 | 
  58 | int is_main_proc = 0;
  59 | thread_local int is_main_thread = 0;
  60 | 
  61 | // Command line parameter defaults
  62 | static int nThreads = 1;
  63 | static int nGpus = 1;
  64 | static size_t minBytes = 32*1024*1024;
  65 | static size_t maxBytes = 32*1024*1024;
  66 | static size_t stepBytes = 1*1024*1024;
  67 | static size_t stepFactor = 1;
  68 | static int datacheck = 1;
  69 | static int warmup_iters = 5;
  70 | static int iters = 20;
  71 | static int agg_iters = 1;
  72 | static int ncclop = ncclSum;
  73 | static int nccltype = ncclFloat;
  74 | static int ncclroot = 0;
  75 | static int parallel_init = 0;
  76 | static int blocking_coll = 0;
  77 | static int streamnull = 0;
  78 | static int timeout = 0;
  79 | static int cudaGraphLaunches = 0;
  80 | static int report_cputime = 0;
  81 | // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
  82 | static int average = 1;
  83 | 
  84 | #define NUM_BLOCKS 32
  85 | 
  86 | static double parsesize(const char *value) {
  87 |     long long int units;
  88 |     double size;
  89 |     char size_lit;
  90 | 
  91 |     int count = sscanf(value, "%lf %1s", &size, &size_lit);
  92 | 
  93 |     switch (count) {
  94 |     case 2:
  95 |       switch (size_lit) {
  96 |       case 'G':
  97 |       case 'g':
  98 |         units = 1024*1024*1024;
  99 |         break;
 100 |       case 'M':
 101 |       case 'm':
 102 |         units = 1024*1024;
 103 |         break;
 104 |       case 'K':
 105 |       case 'k':
 106 |         units = 1024;
 107 |         break;
 108 |       default:
 109 |         return -1.0;
 110 |       };
 111 |       break;
 112 |     case 1:
 113 |       units = 1;
 114 |       break;
 115 |     default:
 116 |       return -1.0;
 117 |     }
 118 | 
 119 |     return size * units;
 120 | }
 121 | 
 122 | testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
 123 |   ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
 124 |   CUDACHECK(cudaDeviceSynchronize());
 125 |   return testSuccess;
 126 | }
 127 | 
 128 | testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
 129 |   ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
 130 |   return testSuccess;
 131 | }
 132 | 
 133 | testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
 134 |   ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
 135 |   return testSuccess;
 136 | }
 137 | 
 138 | void Barrier(struct threadArgs *args) {
 139 |   thread_local int epoch = 0;
 140 |   static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
 141 |   static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
 142 |   static int counter[2] = {0, 0};
 143 | 
 144 |   pthread_mutex_lock(&lock[epoch]);
 145 |   if(++counter[epoch] == args->nThreads)
 146 |     pthread_cond_broadcast(&cond[epoch]);
 147 | 
 148 |   if(args->thread+1 == args->nThreads) {
 149 |     while(counter[epoch] != args->nThreads)
 150 |       pthread_cond_wait(&cond[epoch], &lock[epoch]);
 151 |     #ifdef MPI_SUPPORT
 152 |       MPI_Barrier(MPI_COMM_WORLD);
 153 |     #endif
 154 |     counter[epoch] = 0;
 155 |     pthread_cond_broadcast(&cond[epoch]);
 156 |   }
 157 |   else {
 158 |     while(counter[epoch] != 0)
 159 |       pthread_cond_wait(&cond[epoch], &lock[epoch]);
 160 |   }
 161 |   pthread_mutex_unlock(&lock[epoch]);
 162 |   epoch ^= 1;
 163 | }
 164 | 
 165 | // Inter-thread/process barrier+allreduce. The quality of the return value
 166 | // for average=0 (which means broadcast from rank=0) is dubious. The returned
 167 | // value will actually be the result of process-local broadcast from the local thread=0.
 168 | template<typename T>
 169 | void Allreduce(struct threadArgs* args, T* value, int average) {
 170 |   thread_local int epoch = 0;
 171 |   static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
 172 |   static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
 173 |   static T accumulator[2];
 174 |   static int counter[2] = {0, 0};
 175 | 
 176 |   pthread_mutex_lock(&lock[epoch]);
 177 |   if(counter[epoch] == 0) {
 178 |     if(average != 0 || args->thread == 0) accumulator[epoch] = *value;
 179 |   } else {
 180 |     switch(average) {
 181 |     case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break;
 182 |     case /*avg*/1: accumulator[epoch] += *value; break;
 183 |     case /*min*/2: accumulator[epoch] = std::min<T>(accumulator[epoch], *value); break;
 184 |     case /*max*/3: accumulator[epoch] = std::max<T>(accumulator[epoch], *value); break;
 185 |     case /*sum*/4: accumulator[epoch] += *value; break;
 186 |     }
 187 |   }
 188 | 
 189 |   if(++counter[epoch] == args->nThreads)
 190 |     pthread_cond_broadcast(&cond[epoch]);
 191 | 
 192 |   if(args->thread+1 == args->nThreads) {
 193 |     while(counter[epoch] != args->nThreads)
 194 |       pthread_cond_wait(&cond[epoch], &lock[epoch]);
 195 | 
 196 |     #ifdef MPI_SUPPORT
 197 |     if(average != 0) {
 198 |       static_assert(std::is_same<T, long long>::value || std::is_same<T, double>::value, "Allreduce<T> only for T in {long long, double}");
 199 |       MPI_Datatype ty = std::is_same<T, long long>::value ? MPI_LONG_LONG :
 200 |                         std::is_same<T, double>::value ? MPI_DOUBLE :
 201 |                         MPI_Datatype();
 202 |       MPI_Op op = average == 1 ? MPI_SUM :
 203 |                   average == 2 ? MPI_MIN :
 204 |                   average == 3 ? MPI_MAX :
 205 |                   average == 4 ? MPI_SUM : MPI_Op();
 206 |       MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD);
 207 |     }
 208 |     #endif
 209 | 
 210 |     if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads;
 211 |     counter[epoch] = 0;
 212 |     pthread_cond_broadcast(&cond[epoch]);
 213 |   }
 214 |   else {
 215 |     while(counter[epoch] != 0)
 216 |       pthread_cond_wait(&cond[epoch], &lock[epoch]);
 217 |   }
 218 |   pthread_mutex_unlock(&lock[epoch]);
 219 | 
 220 |   *value = accumulator[epoch];
 221 |   epoch ^= 1;
 222 | }
 223 | 
 224 | testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) {
 225 |   int nranks = args->nProcs*args->nGpus*args->nThreads;
 226 |   size_t count = args->expectedBytes/wordSize(type);
 227 | 
 228 |   int64_t *wrongPerGpu = nullptr;
 229 |   CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
 230 | 
 231 |   for (int i=0; i<args->nGpus; i++) {
 232 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 233 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 234 |     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
 235 | 
 236 |     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
 237 | 
 238 | #if 1 && DEBUG_PRINT
 239 |     if (args->reportErrors && wrongPerGpu[i] != 0) {
 240 |       printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
 241 |       char *expectedHost = (char*)malloc(args->expectedBytes);
 242 |       char *dataHost = (char*)malloc(args->expectedBytes);
 243 |       int eltsz = wordSize(type);
 244 |       cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
 245 |       cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
 246 | 
 247 |       for(int j=0; j<args->expectedBytes/eltsz; j++) {
 248 |         unsigned long long want, got;
 249 |         want = 0;
 250 |         memcpy(&want, expectedHost + j*eltsz, eltsz);
 251 |         got = 0;
 252 |         memcpy(&got, dataHost + j*eltsz, eltsz);
 253 |         if(want != got) {
 254 |           printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got);
 255 |         }
 256 |       }
 257 |       free(expectedHost);
 258 |       free(dataHost);
 259 |     }
 260 | #endif
 261 |   }
 262 | 
 263 |   *wrongElts = 0;
 264 |   for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
 265 |   cudaFreeHost(wrongPerGpu);
 266 | 
 267 |   if (args->reportErrors && *wrongElts) args->errors[0]++;
 268 |   return testSuccess;
 269 | }
 270 | 
 271 | testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
 272 |   cudaError_t cudaErr;
 273 |   int remaining = ngpus;
 274 |   int* done = (int*)malloc(sizeof(int)*ngpus);
 275 |   memset(done, 0, sizeof(int)*ngpus);
 276 |   timer tim;
 277 | 
 278 |   while (remaining) {
 279 |    int idle = 1;
 280 |    for (int i=0; i<ngpus; i++) {
 281 |      if (done[i]) continue;
 282 | 
 283 |      cudaErr = cudaStreamQuery(streams[i]);
 284 |      if (cudaErr == cudaSuccess) {
 285 |        done[i] = 1;
 286 |        remaining--;
 287 |        idle = 0;
 288 |        continue;
 289 |      }
 290 | 
 291 |      if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
 292 | 
 293 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
 294 |      if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
 295 |        ncclResult_t ncclAsyncErr;
 296 |        NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
 297 |        if (ncclAsyncErr != ncclSuccess) {
 298 |          // An asynchronous error happened. Stop the operation and destroy
 299 |          // the communicator
 300 |          for (int i=0; i<ngpus; i++)
 301 |            NCCLCHECK(ncclCommAbort(comms[i]));
 302 |          // Abort the perf test
 303 |          NCCLCHECK(ncclAsyncErr);
 304 |        }
 305 |      }
 306 |      double delta = tim.elapsed();
 307 |      if (delta > timeout && timeout > 0) {
 308 |        for (int i=0; i<ngpus; i++)
 309 |          NCCLCHECK(ncclCommAbort(comms[i]));
 310 |        char hostname[1024];
 311 |        getHostName(hostname, 1024);
 312 |        printf("%s: Test timeout (%ds) %s:%d\n",
 313 |            hostname,
 314 |            timeout,
 315 |            __FILE__,__LINE__);
 316 |        free(done);
 317 |        return testTimeout;
 318 |      }
 319 | #endif
 320 |    }
 321 | 
 322 |    // We might want to let other threads (including NCCL threads) use the CPU.
 323 |    if (idle) sched_yield();
 324 |   }
 325 |   free(done);
 326 |   return testSuccess;
 327 | }
 328 | 
 329 | testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) {
 330 |   size_t count = args->nbytes / wordSize(type);
 331 | 
 332 |   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
 333 |   size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
 334 |   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
 335 |   size_t shift = totalnbytes * (iter % steps);
 336 | 
 337 |   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
 338 |   for (int i = 0; i < args->nGpus; i++) {
 339 | #ifndef NCCL_MAJOR
 340 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 341 | #endif
 342 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 343 |     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
 344 |     char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
 345 |     ncclRedOp_t op;
 346 | 
 347 |     if(opIndex < ncclNumOps) {
 348 |       op = opIndex;
 349 |     }
 350 |     #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
 351 |     else {
 352 |       union {
 353 |         int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
 354 |         half f16; float f32; double f64;
 355 |         #if defined(__CUDA_BF16_TYPES_EXIST__)
 356 |         __nv_bfloat16 bf16;
 357 |         #endif
 358 |       };
 359 |       switch(type) {
 360 |       case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
 361 |       case ncclUint8: u8 = ncclVerifiablePremulScalar<uint8_t>(rank); break;
 362 |       case ncclInt32: i32 = ncclVerifiablePremulScalar<int32_t>(rank); break;
 363 |       case ncclUint32: u32 = ncclVerifiablePremulScalar<uint32_t>(rank); break;
 364 |       case ncclInt64: i64 = ncclVerifiablePremulScalar<int64_t>(rank); break;
 365 |       case ncclUint64: u64 = ncclVerifiablePremulScalar<uint64_t>(rank); break;
 366 |       case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
 367 |       case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
 368 |       case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
 369 |       #if defined(__CUDA_BF16_TYPES_EXIST__)
 370 |       case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
 371 |       #endif
 372 |       }
 373 |       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
 374 |     }
 375 |     #endif
 376 | 
 377 |     TESTCHECK(args->collTest->runColl(
 378 |           (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
 379 |           (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
 380 |         count, type, op, root, args->comms[i], args->streams[i]));
 381 | 
 382 |     #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
 383 |     if(opIndex >= ncclNumOps) {
 384 |       NCCLCHECK(ncclRedOpDestroy(op, args->comms[i]));
 385 |     }
 386 |     #endif
 387 |   }
 388 |   if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd());
 389 | 
 390 |   if (blocking_coll) {
 391 |     // Complete op before returning
 392 |     TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
 393 |   }
 394 |   if (blocking_coll) Barrier(args);
 395 |   return testSuccess;
 396 | }
 397 | 
 398 | testResult_t completeColl(struct threadArgs* args) {
 399 |   if (blocking_coll) return testSuccess;
 400 | 
 401 |   TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
 402 |   return testSuccess;
 403 | }
 404 | 
 405 | testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
 406 |   size_t count = args->nbytes / wordSize(type);
 407 |   if (datacheck) {
 408 |     // Initialize sendbuffs, recvbuffs and expected
 409 |     TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
 410 |   }
 411 | 
 412 |   // Sync
 413 |   TESTCHECK(startColl(args, type, op, root, in_place, 0));
 414 |   TESTCHECK(completeColl(args));
 415 | 
 416 |   Barrier(args);
 417 | 
 418 | #if CUDART_VERSION >= 11030
 419 |   cudaGraph_t graphs[args->nGpus];
 420 |   cudaGraphExec_t graphExec[args->nGpus];
 421 |   if (cudaGraphLaunches >= 1) {
 422 |     // Begin cuda graph capture
 423 |     for (int i=0; i<args->nGpus; i++) {
 424 |       // Thread local mdoe is needed for:
 425 |       // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
 426 |       // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
 427 |       //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
 428 |       CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
 429 |     }
 430 |   }
 431 | #endif
 432 | 
 433 |   // Performance Benchmark
 434 |   timer tim;
 435 |   for (int iter = 0; iter < iters; iter++) {
 436 |     if (agg_iters>1) NCCLCHECK(ncclGroupStart());
 437 |     for (int aiter = 0; aiter < agg_iters; aiter++) {
 438 |       TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter));
 439 |     }
 440 |     if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
 441 |   }
 442 | 
 443 | #if CUDART_VERSION >= 11030
 444 |   if (cudaGraphLaunches >= 1) {
 445 |     // End cuda graph capture
 446 |     for (int i=0; i<args->nGpus; i++) {
 447 |       CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
 448 |     }
 449 |     // Instantiate cuda graph
 450 |     for (int i=0; i<args->nGpus; i++) {
 451 |       CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
 452 |     }
 453 |     // Resync CPU, restart timing, launch cuda graph
 454 |     Barrier(args);
 455 |     tim.reset();
 456 |     for (int l=0; l<cudaGraphLaunches; l++) {
 457 |       for (int i=0; i<args->nGpus; i++) {
 458 |         CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
 459 |       }
 460 |     }
 461 |   }
 462 | #endif
 463 | 
 464 |   double cputimeSec = tim.elapsed()/(iters*agg_iters);
 465 |   TESTCHECK(completeColl(args));
 466 | 
 467 |   double deltaSec = tim.elapsed();
 468 |   deltaSec = deltaSec/(iters*agg_iters);
 469 |   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
 470 |   Allreduce(args, &deltaSec, average);
 471 | 
 472 | #if CUDART_VERSION >= 11030
 473 |   if (cudaGraphLaunches >= 1) {
 474 |     //destroy cuda graph
 475 |     for (int i=0; i<args->nGpus; i++) {
 476 |       CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
 477 |       CUDACHECK(cudaGraphDestroy(graphs[i]));
 478 |     }
 479 |   }
 480 | #endif
 481 | 
 482 |   double algBw, busBw;
 483 |   args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
 484 | 
 485 |   Barrier(args);
 486 | 
 487 |   int64_t wrongElts = 0;
 488 |   static __thread int rep = 0;
 489 |   rep++;
 490 |   for (int c = 0; c < datacheck; c++) {
 491 |       // Initialize sendbuffs, recvbuffs and expected
 492 |       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 493 | 
 494 | #if CUDART_VERSION >= 11030
 495 |       if (cudaGraphLaunches >= 1) {
 496 |         // Begin cuda graph capture for data check
 497 |         for (int i=0; i<args->nGpus; i++) {
 498 |           CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
 499 |         }
 500 |       }
 501 | #endif
 502 | 
 503 |       //test validation in single itertion, should ideally be included into the multi-iteration run
 504 |       TESTCHECK(startColl(args, type, op, root, in_place, 0));
 505 | 
 506 | #if CUDART_VERSION >= 11030
 507 |       if (cudaGraphLaunches >= 1) {
 508 |         // End cuda graph capture
 509 |         for (int i=0; i<args->nGpus; i++) {
 510 |           CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
 511 |         }
 512 |         // Instantiate cuda graph
 513 |         for (int i=0; i<args->nGpus; i++) {
 514 |           CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
 515 |         }
 516 |         // Launch cuda graph
 517 |         for (int i=0; i<args->nGpus; i++) {
 518 |           CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
 519 |         }
 520 |       }
 521 | #endif
 522 | 
 523 |       TESTCHECK(completeColl(args));
 524 | 
 525 | #if CUDART_VERSION >= 11030
 526 |       if (cudaGraphLaunches >= 1) {
 527 |         //destroy cuda graph
 528 |         for (int i=0; i<args->nGpus; i++) {
 529 |           CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
 530 |           CUDACHECK(cudaGraphDestroy(graphs[i]));
 531 |         }
 532 |       }
 533 | #endif
 534 | 
 535 |       TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts));
 536 | 
 537 |       //aggregate delta from all threads and procs
 538 |       long long wrongElts1 = wrongElts;
 539 |       //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts);
 540 |       Allreduce(args, &wrongElts1, /*sum*/4);
 541 |       wrongElts = wrongElts1;
 542 |       if (wrongElts) break;
 543 |   }
 544 | 
 545 |   double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
 546 |   char timeStr[100];
 547 |   if (timeUsec >= 10000.0) {
 548 |     sprintf(timeStr, "%7.0f", timeUsec);
 549 |   } else if (timeUsec >= 100.0) {
 550 |     sprintf(timeStr, "%7.1f", timeUsec);
 551 |   } else {
 552 |     sprintf(timeStr, "%7.2f", timeUsec);
 553 |   }
 554 |   if (args->reportErrors) {
 555 |     PRINT("  %7s  %6.2f  %6.2f  %5g", timeStr, algBw, busBw, (double)wrongElts);
 556 |   } else {
 557 |     PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
 558 |   }
 559 | 
 560 |   args->bw[0] += busBw;
 561 |   args->bw_count[0]++;
 562 |   return testSuccess;
 563 | }
 564 | 
 565 | void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
 566 |   int nranks = args->nProcs*args->nGpus*args->nThreads;
 567 |   size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
 568 | 
 569 |   count = size / wordSize(type);
 570 |   args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
 571 | 
 572 |   args->nbytes = paramCount * wordSize(type);
 573 |   args->sendBytes = sendCount * wordSize(type);
 574 |   args->expectedBytes = recvCount * wordSize(type);
 575 |   args->sendInplaceOffset = sendInplaceOffset * wordSize(type);
 576 |   args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
 577 | }
 578 | 
 579 | testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
 580 |   // Sync to avoid first-call timeout
 581 |   Barrier(args);
 582 | 
 583 |   // Warm-up for large size
 584 |   setupArgs(args->maxbytes, type, args);
 585 |   for (int iter = 0; iter < warmup_iters; iter++) {
 586 |     TESTCHECK(startColl(args, type, op, root, 0, iter));
 587 |   }
 588 |   TESTCHECK(completeColl(args));
 589 | 
 590 |   // Warm-up for small size
 591 |   setupArgs(args->minbytes, type, args);
 592 |   for (int iter = 0; iter < warmup_iters; iter++) {
 593 |     TESTCHECK(startColl(args, type, op, root, 0, iter));
 594 |   }
 595 |   TESTCHECK(completeColl(args));
 596 | 
 597 |   // Benchmark
 598 |   for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
 599 |       setupArgs(size, type, args);
 600 |       char rootName[100];
 601 |       sprintf(rootName, "%6i", root);
 602 |       PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
 603 |       TESTCHECK(BenchTime(args, type, op, root, 0));
 604 |       TESTCHECK(BenchTime(args, type, op, root, 1));
 605 |       PRINT("\n");
 606 |   }
 607 |   return testSuccess;
 608 | }
 609 | 
 610 | testResult_t threadRunTests(struct threadArgs* args) {
 611 |   // Set device to the first of our GPUs. If we don't do that, some operations
 612 |   // will be done on the current GPU (by default : 0) and if the GPUs are in
 613 |   // exclusive mode those operations will fail.
 614 |   CUDACHECK(cudaSetDevice(args->gpus[0]));
 615 |   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
 616 |   return testSuccess;
 617 | }
 618 | 
 619 | testResult_t threadInit(struct threadArgs* args) {
 620 |   char hostname[1024];
 621 |   getHostName(hostname, 1024);
 622 |   int nranks =  args->nProcs*args->nThreads*args->nGpus;
 623 | 
 624 |   //set main thread again
 625 |   is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;
 626 | 
 627 |   NCCLCHECK(ncclGroupStart());
 628 |   for (int i=0; i<args->nGpus; i++) {
 629 |     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
 630 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 631 |     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
 632 |   }
 633 |   NCCLCHECK(ncclGroupEnd());
 634 | 
 635 |   TESTCHECK(threadRunTests(args));
 636 | 
 637 |   for (int i=0; i<args->nGpus; i++) {
 638 |     NCCLCHECK(ncclCommDestroy(args->comms[i]));
 639 |   }
 640 |   return testSuccess;
 641 | }
 642 | 
 643 | void* threadLauncher(void* thread_) {
 644 |   struct testThread* thread = (struct testThread*)thread_;
 645 |   thread->ret = thread->func(&thread->args);
 646 |   return NULL;
 647 | }
 648 | testResult_t threadLaunch(struct testThread* thread) {
 649 |   pthread_create(&thread->thread, NULL, threadLauncher, thread);
 650 |   return testSuccess;
 651 | }
 652 | 
 653 | testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
 654 |     CUDACHECK(cudaMalloc(sendbuff, nbytes));
 655 |     CUDACHECK(cudaMalloc(recvbuff, nbytes));
 656 |     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
 657 |     return testSuccess;
 658 | }
 659 | 
 660 | testResult_t run(); // Main function
 661 | 
 662 | int main(int argc, char* argv[]) {
 663 |   // Make sure everyline is flushed so that we see the progress of the test
 664 |   setlinebuf(stdout);
 665 | 
 666 |   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
 667 |     ncclGetVersion(&test_ncclVersion);
 668 |   #else
 669 |     test_ncclVersion = NCCL_VERSION_CODE;
 670 |   #endif
 671 |   //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
 672 |   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0)
 673 |     test_opnum = 4;
 674 |     test_typenum = 9;
 675 |     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
 676 |       test_opnum++; // ncclAvg
 677 |       #if defined(__CUDA_BF16_TYPES_EXIST__)
 678 |         test_typenum++; // bfloat16
 679 |       #endif
 680 |     }
 681 |     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
 682 |       test_opnum++; // PreMulSum
 683 |     }
 684 |   #endif
 685 | 
 686 |   // Parse args
 687 |   double parsed;
 688 |   int longindex;
 689 |   static struct option longopts[] = {
 690 |     {"nthreads", required_argument, 0, 't'},
 691 |     {"ngpus", required_argument, 0, 'g'},
 692 |     {"minbytes", required_argument, 0, 'b'},
 693 |     {"maxbytes", required_argument, 0, 'e'},
 694 |     {"stepbytes", required_argument, 0, 'i'},
 695 |     {"stepfactor", required_argument, 0, 'f'},
 696 |     {"iters", required_argument, 0, 'n'},
 697 |     {"agg_iters", required_argument, 0, 'm'},
 698 |     {"warmup_iters", required_argument, 0, 'w'},
 699 |     {"parallel_init", required_argument, 0, 'p'},
 700 |     {"check", required_argument, 0, 'c'},
 701 |     {"op", required_argument, 0, 'o'},
 702 |     {"datatype", required_argument, 0, 'd'},
 703 |     {"root", required_argument, 0, 'r'},
 704 |     {"blocking", required_argument, 0, 'z'},
 705 |     {"stream_null", required_argument, 0, 'y'},
 706 |     {"timeout", required_argument, 0, 'T'},
 707 |     {"cudagraph", required_argument, 0, 'G'},
 708 |     {"report_cputime", required_argument, 0, 'C'},
 709 |     {"average", required_argument, 0, 'a'},
 710 |     {"help", no_argument, 0, 'h'},
 711 |     {}
 712 |   };
 713 | 
 714 |   while(1) {
 715 |     int c;
 716 |     c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
 717 | 
 718 |     if (c == -1)
 719 |       break;
 720 | 
 721 |     switch(c) {
 722 |       case 't':
 723 |         nThreads = strtol(optarg, NULL, 0);
 724 |         break;
 725 |       case 'g':
 726 |         nGpus = strtol(optarg, NULL, 0);
 727 |         break;
 728 |       case 'b':
 729 |         parsed = parsesize(optarg);
 730 |         if (parsed < 0) {
 731 |           fprintf(stderr, "invalid size specified for 'minbytes'\n");
 732 |           return -1;
 733 |         }
 734 |         minBytes = (size_t)parsed;
 735 |         break;
 736 |       case 'e':
 737 |         parsed = parsesize(optarg);
 738 |         if (parsed < 0) {
 739 |           fprintf(stderr, "invalid size specified for 'maxbytes'\n");
 740 |           return -1;
 741 |         }
 742 |         maxBytes = (size_t)parsed;
 743 |         break;
 744 |       case 'i':
 745 |         stepBytes = strtol(optarg, NULL, 0);
 746 |         break;
 747 |       case 'f':
 748 |         stepFactor = strtol(optarg, NULL, 0);
 749 |         break;
 750 |       case 'n':
 751 |         iters = (int)strtol(optarg, NULL, 0);
 752 |         break;
 753 |       case 'm':
 754 | #if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
 755 |         agg_iters = (int)strtol(optarg, NULL, 0);
 756 | #else
 757 |         fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
 758 | #endif
 759 |         break;
 760 |       case 'w':
 761 |         warmup_iters = (int)strtol(optarg, NULL, 0);
 762 |         break;
 763 |       case 'c':
 764 |         datacheck = (int)strtol(optarg, NULL, 0);
 765 |         break;
 766 |       case 'p':
 767 |         parallel_init = (int)strtol(optarg, NULL, 0);
 768 |         break;
 769 |       case 'o':
 770 |         ncclop = ncclstringtoop(optarg);
 771 |         break;
 772 |       case 'd':
 773 |         nccltype = ncclstringtotype(optarg);
 774 |         break;
 775 |       case 'r':
 776 |         ncclroot = strtol(optarg, NULL, 0);
 777 |         break;
 778 |       case 'z':
 779 |         blocking_coll = strtol(optarg, NULL, 0);
 780 |         break;
 781 |       case 'y':
 782 |         streamnull = strtol(optarg, NULL, 0);
 783 |         break;
 784 |       case 'T':
 785 |         timeout = strtol(optarg, NULL, 0);
 786 |         break;
 787 |       case 'G':
 788 | #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
 789 |         cudaGraphLaunches = strtol(optarg, NULL, 0);
 790 | #else
 791 |         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 792 | #endif
 793 |         break;
 794 |       case 'C':
 795 |         report_cputime = strtol(optarg, NULL, 0);
 796 |         break;
 797 |       case 'a':
 798 |         average = (int)strtol(optarg, NULL, 0);
 799 |         break;
 800 |       case 'h':
 801 |       default:
 802 |         if (c != 'h') printf("invalid option '%c'\n", c);
 803 |         printf("USAGE: %s \n\t"
 804 |             "[-t,--nthreads <num threads>] \n\t"
 805 |             "[-g,--ngpus <gpus per thread>] \n\t"
 806 |             "[-b,--minbytes <min size in bytes>] \n\t"
 807 |             "[-e,--maxbytes <max size in bytes>] \n\t"
 808 |             "[-i,--stepbytes <increment size>] \n\t"
 809 |             "[-f,--stepfactor <increment factor>] \n\t"
 810 |             "[-n,--iters <iteration count>] \n\t"
 811 |             "[-m,--agg_iters <aggregated iteration count>] \n\t"
 812 |             "[-w,--warmup_iters <warmup iteration count>] \n\t"
 813 |             "[-p,--parallel_init <0/1>] \n\t"
 814 |             "[-c,--check <check iteration count>] \n\t"
 815 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
 816 |             "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
 817 | #elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
 818 |             "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
 819 | #else
 820 |             "[-o,--op <sum/prod/min/max/all>] \n\t"
 821 | #endif
 822 |             "[-d,--datatype <nccltype/all>] \n\t"
 823 |             "[-r,--root <root>] \n\t"
 824 |             "[-z,--blocking <0/1>] \n\t"
 825 |             "[-y,--stream_null <0/1>] \n\t"
 826 |             "[-T,--timeout <time in seconds>] \n\t"
 827 |             "[-G,--cudagraph <num graph launches>] \n\t"
 828 |             "[-C,--report_cputime <0/1>] \n\t"
 829 |             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
 830 |             "[-h,--help]\n",
 831 |           basename(argv[0]));
 832 |         return 0;
 833 |     }
 834 |   }
 835 |   if (minBytes > maxBytes) {
 836 |     fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
 837 |            (unsigned long long)minBytes,
 838 |            (unsigned long long)maxBytes);
 839 |     return -1;
 840 |   }
 841 | #ifdef MPI_SUPPORT
 842 |   MPI_Init(&argc, &argv);
 843 | #endif
 844 |   TESTCHECK(run());
 845 |   return 0;
 846 | }
 847 | 
 848 | testResult_t run() {
 849 |   int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
 850 |   int localRank = 0;
 851 |   char hostname[1024];
 852 |   getHostName(hostname, 1024);
 853 | 
 854 | #ifdef MPI_SUPPORT
 855 |   MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
 856 |   MPI_Comm_rank(MPI_COMM_WORLD, &proc);
 857 |   uint64_t hostHashs[totalProcs];
 858 |   hostHashs[proc] = getHostHash(hostname);
 859 |   MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
 860 |   for (int p=0; p<totalProcs; p++) {
 861 |     if (p == proc) break;
 862 |     if (hostHashs[p] == hostHashs[proc]) localRank++;
 863 |   }
 864 | 
 865 |   char* str = getenv("NCCL_TESTS_SPLIT_MASK");
 866 |   uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
 867 |   MPI_Comm mpi_comm;
 868 |   color = proc & mask;
 869 |   MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
 870 |   MPI_Comm_size(mpi_comm, &ncclProcs);
 871 |   MPI_Comm_rank(mpi_comm, &ncclProc);
 872 | #endif
 873 |   is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
 874 | 
 875 |   PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
 876 |         nThreads, nGpus, minBytes, maxBytes,
 877 |         (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
 878 |         warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
 879 |   if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
 880 |   if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
 881 |   PRINT("#\n");
 882 | 
 883 |   PRINT("# Using devices\n");
 884 | #define MAX_LINE 2048
 885 |   char line[MAX_LINE];
 886 |   int len = 0;
 887 |   size_t maxMem = ~0;
 888 |   char* envstr = getenv("NCCL_TESTS_DEVICE");
 889 |   int gpu0 = envstr ? atoi(envstr) : -1;
 890 |   for (int i=0; i<nThreads*nGpus; i++) {
 891 |     int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
 892 |     int rank = proc*nThreads*nGpus+i;
 893 |     cudaDeviceProp prop;
 894 |     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
 895 |     len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
 896 |                     rank, color, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
 897 |     maxMem = std::min(maxMem, prop.totalGlobalMem);
 898 |   }
 899 | 
 900 | #if MPI_SUPPORT
 901 |   char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
 902 |   // Gather all output in rank order to root (0)
 903 |   MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
 904 |   if (proc == 0) {
 905 |     for (int p = 0; p < totalProcs; p++)
 906 |       PRINT("%s", lines+MAX_LINE*p);
 907 |     free(lines);
 908 |   }
 909 |   MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
 910 | #else
 911 |   PRINT("%s", line);
 912 | #endif
 913 | 
 914 |   // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
 915 |   size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
 916 |   if (maxBytes > memMaxBytes) {
 917 |     maxBytes = memMaxBytes;
 918 |     if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
 919 |   }
 920 | 
 921 |   ncclUniqueId ncclId;
 922 |   if (ncclProc == 0) {
 923 |     NCCLCHECK(ncclGetUniqueId(&ncclId));
 924 |   }
 925 | #ifdef MPI_SUPPORT
 926 |   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
 927 |   MPI_Barrier(MPI_COMM_WORLD); // Ensure Bcast is complete for HCOLL
 928 | #endif
 929 |   int gpus[nGpus*nThreads];
 930 |   cudaStream_t streams[nGpus*nThreads];
 931 |   void* sendbuffs[nGpus*nThreads];
 932 |   void* recvbuffs[nGpus*nThreads];
 933 |   void* expected[nGpus*nThreads];
 934 |   size_t sendBytes, recvBytes;
 935 | 
 936 |   ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads);
 937 | 
 938 |   envstr = getenv("NCCL_TESTS_DEVICE");
 939 |   gpu0 = envstr ? atoi(envstr) : -1;
 940 |   for (int i=0; i<nGpus*nThreads; i++) {
 941 |     gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
 942 |     CUDACHECK(cudaSetDevice(gpus[i]));
 943 |     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
 944 |     if (streamnull)
 945 |       streams[i] = NULL;
 946 |     else
 947 |       CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
 948 |   }
 949 | 
 950 |   //if parallel init is not selected, use main thread to initialize NCCL
 951 |   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
 952 |   if (!parallel_init) {
 953 |      if (ncclProcs == 1) {
 954 |        NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
 955 |      } else {
 956 |        NCCLCHECK(ncclGroupStart());
 957 |        for (int i=0; i<nGpus*nThreads; i++) {
 958 |          CUDACHECK(cudaSetDevice(gpus[i]));
 959 |          NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
 960 |        }
 961 |        NCCLCHECK(ncclGroupEnd());
 962 |      }
 963 |   }
 964 | 
 965 |   int errors[nThreads];
 966 |   double bw[nThreads];
 967 |   double* delta;
 968 |   CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
 969 |   int bw_count[nThreads];
 970 |   for (int t=0; t<nThreads; t++) {
 971 |     bw[t] = 0.0;
 972 |     errors[t] = bw_count[t] = 0;
 973 |   }
 974 | 
 975 |   fflush(stdout);
 976 | 
 977 |   const char* timeStr = report_cputime ? "cputime" : "time";
 978 |   PRINT("#\n");
 979 |   PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
 980 |   PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
 981 |       timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
 982 |   PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
 983 |       "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 984 | 
 985 |   struct testThread threads[nThreads];
 986 |   memset(threads, 0, sizeof(struct testThread)*nThreads);
 987 | 
 988 |   for (int t=nThreads-1; t>=0; t--) {
 989 |     threads[t].args.minbytes=minBytes;
 990 |     threads[t].args.maxbytes=maxBytes;
 991 |     threads[t].args.stepbytes=stepBytes;
 992 |     threads[t].args.stepfactor=stepFactor;
 993 |     threads[t].args.localRank = localRank;
 994 | 
 995 |     threads[t].args.totalProcs=totalProcs;
 996 |     threads[t].args.nProcs=ncclProcs;
 997 |     threads[t].args.proc=ncclProc;
 998 |     threads[t].args.nThreads=nThreads;
 999 |     threads[t].args.thread=t;
1000 |     threads[t].args.nGpus=nGpus;
1001 |     threads[t].args.gpus=gpus+t*nGpus;
1002 |     threads[t].args.sendbuffs = sendbuffs+t*nGpus;
1003 |     threads[t].args.recvbuffs = recvbuffs+t*nGpus;
1004 |     threads[t].args.expected = expected+t*nGpus;
1005 |     threads[t].args.ncclId = ncclId;
1006 |     threads[t].args.comms=comms+t*nGpus;
1007 |     threads[t].args.streams=streams+t*nGpus;
1008 | 
1009 |     threads[t].args.errors=errors+t;
1010 |     threads[t].args.bw=bw+t;
1011 |     threads[t].args.bw_count=bw_count+t;
1012 | 
1013 |     threads[t].args.reportErrors = datacheck;
1014 | 
1015 |     threads[t].func = parallel_init ? threadInit : threadRunTests;
1016 |     if (t)
1017 |       TESTCHECK(threadLaunch(threads+t));
1018 |     else
1019 |       TESTCHECK(threads[t].func(&threads[t].args));
1020 |   }
1021 | 
1022 |   // Wait for other threads and accumulate stats and errors
1023 |   for (int t=nThreads-1; t>=0; t--) {
1024 |     if (t) pthread_join(threads[t].thread, NULL);
1025 |     TESTCHECK(threads[t].ret);
1026 |     if (t) {
1027 |       errors[0] += errors[t];
1028 |       bw[0] += bw[t];
1029 |       bw_count[0] += bw_count[t];
1030 |     }
1031 |   }
1032 | 
1033 | #ifdef MPI_SUPPORT
1034 |   MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
1035 | #endif
1036 | 
1037 |   if (!parallel_init) {
1038 |     for(int i=0; i<nGpus*nThreads; ++i)
1039 |       NCCLCHECK(ncclCommDestroy(comms[i]));
1040 |     free(comms);
1041 |   }
1042 | 
1043 |   // Free off CUDA allocated memory
1044 |   for (int i=0; i<nGpus*nThreads; i++) {
1045 |     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
1046 |     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
1047 |     if (datacheck) CUDACHECK(cudaFree(expected[i]));
1048 |   }
1049 |   CUDACHECK(cudaFreeHost(delta));
1050 | 
1051 |   envstr = getenv("NCCL_TESTS_MIN_BW");
1052 |   double check_avg_bw = envstr ? atof(envstr) : -1;
1053 |   bw[0] /= bw_count[0];
1054 | 
1055 |   PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
1056 |   PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
1057 |   PRINT("#\n");
1058 | #ifdef MPI_SUPPORT
1059 |   MPI_Comm_free(&mpi_comm);
1060 |   MPI_Finalize();
1061 | #endif
1062 | 
1063 |   PRINT("%s\n", ncclGetLastError(NULL));
1064 | 
1065 |   // 'cuda-memcheck --leak-check full' requires this
1066 |   cudaDeviceReset();
1067 | 
1068 |   if (errors[0] || bw[0] < check_avg_bw*(0.9))
1069 |     exit(EXIT_FAILURE);
1070 |   else
1071 |     exit(EXIT_SUCCESS);
1072 | }
1073 | 


--------------------------------------------------------------------------------
/nccl-tests/src/common.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | #ifndef __COMMON_H__
  7 | #define __COMMON_H__
  8 | 
  9 | #include "nccl.h"
 10 | #include <stdio.h>
 11 | #include <cstdint>
 12 | #include <algorithm>
 13 | #ifdef MPI_SUPPORT
 14 | #include "mpi.h"
 15 | #endif
 16 | #include <pthread.h>
 17 | #include "nccl1_compat.h"
 18 | #include "timer.h"
 19 | 
 20 | // For nccl.h < 2.13 since we define a weak fallback
 21 | extern "C" char const* ncclGetLastError(ncclComm_t comm);
 22 | 
 23 | #define CUDACHECK(cmd) do {                         \
 24 |   cudaError_t err = cmd;                            \
 25 |   if( err != cudaSuccess ) {                        \
 26 |     char hostname[1024];                            \
 27 |     getHostName(hostname, 1024);                    \
 28 |     printf("%s: Test CUDA failure %s:%d '%s'\n",    \
 29 |          hostname,                                  \
 30 |         __FILE__,__LINE__,cudaGetErrorString(err)); \
 31 |     return testCudaError;                           \
 32 |   }                                                 \
 33 | } while(0)
 34 | 
 35 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
 36 | #define NCCLCHECK(cmd) do {                         \
 37 |   ncclResult_t res = cmd;                           \
 38 |   if (res != ncclSuccess) {                         \
 39 |     char hostname[1024];                            \
 40 |     getHostName(hostname, 1024);                    \
 41 |     printf("%s: Test NCCL failure %s:%d "           \
 42 |            "'%s / %s'\n",                           \
 43 |            hostname,__FILE__,__LINE__,              \
 44 |            ncclGetErrorString(res),                 \
 45 |            ncclGetLastError(NULL));                 \
 46 |     return testNcclError;                           \
 47 |   }                                                 \
 48 | } while(0)
 49 | #else
 50 | #define NCCLCHECK(cmd) do {                         \
 51 |   ncclResult_t res = cmd;                           \
 52 |   if (res != ncclSuccess) {                         \
 53 |     char hostname[1024];                            \
 54 |     getHostName(hostname, 1024);                    \
 55 |     printf("%s: Test NCCL failure %s:%d '%s'\n",    \
 56 |          hostname,                                  \
 57 |         __FILE__,__LINE__,ncclGetErrorString(res)); \
 58 |     return testNcclError;                           \
 59 |   }                                                 \
 60 | } while(0)
 61 | #endif
 62 | 
 63 | typedef enum {
 64 |   testSuccess = 0,
 65 |   testInternalError = 1,
 66 |   testCudaError = 2,
 67 |   testNcclError = 3,
 68 |   testTimeout = 4,
 69 |   testNumResults = 5
 70 | } testResult_t;
 71 | 
 72 | // Relay errors up and trace
 73 | #define TESTCHECK(cmd) do {                         \
 74 |   testResult_t r = cmd;                             \
 75 |   if (r!= testSuccess) {                            \
 76 |     char hostname[1024];                            \
 77 |     getHostName(hostname, 1024);                    \
 78 |     printf(" .. %s pid %d: Test failure %s:%d\n",   \
 79 |          hostname, getpid(),                        \
 80 |         __FILE__,__LINE__);                         \
 81 |     return r;                                       \
 82 |   }                                                 \
 83 | } while(0)
 84 | 
 85 | struct testColl {
 86 |   const char name[20];
 87 |   void (*getCollByteCount)(
 88 |       size_t *sendcount, size_t *recvcount, size_t *paramcount,
 89 |       size_t *sendInplaceOffset, size_t *recvInplaceOffset,
 90 |       size_t count, int nranks);
 91 |   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
 92 |       ncclRedOp_t op, int root, int rep, int in_place);
 93 |   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
 94 |   testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
 95 |       ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 96 | };
 97 | extern struct testColl allReduceTest;
 98 | extern struct testColl allGatherTest;
 99 | extern struct testColl reduceScatterTest;
100 | extern struct testColl broadcastTest;
101 | extern struct testColl reduceTest;
102 | extern struct testColl alltoAllTest;
103 | 
104 | struct testEngine {
105 |   void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
106 |   testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
107 |       const char* typeName, ncclRedOp_t op, const char* opName);
108 | };
109 | 
110 | extern struct testEngine ncclTestEngine;
111 | 
112 | struct threadArgs {
113 |   size_t nbytes;
114 |   size_t minbytes;
115 |   size_t maxbytes;
116 |   size_t stepbytes;
117 |   size_t stepfactor;
118 | 
119 |   int totalProcs;
120 |   int nProcs;
121 |   int proc;
122 |   int nThreads;
123 |   int thread;
124 |   int nGpus;
125 |   int* gpus;
126 |   int localRank;
127 |   void** sendbuffs;
128 |   size_t sendBytes;
129 |   size_t sendInplaceOffset;
130 |   void** recvbuffs;
131 |   size_t recvInplaceOffset;
132 |   ncclUniqueId ncclId;
133 |   ncclComm_t* comms;
134 |   cudaStream_t* streams;
135 | 
136 |   void** expected;
137 |   size_t expectedBytes;
138 |   int* errors;
139 |   double* bw;
140 |   int* bw_count;
141 | 
142 |   int reportErrors;
143 | 
144 |   struct testColl* collTest;
145 | };
146 | 
147 | typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
148 | struct testThread {
149 |   pthread_t thread;
150 |   threadFunc_t func;
151 |   struct threadArgs args;
152 |   testResult_t ret;
153 | };
154 | 
155 | // Provided by common.cu
156 | extern void Barrier(struct threadArgs* args);
157 | extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
158 | extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
159 | extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
160 | extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
161 | 
162 | #include <unistd.h>
163 | 
164 | static void getHostName(char* hostname, int maxlen) {
165 |   gethostname(hostname, maxlen);
166 |   for (int i=0; i< maxlen; i++) {
167 |     if (hostname[i] == '.') {
168 |       hostname[i] = '\0';
169 |       return;
170 |     }
171 |   }
172 | }
173 | 
174 | #include <stdint.h>
175 | 
176 | static uint64_t getHash(const char* string, size_t n) {
177 |   // Based on DJB2a, result = result * 33 ^ char
178 |   uint64_t result = 5381;
179 |   for (size_t c = 0; c < n; c++) {
180 |     result = ((result << 5) + result) ^ string[c];
181 |   }
182 |   return result;
183 | }
184 | 
185 | /* Generate a hash of the unique identifying string for this host
186 |  * that will be unique for both bare-metal and container instances
187 |  * Equivalent of a hash of;
188 |  *
189 |  * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
190 |  *
191 |  */
192 | #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
193 | static uint64_t getHostHash(const char* hostname) {
194 |   char hostHash[1024];
195 | 
196 |   // Fall back is the hostname if something fails
197 |   (void) strncpy(hostHash, hostname, sizeof(hostHash));
198 |   int offset = strlen(hostHash);
199 | 
200 |   FILE *file = fopen(HOSTID_FILE, "r");
201 |   if (file != NULL) {
202 |     char *p;
203 |     if (fscanf(file, "%ms", &p) == 1) {
204 |         strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
205 |         free(p);
206 |     }
207 |   }
208 |   fclose(file);
209 | 
210 |   // Make sure the string is terminated
211 |   hostHash[sizeof(hostHash)-1]='\0';
212 | 
213 |   return getHash(hostHash, strlen(hostHash));
214 | }
215 | 
216 | static size_t wordSize(ncclDataType_t type) {
217 |   switch(type) {
218 |     case ncclChar:
219 | #if NCCL_MAJOR >= 2
220 |     //case ncclInt8:
221 |     case ncclUint8:
222 | #endif
223 |       return 1;
224 |     case ncclHalf:
225 | #if defined(__CUDA_BF16_TYPES_EXIST__)
226 |     case ncclBfloat16:
227 | #endif
228 |     //case ncclFloat16:
229 |       return 2;
230 |     case ncclInt:
231 |     case ncclFloat:
232 | #if NCCL_MAJOR >= 2
233 |     //case ncclInt32:
234 |     case ncclUint32:
235 |     //case ncclFloat32:
236 | #endif
237 |       return 4;
238 |     case ncclInt64:
239 |     case ncclUint64:
240 |     case ncclDouble:
241 |     //case ncclFloat64:
242 |       return 8;
243 |     default: return 0;
244 |   }
245 | }
246 | 
247 | extern int test_ncclVersion; // init'd with ncclGetVersion()
248 | constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
249 | extern int test_opnum;
250 | extern int test_typenum;
251 | extern ncclDataType_t test_types[ncclNumTypes];
252 | extern const char *test_typenames[ncclNumTypes];
253 | extern ncclRedOp_t test_ops[];
254 | extern const char *test_opnames[];
255 | 
256 | static int ncclstringtotype(char *str) {
257 |     for (int t=0; t<ncclNumTypes; t++) {
258 |       if (strcmp(str, test_typenames[t]) == 0) {
259 |         return t;
260 |       }
261 |     }
262 |     if (strcmp(str, "all") == 0) {
263 |       return -1;
264 |     }
265 |     printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
266 |     return ncclFloat;
267 | }
268 | 
269 | static int ncclstringtoop (char *str) {
270 |     for (int o=0; o<test_opnum; o++) {
271 |       if (strcmp(str, test_opnames[o]) == 0) {
272 |         return o;
273 |       }
274 |     }
275 |     if (strcmp(str, "all") == 0) {
276 |       return -1;
277 |     }
278 |     printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
279 |     return ncclSum;
280 | }
281 | 
282 | extern int is_main_proc;
283 | extern thread_local int is_main_thread;
284 | #define PRINT if (is_main_thread) printf
285 | 
286 | #endif
287 | 


--------------------------------------------------------------------------------
/nccl-tests/src/gather.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = count/nranks;
 12 |   *recvcount = (count/nranks)*nranks;
 13 |   *sendInplaceOffset = count/nranks;
 14 |   *recvInplaceOffset = 0;
 15 |   *paramcount = count/nranks;
 16 | }
 17 | 
 18 | testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 22 | 
 23 |   for (int i=0; i<args->nGpus; i++) {
 24 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 25 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 26 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 27 |     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
 28 |     TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
 29 |     CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
 30 |     if (rank == root) {
 31 |       TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
 32 |     }
 33 |     CUDACHECK(cudaDeviceSynchronize());
 34 |   }
 35 |   return testSuccess;
 36 | }
 37 | 
 38 | void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 39 |   double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
 40 | 
 41 |   *algBw = baseBw;
 42 |   double factor = ((double)(nranks-1))/((double)(nranks));
 43 |   *busBw = baseBw * factor;
 44 | }
 45 | 
 46 | testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 47 |   int nRanks;
 48 |   NCCLCHECK(ncclCommCount(comm, &nRanks));
 49 |   int rank;
 50 |   NCCLCHECK(ncclCommUserRank(comm, &rank));
 51 |   size_t rankOffset = count * wordSize(type);
 52 |   if (count == 0) return testSuccess;
 53 | 
 54 |   NCCLCHECK(ncclGroupStart());
 55 |   NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
 56 |   if (rank == root) {
 57 |     for (int r=0; r<nRanks; r++) {
 58 |       NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
 59 |     }
 60 |   }
 61 |   NCCLCHECK(ncclGroupEnd());
 62 | 
 63 |   return testSuccess;
 64 | }
 65 | 
 66 | struct testColl gatherTest = {
 67 |   "Gather",
 68 |   GatherGetCollByteCount,
 69 |   GatherInitData,
 70 |   GatherGetBw,
 71 |   GatherRunColl
 72 | };
 73 | 
 74 | void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 75 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 76 |   GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 77 | }
 78 | 
 79 | testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 80 |   args->collTest = &gatherTest;
 81 |   ncclDataType_t *run_types;
 82 |   const char **run_typenames;
 83 |   int type_count;
 84 |   int begin_root, end_root;
 85 | 
 86 |   if ((int)type != -1) {
 87 |     type_count = 1;
 88 |     run_types = &type;
 89 |     run_typenames = &typeName;
 90 |   } else {
 91 |     type_count = test_typenum;
 92 |     run_types = test_types;
 93 |     run_typenames = test_typenames;
 94 |   }
 95 | 
 96 |   if (root != -1) {
 97 |     begin_root = end_root = root;
 98 |   } else {
 99 |     begin_root = 0;
100 |     end_root = args->nProcs*args->nThreads*args->nGpus-1;
101 |   }
102 | 
103 |   for (int i=0; i<type_count; i++) {
104 |     for (int j=begin_root; j<=end_root; j++) {
105 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
106 |     }
107 |   }
108 |   return testSuccess;
109 | }
110 | 
111 | struct testEngine gatherEngine = {
112 |   GatherGetBuffSize,
113 |   GatherRunTest
114 | };
115 | 
116 | #pragma weak ncclTestEngine=gatherEngine
117 | 


--------------------------------------------------------------------------------
/nccl-tests/src/hypercube.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | #define ALIGN 4
 11 | 
 12 | void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 13 |   size_t base = (count/(ALIGN*nranks))*ALIGN;
 14 |   *sendcount = base;
 15 |   *recvcount = base*nranks;
 16 |   *sendInplaceOffset = base;
 17 |   *recvInplaceOffset = 0;
 18 |   *paramcount = base;
 19 | }
 20 | 
 21 | testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 22 |   size_t sendcount = args->sendBytes / wordSize(type);
 23 |   size_t recvcount = args->expectedBytes / wordSize(type);
 24 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 25 | 
 26 |   for (int i=0; i<args->nGpus; i++) {
 27 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 28 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 29 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 30 |     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
 31 |     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
 32 |     for (int j=0; j<nranks; j++) {
 33 |       TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
 34 |     }
 35 |     CUDACHECK(cudaDeviceSynchronize());
 36 |   }
 37 |   return testSuccess;
 38 | }
 39 | 
 40 | void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 41 |   double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
 42 | 
 43 |   *algBw = baseBw;
 44 |   double factor = 1;
 45 |   *busBw = baseBw * factor;
 46 | }
 47 | 
 48 | testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 49 |   char* sbuff = (char*)sendbuff;
 50 |   char* rbuff = (char*)recvbuff;
 51 |   int nRanks;
 52 |   NCCLCHECK(ncclCommCount(comm, &nRanks));
 53 |   int rank;
 54 |   NCCLCHECK(ncclCommUserRank(comm, &rank));
 55 |   size_t rankSize = count * wordSize(type);
 56 | 
 57 |   if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
 58 | 
 59 |   // Hypercube AllGather
 60 |   for (int mask=1; mask<nRanks; mask<<=1) {
 61 |     NCCLCHECK(ncclGroupStart());
 62 |     int s = rank & ~(mask-1);
 63 |     int r = s ^ mask;
 64 |     NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
 65 |     NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
 66 |     NCCLCHECK(ncclGroupEnd());
 67 |   }
 68 |   return testSuccess;
 69 | }
 70 | 
 71 | struct testColl hyperCubeTest = {
 72 |   "HyperCube",
 73 |   HyperCubeGetCollByteCount,
 74 |   HyperCubeInitData,
 75 |   HyperCubeGetBw,
 76 |   HyperCubeRunColl
 77 | };
 78 | 
 79 | void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 80 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 81 |   HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 82 | }
 83 | 
 84 | testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 85 |   args->collTest = &hyperCubeTest;
 86 |   ncclDataType_t *run_types;
 87 |   const char **run_typenames;
 88 |   int type_count;
 89 | 
 90 |   if ((int)type != -1) {
 91 |     type_count = 1;
 92 |     run_types = &type;
 93 |     run_typenames = &typeName;
 94 |   } else {
 95 |     type_count = test_typenum;
 96 |     run_types = test_types;
 97 |     run_typenames = test_typenames;
 98 |   }
 99 | 
100 |   // Check if this is a power of 2
101 |   int nRanks = args->nProcs*args->nThreads*args->nGpus;
102 |   if (nRanks && !(nRanks & (nRanks - 1))) {
103 |     for (int i=0; i<type_count; i++) {
104 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
105 |     }
106 |   } else {
107 |     printf("nRanks %d is not a power of 2, skipping\n", nRanks);
108 |   }
109 | 
110 |   return testSuccess;
111 | }
112 | 
113 | struct testEngine hyperCubeEngine = {
114 |   HyperCubeGetBuffSize,
115 |   HyperCubeRunTest
116 | };
117 | 
118 | #pragma weak ncclTestEngine=hyperCubeEngine
119 | 


--------------------------------------------------------------------------------
/nccl-tests/src/nccl1_compat.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL1_COMPAT_H
 8 | #define NCCL1_COMPAT_H
 9 | 
10 | #ifndef NCCL_MAJOR // NCCL 1.x
11 | #define NCCL_MAJOR 1
12 | #define NCCL_MINOR 0
13 | 
14 | #define ncclNumOps nccl_NUM_OPS
15 | #define ncclNumTypes nccl_NUM_TYPES
16 | 
17 | static ncclResult_t ncclGroupStart() { return ncclSuccess; }
18 | static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
19 | 
20 | #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
21 | 
22 | static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
23 |     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
24 |   CHECKCOUNT(count);
25 |   return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
26 | }
27 | static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
28 |     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
29 |   CHECKCOUNT(count);
30 |   return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
31 | }
32 | static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
33 |     ncclComm_t comm, cudaStream_t stream) {
34 |   CHECKCOUNT(count);
35 |   return ncclBcast(buff, (int)count, datatype, root, comm, stream);
36 | }
37 | static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
38 |     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
39 |     cudaStream_t stream) {
40 |   CHECKCOUNT(recvcount);
41 |   return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
42 | }
43 | static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
44 |     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
45 |   CHECKCOUNT(sendcount);
46 |   return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
47 | }
48 | #endif
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/nccl-tests/src/reduce.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = count;
 12 |   *recvcount = count;
 13 |   *sendInplaceOffset = 0;
 14 |   *recvInplaceOffset = 0;
 15 |   *paramcount = *sendcount;
 16 | }
 17 | 
 18 | testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 22 | 
 23 |   for (int i=0; i<args->nGpus; i++) {
 24 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 25 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 26 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 27 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 28 |     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
 29 |     CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
 30 |     if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
 31 |     CUDACHECK(cudaDeviceSynchronize());
 32 |   }
 33 |   return testSuccess;
 34 | }
 35 | 
 36 | void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 37 |   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 38 |   *algBw = baseBw;
 39 |   *busBw = baseBw;
 40 | }
 41 | 
 42 | testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 43 |   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
 44 |   return testSuccess;
 45 | }
 46 | 
 47 | struct testColl reduceTest = {
 48 |   "Reduce",
 49 |   ReduceGetCollByteCount,
 50 |   ReduceInitData,
 51 |   ReduceGetBw,
 52 |   ReduceRunColl
 53 | };
 54 | 
 55 | void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 56 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 57 |   ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 58 | }
 59 | 
 60 | testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 61 |   args->collTest = &reduceTest;
 62 |   ncclDataType_t *run_types;
 63 |   ncclRedOp_t *run_ops;
 64 |   const char **run_typenames, **run_opnames;
 65 |   int type_count, op_count;
 66 |   int begin_root, end_root;
 67 | 
 68 |   if ((int)type != -1) {
 69 |     type_count = 1;
 70 |     run_types = &type;
 71 |     run_typenames = &typeName;
 72 |   } else {
 73 |     type_count = test_typenum;
 74 |     run_types = test_types;
 75 |     run_typenames = test_typenames;
 76 |   }
 77 | 
 78 |   if ((int)op != -1) {
 79 |     op_count = 1;
 80 |     run_ops = &op;
 81 |     run_opnames = &opName;
 82 |   } else {
 83 |     op_count = test_opnum;
 84 |     run_ops = test_ops;
 85 |     run_opnames = test_opnames;
 86 |   }
 87 | 
 88 |   if (root != -1) {
 89 |     begin_root = end_root = root;
 90 |   } else {
 91 |     begin_root = 0;
 92 |     end_root = args->nProcs*args->nThreads*args->nGpus-1;
 93 |   }
 94 | 
 95 |   for (int i=0; i<type_count; i++) {
 96 |     for (int j=0; j<op_count; j++) {
 97 |       for (int k=begin_root; k<=end_root; k++) {
 98 |         TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
 99 |       }
100 |     }
101 |   }
102 |   return testSuccess;
103 | }
104 | 
105 | struct testEngine reduceEngine = {
106 |   ReduceGetBuffSize,
107 |   ReduceRunTest
108 | };
109 | 
110 | #pragma weak ncclTestEngine=reduceEngine
111 | 


--------------------------------------------------------------------------------
/nccl-tests/src/reduce_scatter.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | #define ALIGN 4
 11 | 
 12 | void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 13 |   size_t base = (count/(ALIGN*nranks))*ALIGN;
 14 |   *sendcount = base*nranks;
 15 |   *recvcount = base;
 16 |   *sendInplaceOffset = 0;
 17 |   *recvInplaceOffset = base;
 18 |   *paramcount = base;
 19 | }
 20 | 
 21 | testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 22 |   size_t sendcount = args->sendBytes / wordSize(type);
 23 |   size_t recvcount = args->expectedBytes / wordSize(type);
 24 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 25 | 
 26 |   for (int i=0; i<args->nGpus; i++) {
 27 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 28 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 29 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 30 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 31 |     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
 32 |     CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
 33 |     TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
 34 |     CUDACHECK(cudaDeviceSynchronize());
 35 |   }
 36 |   return testSuccess;
 37 | }
 38 | 
 39 | void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 40 |   double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
 41 | 
 42 |   *algBw = baseBw;
 43 |   double factor = ((double)(nranks - 1))/((double)nranks);
 44 |   *busBw = baseBw * factor;
 45 | }
 46 | 
 47 | testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 48 |   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
 49 |   return testSuccess;
 50 | }
 51 | 
 52 | struct testColl reduceScatterTest = {
 53 |   "ReduceScatter",
 54 |   ReduceScatterGetCollByteCount,
 55 |   ReduceScatterInitData,
 56 |   ReduceScatterGetBw,
 57 |   ReduceScatterRunColl
 58 | };
 59 | 
 60 | void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 61 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 62 |   ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 63 | }
 64 | 
 65 | testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 66 |   args->collTest = &reduceScatterTest;
 67 |   ncclDataType_t *run_types;
 68 |   ncclRedOp_t *run_ops;
 69 |   const char **run_typenames, **run_opnames;
 70 |   int type_count, op_count;
 71 | 
 72 |   if ((int)type != -1) {
 73 |     type_count = 1;
 74 |     run_types = &type;
 75 |     run_typenames = &typeName;
 76 |   } else {
 77 |     type_count = test_typenum;
 78 |     run_types = test_types;
 79 |     run_typenames = test_typenames;
 80 |   }
 81 | 
 82 |   if ((int)op != -1) {
 83 |     run_ops = &op;
 84 |     run_opnames = &opName;
 85 |     op_count = 1;
 86 |   } else {
 87 |     op_count = test_opnum;
 88 |     run_ops = test_ops;
 89 |     run_opnames = test_opnames;
 90 |   }
 91 | 
 92 |   for (int i=0; i<type_count; i++) {
 93 |     for (int j=0; j<op_count; j++) {
 94 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
 95 |     }
 96 |   }
 97 |   return testSuccess;
 98 | }
 99 | 
100 | struct testEngine reduceScatterEngine = {
101 |   ReduceScatterGetBuffSize,
102 |   ReduceScatterRunTest
103 | };
104 | 
105 | #pragma weak ncclTestEngine=reduceScatterEngine
106 | 


--------------------------------------------------------------------------------
/nccl-tests/src/scatter.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = (count/nranks)*nranks;
 12 |   *recvcount = count/nranks;
 13 |   *sendInplaceOffset = 0;
 14 |   *recvInplaceOffset = count/nranks;
 15 |   *paramcount = count/nranks;
 16 | }
 17 | 
 18 | testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 | 
 22 |   for (int i=0; i<args->nGpus; i++) {
 23 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 24 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 25 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 26 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 27 |     if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
 28 |     TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
 29 |     CUDACHECK(cudaDeviceSynchronize());
 30 |   }
 31 |   return testSuccess;
 32 | }
 33 | 
 34 | void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 35 |   double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
 36 | 
 37 |   *algBw = baseBw;
 38 |   double factor = ((double)(nranks-1))/((double)(nranks));
 39 |   *busBw = baseBw * factor;
 40 | }
 41 | 
 42 | testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 43 |   int nRanks;
 44 |   NCCLCHECK(ncclCommCount(comm, &nRanks));
 45 |   int rank;
 46 |   NCCLCHECK(ncclCommUserRank(comm, &rank));
 47 |   size_t rankOffset = count * wordSize(type);
 48 |   if (count == 0) return testSuccess;
 49 | 
 50 |   NCCLCHECK(ncclGroupStart());
 51 |   if (rank == root) {
 52 |     for (int r=0; r<nRanks; r++) {
 53 |       NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
 54 |     }
 55 |   }
 56 |   NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
 57 |   NCCLCHECK(ncclGroupEnd());
 58 | 
 59 |   return testSuccess;
 60 | }
 61 | 
 62 | struct testColl scatterTest = {
 63 |   "Scatter",
 64 |   ScatterGetCollByteCount,
 65 |   ScatterInitData,
 66 |   ScatterGetBw,
 67 |   ScatterRunColl
 68 | };
 69 | 
 70 | void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 71 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 72 |   ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 73 | }
 74 | 
 75 | testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 76 |   args->collTest = &scatterTest;
 77 |   ncclDataType_t *run_types;
 78 |   const char **run_typenames;
 79 |   int type_count;
 80 |   int begin_root, end_root;
 81 | 
 82 |   if ((int)type != -1) {
 83 |     type_count = 1;
 84 |     run_types = &type;
 85 |     run_typenames = &typeName;
 86 |   } else {
 87 |     type_count = test_typenum;
 88 |     run_types = test_types;
 89 |     run_typenames = test_typenames;
 90 |   }
 91 | 
 92 |   if (root != -1) {
 93 |     begin_root = end_root = root;
 94 |   } else {
 95 |     begin_root = 0;
 96 |     end_root = args->nProcs*args->nThreads*args->nGpus-1;
 97 |   }
 98 | 
 99 |   for (int i=0; i<type_count; i++) {
100 |     for (int j=begin_root; j<=end_root; j++) {
101 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
102 |     }
103 |   }
104 |   return testSuccess;
105 | }
106 | 
107 | struct testEngine scatterEngine = {
108 |   ScatterGetBuffSize,
109 |   ScatterRunTest
110 | };
111 | 
112 | #pragma weak ncclTestEngine=scatterEngine
113 | 


--------------------------------------------------------------------------------
/nccl-tests/src/sendrecv.cu:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * See LICENSE.txt for license information
  5 |  ************************************************************************/
  6 | 
  7 | #include "cuda_runtime.h"
  8 | #include "common.h"
  9 | 
 10 | void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
 11 |   *sendcount = count;
 12 |   *recvcount = count;
 13 |   *sendInplaceOffset = 0;
 14 |   *recvInplaceOffset = 0;
 15 |   *paramcount = *sendcount;
 16 | }
 17 | 
 18 | testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
 19 |   size_t sendcount = args->sendBytes / wordSize(type);
 20 |   size_t recvcount = args->expectedBytes / wordSize(type);
 21 |   int nranks = args->nProcs*args->nThreads*args->nGpus;
 22 | 
 23 |   for (int i=0; i<args->nGpus; i++) {
 24 |     CUDACHECK(cudaSetDevice(args->gpus[i]));
 25 |     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
 26 |     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
 27 |     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
 28 |     TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
 29 |     int peer = (rank-1+nranks)%nranks;
 30 |     TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
 31 |     CUDACHECK(cudaDeviceSynchronize());
 32 |   }
 33 |   // We don't support in-place sendrecv
 34 |   args->reportErrors = in_place ? 0 : 1;
 35 |   return testSuccess;
 36 | }
 37 | 
 38 | void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
 39 |   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 40 | 
 41 |   *algBw = baseBw;
 42 |   double factor = 1;
 43 |   *busBw = baseBw * factor;
 44 | }
 45 | 
 46 | testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
 47 |   int nRanks;
 48 |   NCCLCHECK(ncclCommCount(comm, &nRanks));
 49 |   int rank;
 50 |   NCCLCHECK(ncclCommUserRank(comm, &rank));
 51 |   int recvPeer = (rank-1+nRanks) % nRanks;
 52 |   int sendPeer = (rank+1) % nRanks;
 53 | 
 54 |   NCCLCHECK(ncclGroupStart());
 55 |   NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
 56 |   NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
 57 |   NCCLCHECK(ncclGroupEnd());
 58 |   return testSuccess;
 59 | }
 60 | 
 61 | struct testColl sendRecvTest = {
 62 |   "SendRecv",
 63 |   SendRecvGetCollByteCount,
 64 |   SendRecvInitData,
 65 |   SendRecvGetBw,
 66 |   SendRecvRunColl
 67 | };
 68 | 
 69 | void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
 70 |   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
 71 |   SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
 72 | }
 73 | 
 74 | testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
 75 |   args->collTest = &sendRecvTest;
 76 |   ncclDataType_t *run_types;
 77 |   ncclRedOp_t *run_ops;
 78 |   const char **run_typenames, **run_opnames;
 79 |   int type_count, op_count;
 80 | 
 81 |   if ((int)type != -1) {
 82 |     type_count = 1;
 83 |     run_types = &type;
 84 |     run_typenames = &typeName;
 85 |   } else {
 86 |     type_count = test_typenum;
 87 |     run_types = test_types;
 88 |     run_typenames = test_typenames;
 89 |   }
 90 | 
 91 |   if ((int)op != -1) {
 92 |     op_count = 1;
 93 |     run_ops = &op;
 94 |     run_opnames = &opName;
 95 |   } else {
 96 |     op_count = test_opnum;
 97 |     run_ops = test_ops;
 98 |     run_opnames = test_opnames;
 99 |   }
100 | 
101 |   for (int i=0; i<type_count; i++) {
102 |     for (int j=0; j<op_count; j++) {
103 |       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
104 |     }
105 |   }
106 |   return testSuccess;
107 | }
108 | 
109 | struct testEngine sendRecvEngine = {
110 |   SendRecvGetBuffSize,
111 |   SendRecvRunTest
112 | };
113 | 
114 | #pragma weak ncclTestEngine=sendRecvEngine
115 | 


--------------------------------------------------------------------------------
/nccl-tests/src/timer.cc:
--------------------------------------------------------------------------------
 1 | #include "timer.h"
 2 | 
 3 | // Make sure to compile this translation unit with the host compiler and not
 4 | // nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
 5 | #include <chrono>
 6 | 
 7 | namespace {
 8 |   std::uint64_t now() {
 9 |     using clock = std::chrono::steady_clock;
10 |     return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
11 |   }
12 | }
13 | 
14 | timer::timer() {
15 |   t0 = now();
16 | }
17 | 
18 | double timer::elapsed() const {
19 |   std::uint64_t t1 = now();
20 |   return 1.e-9*(t1 - t0);
21 | }
22 | 
23 | double timer::reset() {
24 |   std::uint64_t t1 = now();
25 |   double ans = 1.e-9*(t1 - t0);
26 |   t0 = t1;
27 |   return ans;
28 | }
29 | 


--------------------------------------------------------------------------------
/nccl-tests/src/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef _408319ecdd5b47b28bf8f511c4fdf816
 2 | #define _408319ecdd5b47b28bf8f511c4fdf816
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | // Can't include <chrono> because of bug with gcc 10.3.0
 7 | class timer {
 8 |   std::uint64_t t0;
 9 | public:
10 |   timer();
11 |   double elapsed() const;
12 |   double reset();
13 | };
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/nccl-tests/verifiable/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../makefiles/common.mk
 2 | 
 3 | .PHONY: all clean
 4 | 
 5 | BUILDDIR := $(abspath ../../build)
 6 | NCCLDIR := $(BUILDDIR)
 7 | NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
 8 | DST_DIR := $(BUILDDIR)/test/verifiable
 9 | 
10 | all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
11 | 
12 | clean:
13 | 	rm -rf $(DST_DIR)
14 | 
15 | TEST_VERIFIABLE_SRCDIR := .
16 | TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
17 | include verifiable.mk
18 | 
19 | self_test: $(DST_DIR)/self_test
20 | 
21 | $(DST_DIR)/self_test: verifiable.cu verifiable.h
22 | 	@printf "Linking  %s\n" $@
23 | 	@mkdir -p $(DST_DIR)
24 | 	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
25 | 


--------------------------------------------------------------------------------
/nccl-tests/verifiable/inexact_regress.cu:
--------------------------------------------------------------------------------
  1 | /* Generate parameters for our error bound model of floating point average
  2 |  * (sum of scaled values) by sampling sums of random sequences for each
  3 |  * floating point type.
  4 |  *
  5 |  * The model has parameters "coef" and "power", where for two floats a & b,
  6 |  * they are close enough if and only if:
  7 |  *   abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
  8 |  *
  9 |  * Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
 10 |  *
 11 |  * Compile with:
 12 |  *   nvcc -gencode=arch=compute_80,code=sm_80
 13 |  */
 14 | 
 15 | #include <algorithm>
 16 | #include <cmath>
 17 | #include <cstdio>
 18 | #include <cstdint>
 19 | #include <cuda_bf16.h>
 20 | #include <cuda_fp16.h>
 21 | 
 22 | using std::uint64_t;
 23 | using std::uint32_t;
 24 | using bfloat16 = __nv_bfloat16;
 25 | 
 26 | template<typename T>
 27 | struct float_traits;
 28 | 
 29 | template<>
 30 | struct float_traits<float> {
 31 |   static constexpr int mantissa_bits = 23;
 32 |   static constexpr int exponent_bits = 8;
 33 |   using uint_t = uint32_t;
 34 |   __device__ static float make(double x) { return (float)x; }
 35 |   __device__ static float make(uint64_t x) { return (float)x; }
 36 |   __device__ static double todouble(float x) { return x; }
 37 |   __device__ static float add(float a, float b) { return a+b; }
 38 |   __device__ static float mul(float a, float b) { return a*b; }
 39 | };
 40 | template<>
 41 | struct float_traits<double> {
 42 |   static constexpr int mantissa_bits = 52;
 43 |   static constexpr int exponent_bits = 11;
 44 |   using uint_t = uint64_t;
 45 |   __device__ static double make(double x) { return x; }
 46 |   __device__ static double make(uint64_t x) { return (double)x; }
 47 |   __device__ static double todouble(double x) { return x; }
 48 |   __device__ static double add(double a, double b) { return a+b; }
 49 |   __device__ static double mul(double a, double b) { return a*b; }
 50 | };
 51 | template<>
 52 | struct float_traits<half> {
 53 |   static constexpr int mantissa_bits = 10;
 54 |   static constexpr int exponent_bits = 5;
 55 |   using uint_t = uint16_t;
 56 |   __device__ static half make(double x) { return __double2half(x); }
 57 |   __device__ static half make(uint64_t x) { return __int2half_rn(x); }
 58 |   __device__ static double todouble(half x) { return __half2float(x); }
 59 |   __device__ static half add(half a, half b) { return __hadd(a, b); }
 60 |   __device__ static half mul(half a, half b) { return __hmul(a, b); }
 61 | };
 62 | template<>
 63 | struct float_traits<bfloat16> {
 64 |   static constexpr int mantissa_bits = 7;
 65 |   static constexpr int exponent_bits = 8;
 66 |   using uint_t = uint16_t;
 67 |   __device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
 68 |   __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
 69 |   __device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
 70 |   __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
 71 |   __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
 72 | };
 73 | 
 74 | template<typename F>
 75 | __device__ int compare(F a, F b) {
 76 |   union { typename float_traits<F>::uint_t ua; F fa; };
 77 |   union { typename float_traits<F>::uint_t ub; F fb; };
 78 |   ua=0; ub=0;
 79 |   fa=a; fb=b;
 80 |   //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
 81 |   return ua < ub ? ub-ua : ua-ub;
 82 | }
 83 | 
 84 | struct xoshiro256ss {
 85 | 	uint64_t s[4];
 86 |   __device__ xoshiro256ss(int seed) {
 87 |     constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
 88 |     for(int i=0; i < 4; i++)
 89 |       s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
 90 |   }
 91 |   __device__ uint64_t operator()() {
 92 |     auto rol64 = [](uint64_t x, int k) {
 93 |       return (x << k) | (x >> (64 - k));
 94 |     };
 95 |     uint64_t const result = rol64(s[1] * 5, 7) * 9;
 96 |     uint64_t const t = s[1] << 17;
 97 |     s[2] ^= s[0];
 98 |     s[3] ^= s[1];
 99 |     s[1] ^= s[2];
100 |     s[0] ^= s[3];
101 |     s[2] ^= t;
102 |     s[3] = rol64(s[3], 45);
103 |     return result;
104 |   }
105 | };
106 | 
107 | template<typename F>
108 | __global__ void kernel() {
109 |   using traits = float_traits<F>;
110 |   constexpr int samps = 4<<10;
111 |   __shared__ F accf[samps];
112 |   __shared__ double accd[samps];
113 | 
114 |   xoshiro256ss rng(threadIdx.x);
115 |   float expo_avg = 1;
116 |   for(int pass=0; pass < 2; pass++) {
117 |     F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
118 |     int err_max = 0;
119 |     float coef = 0;
120 |     double expo_sum = 0;
121 |     int expo_n = 0;
122 |     int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
123 |     for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
124 |     //for(int round=0; round < 2; round++) {
125 |       for(int i=threadIdx.x; i < samps; i += blockDim.x) {
126 |         accf[i] = 0;
127 |         accd[i] = 0;
128 |       }
129 |       __syncthreads();
130 |       for(int r=0; r < max_ranks; r++) {
131 |         int err = 0;
132 |         for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
133 |           constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
134 |           double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
135 |           F f = traits::make(d);
136 |           accf[i] = traits::add(accf[i], traits::mul(scalar, f));
137 |           accd[i] += traits::todouble(f);
138 |           //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
139 |           int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
140 |           err = err > e ? err : e;
141 |         }
142 |         err = __reduce_max_sync(-1u, err);
143 |         err_max = err_max > err ? err_max : err;
144 |         if (r >= 2) {
145 |           // err = 1 + coef*pow(r,expo)
146 |           float c = float(err-1)/powf(float(r), expo_avg);
147 |           coef = coef > c ? coef : c;
148 |         }
149 |         if (r >= 2) {
150 |           double expo = log2f(1+err_max)/log2f(r);
151 |           expo_sum += expo;
152 |           expo_n++;
153 |           //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
154 |         }
155 |       }
156 |     }
157 |     if(pass==0)
158 |       expo_avg = expo_sum/expo_n;
159 |     else if(threadIdx.x == 0)
160 |       std::printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
161 |   }
162 | }
163 | 
164 | int main() {
165 |   std::printf("type=float:\n");
166 |   kernel<float><<<1,32>>>();
167 |   cudaDeviceSynchronize();
168 | 
169 |   std::printf("\ntype=half:\n");
170 |   kernel<half><<<1,32>>>();
171 |   cudaDeviceSynchronize();
172 | 
173 |   std::printf("\ntype=bfloat16:\n");
174 |   kernel<bfloat16><<<1,32>>>();
175 |   cudaDeviceSynchronize();
176 |   return 0;
177 | }
178 | 


--------------------------------------------------------------------------------
/nccl-tests/verifiable/verifiable.cu:
--------------------------------------------------------------------------------
   1 | #pragma nv_diag_suppress declared_but_not_referenced
   2 | 
   3 | #include "verifiable.h"
   4 | #include <nccl.h>
   5 | 
   6 | #include <cuda_runtime.h>
   7 | #include <cuda_fp16.h>
   8 | #if CUDART_VERSION >= 11000
   9 | #include <cuda_bf16.h>
  10 | #endif
  11 | 
  12 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
  13 |   #define HAVE_ncclBfloat16 1
  14 | #else
  15 |   #define HAVE_ncclBfloat16 0
  16 | #endif
  17 | 
  18 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
  19 |   #define HAVE_ncclAvg 1
  20 | #else
  21 |   #define HAVE_ncclAvg 0
  22 | #endif
  23 | 
  24 | #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
  25 |   #define HAVE_ncclPreMulSum 1
  26 | #else
  27 |   #define HAVE_ncclPreMulSum 0
  28 | #endif
  29 | 
  30 | #include <algorithm>
  31 | #include <cassert>
  32 | #include <cstdio>
  33 | #include <cstdint>
  34 | #include <cmath>
  35 | #include <unistd.h>
  36 | 
  37 | using std::size_t;
  38 | using std::int8_t;
  39 | using std::int16_t;
  40 | using std::int32_t;
  41 | using std::int64_t;
  42 | using std::uint8_t;
  43 | using std::uint16_t;
  44 | using std::uint32_t;
  45 | using std::uint64_t;
  46 | 
  47 | ////////////////////////////////////////////////////////////////////////////////
  48 | 
  49 | namespace {
  50 | template<typename T>
  51 | __device__ unsigned long long bitsOf(T x) {
  52 |   union { unsigned long long ull; T val; } u;
  53 |   u.ull = 0;
  54 |   u.val = x;
  55 |   return u.ull;
  56 | }
  57 | 
  58 | __host__ __device__ uint64_t mixBits(uint64_t x) {
  59 |   union { uint32_t u32[2]; uint64_t u64; };
  60 |   u64 = x;
  61 |   u32[1] += 1;
  62 |   u32[0] ^= u32[1];
  63 |   u64 *= 0x9e3779b97f4a7c13u;
  64 |   u32[0] ^= u32[1]<<16 ^ u32[1]>>16;
  65 |   return u64;
  66 | }
  67 | 
  68 | __host__ __device__ uint64_t hashOf(uint64_t a, uint64_t b=0) {
  69 |   a += uint64_t(1)<<32;
  70 |   a += b;
  71 |   a ^= a>>32;
  72 |   a *= 0x9e3779b97f4a7c13u;
  73 |   a += b>>16 ^ b<<48;
  74 |   a ^= a>>32;
  75 |   a *= 0xc4ceb9fe1a85ec53u;
  76 |   return a;
  77 | }
  78 | }
  79 | 
  80 | ////////////////////////////////////////////////////////////////////////////////
  81 | 
  82 | namespace {
  83 | template<typename T>
  84 | struct IsIntegral: std::is_integral<T> {};
  85 | template<>
  86 | struct IsIntegral<half>: std::false_type {};
  87 | #ifdef __CUDA_BF16_TYPES_EXIST__
  88 | template<>
  89 | struct IsIntegral<__nv_bfloat16>: std::false_type {};
  90 | #endif
  91 | }
  92 | 
  93 | ////////////////////////////////////////////////////////////////////////////////
  94 | 
  95 | // Hide a value from arithmetic optimizations. Hopefully compiler cannot detect
  96 | // that this is equivalent to the identity function.
  97 | template<typename T>
  98 | __host__ __device__ T inhibit(T x) {
  99 |   union { uint64_t u64; T val; };
 100 |   u64 = 0;
 101 |   val = x;
 102 |   u64 *= 0x0000000100000001u;
 103 |   u64 *= 0xffffffff00000001u;
 104 |   return val;
 105 | }
 106 | 
 107 | ////////////////////////////////////////////////////////////////////////////////
 108 | 
 109 | namespace {
 110 |   template<typename Y, typename X>
 111 |   __host__ __device__ Y castTo(X x) {
 112 |     return Y(x);
 113 |   }
 114 |   template<typename Y>
 115 |   __host__ __device__ Y castTo(float x) {
 116 |     return Y(x);
 117 |   }
 118 |   template<>
 119 |   __host__ __device__ half castTo<half>(float x) {
 120 |     return __float2half(x);
 121 |   }
 122 |   #ifdef __CUDA_BF16_TYPES_EXIST__
 123 |   template<>
 124 |   __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
 125 |     return __float2bfloat16(x);
 126 |   }
 127 |   #endif
 128 | }
 129 | 
 130 | ////////////////////////////////////////////////////////////////////////////////
 131 | // The reduction functions
 132 | 
 133 | namespace {
 134 | struct ReduceNil {
 135 |   template<typename T>
 136 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
 137 |   template<typename T>
 138 |   __host__ __device__ T operator()(T a, T /*b*/) const { return a; }
 139 |   template<typename T>
 140 |   __host__ __device__ T postOp(T x) const { return x; }
 141 | };
 142 | struct ReduceSum {
 143 |   template<typename T>
 144 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
 145 |   template<typename T, typename=decltype(T()+T())>
 146 |   __host__ __device__ T operator()(T a, T b) const { return a + b; }
 147 |   __host__ __device__ half operator()(half a, half b) const {
 148 |     #if __CUDA_ARCH__ >= 530
 149 |       return __hadd(a, b);
 150 |     #else
 151 |       return __float2half(__half2float(a) + __half2float(b));
 152 |     #endif
 153 |   }
 154 |   #ifdef __CUDA_BF16_TYPES_EXIST__
 155 |   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
 156 |     #if __CUDA_ARCH__ >= 800
 157 |       return __hadd(a, b);
 158 |     #else
 159 |       return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
 160 |     #endif
 161 |   }
 162 |   #endif
 163 |   template<typename T>
 164 |   __host__ __device__ T postOp(T x) const { return x; }
 165 | };
 166 | struct ReduceProd {
 167 |   template<typename T>
 168 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
 169 |   template<typename T, typename=decltype(T()*T())>
 170 |   __host__ __device__ T operator()(T a, T b) const { return a * b; }
 171 |   __host__ __device__ half operator()(half a, half b) const {
 172 |     #if __CUDA_ARCH__ >= 530
 173 |       return __hmul(a, b);
 174 |     #else
 175 |       return __float2half(__half2float(a) * __half2float(b));
 176 |     #endif
 177 |   }
 178 |   #ifdef __CUDA_BF16_TYPES_EXIST__
 179 |   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
 180 |     #if __CUDA_ARCH__ >= 800
 181 |       return __hmul(a, b);
 182 |     #else
 183 |       return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b));
 184 |     #endif
 185 |   }
 186 |   #endif
 187 |   template<typename T>
 188 |   __host__ __device__ T postOp(T x) const { return x; }
 189 | };
 190 | struct ReduceMin {
 191 |   template<typename T>
 192 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
 193 |   template<typename T, typename=decltype(T()<T())>
 194 |   __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
 195 |   __host__ __device__ half operator()(half a, half b) const {
 196 |     #if __CUDA_ARCH__ >= 800
 197 |       return __hmin(a, b);
 198 |     #elif __CUDA_ARCH__ >= 530
 199 |       return __hlt(a, b) ? a : b;
 200 |     #else
 201 |       return __half2float(a) < __half2float(b) ? a : b;
 202 |     #endif
 203 |   }
 204 |   #ifdef __CUDA_BF16_TYPES_EXIST__
 205 |   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
 206 |     #if __CUDA_ARCH__ >= 800
 207 |       return __hmin(a, b);
 208 |     //#elif __CUDA_ARCH__ >= 530
 209 |     //  return __hlt(a, b) ? a : b;
 210 |     #else
 211 |       return __bfloat162float(a) < __bfloat162float(b) ? a : b;
 212 |     #endif
 213 |   }
 214 |   #endif
 215 |   template<typename T>
 216 |   __host__ __device__ T postOp(T x) const { return x; }
 217 | };
 218 | struct ReduceMax {
 219 |   template<typename T>
 220 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
 221 |   template<typename T, typename=decltype(T()>T())>
 222 |   __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
 223 |   __host__ __device__ half operator()(half a, half b) const {
 224 |     #if __CUDA_ARCH__ >= 800
 225 |       return __hmax(a, b);
 226 |     #elif __CUDA_ARCH__ >= 530
 227 |       return __hgt(a, b) ? a : b;
 228 |     #else
 229 |       return __half2float(a) > __half2float(b) ? a : b;
 230 |     #endif
 231 |   }
 232 |   #ifdef __CUDA_BF16_TYPES_EXIST__
 233 |   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
 234 |     #if __CUDA_ARCH__ >= 800
 235 |       return __hmax(a, b);
 236 |     //#elif __CUDA_ARCH__ >= 530
 237 |     //  return __hgt(a, b) ? a : b;
 238 |     #else
 239 |       return __bfloat162float(a) > __bfloat162float(b) ? a : b;
 240 |     #endif
 241 |   }
 242 |   #endif
 243 |   template<typename T>
 244 |   __host__ __device__ T postOp(T x) const { return x; }
 245 | };
 246 | struct ReducePreMulSum {
 247 |   template<typename T>
 248 |   __host__ __device__ T preOp(T x, int rank_me) const {
 249 |     return ReduceProd()(x, ncclVerifiablePremulScalar<T>(rank_me));
 250 |   }
 251 |   template<typename T>
 252 |   __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
 253 |   template<typename T>
 254 |   __host__ __device__ T postOp(T x) const { return x; }
 255 | };
 256 | 
 257 | template<typename T, bool integral = IsIntegral<T>::value>
 258 | struct ReduceAvg_Base;
 259 | 
 260 | template<typename T>
 261 | struct ReduceAvg_Base<T, /*integral=*/true> {
 262 |   int rank_n;
 263 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
 264 |   __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
 265 |   __host__ __device__ T postOp(T x) const { return x/rank_n; }
 266 | };
 267 | 
 268 | template<typename T>
 269 | struct ReduceAvg_Base<T, /*integral=*/false> {
 270 |   int rank_n;
 271 |   __host__ __device__ T preOp(T x, int /*rank_me*/) const {
 272 |     using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
 273 |     return ReduceProd()(inhibit(castTo<T>(T1(1)/T1(rank_n))), inhibit(x));
 274 |   }
 275 |   __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
 276 |   __host__ __device__ T postOp(T x) const { return x; }
 277 | };
 278 | 
 279 | struct ReduceAvg {
 280 |   int rank_n;
 281 |   template<typename T>
 282 |   __host__ __device__ T preOp(T x, int rank_me) const {
 283 |     return ReduceAvg_Base<T>{rank_n}.preOp(x, rank_me);
 284 |   }
 285 |   template<typename T>
 286 |   __host__ __device__ T operator()(T a, T b) const {
 287 |     return ReduceAvg_Base<T>{rank_n}(a, b);
 288 |   }
 289 |   template<typename T>
 290 |   __host__ __device__ T postOp(T x) const {
 291 |     return ReduceAvg_Base<T>{rank_n}.postOp(x);
 292 |   }
 293 | };
 294 | }
 295 | 
 296 | ////////////////////////////////////////////////////////////////////////////////
 297 | 
 298 | namespace {
 299 | template<typename T>
 300 | struct FloatLayout;
 301 | template<>
 302 | struct FloatLayout<float> {
 303 |   static constexpr int exponent_bits = 8, mantissa_bits = 23;
 304 |   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 305 | };
 306 | template<>
 307 | struct FloatLayout<double> {
 308 |   static constexpr int exponent_bits = 11, mantissa_bits = 52;
 309 |   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 310 | };
 311 | template<>
 312 | struct FloatLayout<half> {
 313 |   static constexpr int exponent_bits = 5, mantissa_bits = 10;
 314 |   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 315 | };
 316 | #ifdef __CUDA_BF16_TYPES_EXIST__
 317 | template<>
 318 | struct FloatLayout<__nv_bfloat16> {
 319 |   static constexpr int exponent_bits = 8, mantissa_bits = 7;
 320 |   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 321 | };
 322 | #endif
 323 | 
 324 | template<typename T>
 325 | __host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
 326 |   union { T ans; uint64_t bits; };
 327 |   bits = sign;
 328 |   bits <<= FloatLayout<T>::exponent_bits;
 329 |   bits |= exp;
 330 |   bits <<= FloatLayout<T>::mantissa_bits;
 331 |   bits |= mant;
 332 |   return ans;
 333 | }
 334 | }
 335 | 
 336 | ////////////////////////////////////////////////////////////////////////////////
 337 | 
 338 | namespace {
 339 | // High bits of multiplcation are useful for generating bounded random values
 340 | // from unbounded random values. For instance, given X a totally random 32-bit
 341 | // integer, `umul32hi(X,n)` will be totally random within [0,n).
 342 | __host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
 343 | #ifdef __CUDA_ARCH__
 344 |   return __umulhi(a, b);
 345 | #else
 346 |   return uint64_t(a)*b >> 32;
 347 | #endif
 348 | }
 349 | __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
 350 | #ifdef __CUDA_ARCH__
 351 |   return __umul64hi(a, b);
 352 | #else
 353 |   return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
 354 | #endif
 355 | }
 356 | 
 357 | __host__ __device__ int clz32(int x) {
 358 | #ifdef __CUDA_ARCH__
 359 |   return __clz(x);
 360 | #else
 361 |   return x==0 ? 32 : __builtin_clz(x);
 362 | #endif
 363 | }
 364 | __host__ __device__ int clz64(long long x) {
 365 | #ifdef __CUDA_ARCH__
 366 |   return __clzll(x);
 367 | #else
 368 |   return x==0 ? 64 : __builtin_clzll(x);
 369 | #endif
 370 | }
 371 | }
 372 | 
 373 | ////////////////////////////////////////////////////////////////////////////////
 374 | 
 375 | namespace {
 376 | // Returns a wildly permuted rank index. Useful when we know we want exactly N
 377 | // random ranks to exhibit some behavior, we can just test if:
 378 | // `shuffleRank(rank_n, rank_me, rng) < N`. Note that rank_n > 0 must be true
 379 | // for well defined results. This mixes the bits of rng.
 380 | __host__ __device__ int shuffleRank(int rank_n, int rank_me, uint64_t &rng) {
 381 |   uint32_t a = uint32_t(rng);
 382 |   uint32_t b = uint32_t(rng>>32);
 383 |   rng = mixBits(rng);
 384 | 
 385 |   uint32_t r = rank_me;
 386 |   // round down rank_n to largest pow2, then subtract 1
 387 |   uint32_t n2 = (~uint32_t(0)>>1) >> clz32(rank_n);
 388 | 
 389 |   // These are 1:1 functions modulo 2^n:
 390 |   //   f(x) = x*a + b : for odd a, any b
 391 |   //   f(x) = (x*x + x)/2
 392 |   // So we apply both to the bottom n2+1 ranks, then rotate the top
 393 |   // (rank_n-n2-1) to the bottom and apply both again.
 394 | 
 395 |   if(r <= n2) {
 396 |     // shuffle bottom n2+1 ranks
 397 |     r = (r*(a|1) + b) & n2;
 398 |     r = (r*r + r)/2 & n2;
 399 |     // rotate top to bottom
 400 |     r += rank_n - (n2+1);
 401 |   }
 402 |   else
 403 |     r -= n2+1; // rotate top to bottom
 404 | 
 405 |   if(r <= n2) {
 406 |     // shuffle bottom n2+1 again
 407 |     r = (r*(b|1) + a) & n2;
 408 |     r = (r*r + r)/2 & n2;
 409 |   }
 410 |   return r;
 411 | }
 412 | }
 413 | 
 414 | namespace {
 415 | // Generate wild integers x and y such that if every rank submits its x into a
 416 | // summation the result will be y with y <= y_max. Ranks should be shuffled
 417 | // before calling.
 418 | template<typename Uint>
 419 | __host__ __device__ void genSumXY(
 420 |     int rank_n, int rank_me, uint64_t &rng, Uint y_max, Uint &x, Uint &y,
 421 |     bool avoid_y=false // if true then returned y will not equal given y
 422 |   ) {
 423 |   static_assert(std::is_unsigned<Uint>::value, "Type must be unsigned integral.");
 424 | 
 425 |   { // Pick y as a random value in [y_max/2, y_max]
 426 |     Uint d, y_min = (y_max+1)/2;
 427 |     if(8*sizeof(Uint) > 32)
 428 |       d = umul64hi(rng, y_max/2 + (avoid_y ? 0 : 1));
 429 |     else
 430 |       d = umul32hi(uint32_t(rng), y_max/2 + (avoid_y ? 0 : 1));
 431 |     Uint y1 = (avoid_y ? y+1 : y_min) + d;
 432 |     y = y1 - (avoid_y && (y1 < y_min || y_max < y1) ? y_max/2 : 0);
 433 |   }
 434 |   rng = mixBits(rng);
 435 | 
 436 |   unsigned r = unsigned(rank_me);
 437 |   unsigned rn = unsigned(rank_n);
 438 |   // Partition our rn ranks into pn distinct subsets each of size rn/pn. If each
 439 |   // rank submits 1+p (where p is 0-based partition index) then the sum be:
 440 |   //   (rn/pn) * pn*(pn+1)/2
 441 |   // So set this equal to our desired sum y and solve for pn.
 442 |   //   (rn/pn) * pn*(pn+1)/2 = y
 443 |   //   rn*(pn+1)/2 = y
 444 |   //   pn = 2*(y/rn)-1
 445 |   Uint pn = rn == 1 ? 1 : 2*(y/rn) - 1;
 446 |   // In the case where rn is huge (compared to y) use only one partition meaning
 447 |   // that all rn ranks will submit 1 (since p=0).
 448 |   pn = pn == 0 ? 1 : pn;
 449 |   // Can't have more partitions than ranks.
 450 |   pn = rn < pn ? rn : pn;
 451 |   // Compute sum of contribution from pn partitions where each submits p+1.
 452 |   Uint p_sum;
 453 |   if(y_max <= ~uint32_t(0)>>1) // compile time known
 454 |     p_sum = Uint(uint32_t(pn)*uint32_t(pn+1)/2);
 455 |   else
 456 |     p_sum = Uint(uint64_t(pn)*uint64_t(pn+1)/2);
 457 |   // Let s be the number of ranks per partition. This is either rn/pn as we
 458 |   // intended, or y/p_sum if that's smaller to prevent overshooting our target y.
 459 |   uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn;
 460 |   x = r/s < pn ? 1 + r/s : 0; //  First s*pn ranks contribute partition index +1.
 461 |   x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy.
 462 | }
 463 | }
 464 | 
 465 | namespace {
 466 | template<typename T>
 467 | __host__ __device__ T genInOutFloatSum(
 468 |     bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 469 |     bool same_sign
 470 |   ) {
 471 |   constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
 472 |   constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
 473 |   using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
 474 |   constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
 475 |   constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
 476 |   uint64_t rng = hashOf(seed, index);
 477 | 
 478 |   int y_sign = rng & 1;
 479 |   int x_sign = y_sign;
 480 |   int xy_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
 481 |   rng = mixBits(rng);
 482 |   rank_me = shuffleRank(rank_n, rank_me, rng);
 483 | 
 484 |   // If we're using mixed signs then partition into evens and odds.
 485 |   int subrank_n = same_sign ? rank_n : (rank_n+1)/2;
 486 |   int subrank_me = same_sign ? rank_me : rank_me/2;
 487 |   uintmant_t x0_mant, y0_mant;
 488 |   genSumXY(subrank_n, subrank_me, rng, max_mant, x0_mant, y0_mant);
 489 | 
 490 |   if (!same_sign && (rank_n+0)/2 != 0) {
 491 |     uintmant_t x1_mant, y1_mant = y0_mant;
 492 |     // Avoid generating y1_mant == y0_mant so we don't have to worry about
 493 |     // signed zero as the result.
 494 |     genSumXY((rank_n+0)/2, rank_me/2, rng, max_mant, x1_mant, y1_mant, /*avoid_y=*/true);
 495 |     y_sign ^= y0_mant < y1_mant ? 1 : 0;
 496 |     y0_mant = (y0_mant < y1_mant ? -1 : 1)*(y0_mant - y1_mant);
 497 |     x_sign ^= rank_me%2;
 498 |     x0_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
 499 |   }
 500 | 
 501 |   uintmant_t ans_mant = input_not_output ? x0_mant : y0_mant;
 502 |   if(ans_mant == 0)
 503 |     return T(0.0f);
 504 |   else {
 505 |     int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
 506 |     int ans_sign = input_not_output ? x_sign : y_sign;
 507 |     int ans_exp = xy_exp - shift;
 508 |     ans_mant <<= shift;
 509 |     return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
 510 |   }
 511 | }
 512 | }
 513 | 
 514 | namespace {
 515 | template<typename T>
 516 | __host__ __device__ T genInOutFloatPreMulSum(
 517 |     bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
 518 |   ) {
 519 |   constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
 520 |   constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
 521 |   using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
 522 |   constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
 523 |   constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
 524 |   uint64_t rng = hashOf(seed, index);
 525 | 
 526 |   int y_sign = rng & 1;
 527 |   int y_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
 528 |   rng = mixBits(rng);
 529 |   int subrank_me0 = shuffleRank((rank_n+1)/2, rank_me/2, rng);
 530 |   int subrank_me1 = shuffleRank((rank_n+0)/2, rank_me/2, rng);
 531 | 
 532 |   // when ncclVerifiablePremulScalar() = 1.0 (rank_me%2 == 0)
 533 |   uintmant_t x0_mant, y0_mant;
 534 |   genSumXY((rank_n+1)/2, subrank_me0, rng, max_mant>>1, x0_mant, y0_mant);
 535 | 
 536 |   // when ncclVerifiablePremulScalar() = 2.0 (rank_me%2 == 1)
 537 |   uintmant_t x1_mant=0, y1_mant=0;
 538 |   if((rank_n+0)/2 != 0)
 539 |     genSumXY((rank_n+0)/2, subrank_me1, rng, max_mant>>2, x1_mant, y1_mant);
 540 | 
 541 |   uintmant_t x_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
 542 |   uintmant_t y_mant = y0_mant + 2*y1_mant;
 543 |   uintmant_t ans_mant = input_not_output ? x_mant : y_mant;
 544 | 
 545 |   if(ans_mant == 0)
 546 |     return T(0.0f);
 547 |   else {
 548 |     int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
 549 |     int ans_sign = y_sign;
 550 |     int ans_exp = y_exp - shift;
 551 |     ans_mant <<= shift;
 552 |     return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
 553 |   }
 554 | }
 555 | }
 556 | 
 557 | namespace {
 558 | template<typename T>
 559 | __host__ __device__ T genInOutFloatProd(
 560 |     bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
 561 |   ) {
 562 |   // Three kinds of contributions (values for x):
 563 |   // 1) x = random value: only one rank does this
 564 |   // 2) x = 2^n: random positive n
 565 |   // 3) x = 1
 566 |   // Since only one rank submits a random value, the result of the product
 567 |   // will have the same mantissa as that value but with an exponent incorporating
 568 |   // the sum of the exponents from case (2)
 569 | 
 570 |   uint64_t rng = hashOf(seed, index);
 571 |   rank_me = shuffleRank(rank_n, rank_me, rng);
 572 |   int y_sign = (rank_n/2)%2;
 573 |   int x_sign = rank_me%2;
 574 | 
 575 |   constexpr unsigned max_exp = -1 + (1<<(FloatLayout<T>::exponent_bits-1));
 576 |   unsigned x_exp=0, y_exp=0;
 577 |   genSumXY(rank_n, rank_me, rng, max_exp, x_exp, y_exp);
 578 |   x_exp += FloatLayout<T>::exponent_bias;
 579 |   y_exp += FloatLayout<T>::exponent_bias;
 580 | 
 581 |   constexpr uint64_t mant_mask = (uint64_t(1)<<FloatLayout<T>::mantissa_bits)-1;
 582 |   uint64_t y_mant = rng & mant_mask;
 583 |   if (y_mant == 0) y_mant = 1;
 584 | 
 585 |   return makeFloat<T>(
 586 |     input_not_output ? x_sign : y_sign,
 587 |     input_not_output ? x_exp : y_exp,
 588 |     !input_not_output || rank_me==0 ? y_mant : 0
 589 |   );
 590 | }
 591 | }
 592 | 
 593 | ////////////////////////////////////////////////////////////////////////////////
 594 | // What follows is lots of overloads for genInput/genOutput to generate data
 595 | 
 596 | namespace {
 597 | // General case for integral data for all ops but ReduceNil/premulsum
 598 | template<typename T, typename ReduceFn,
 599 |          typename = typename std::enable_if<
 600 |              !std::is_same<ReduceFn, ReduceNil>::value
 601 |            >::type>
 602 | __host__ __device__ void genInput(
 603 |     T &ans, ReduceFn, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 604 |     std::true_type /*integral*/
 605 |   ) {
 606 |   (void)rank_n; // silence unused warnings
 607 |   union { uint64_t bits; T tmp; };
 608 |   bits = uint64_t(-1)>>(64 - 8*sizeof(T));
 609 |   bits &= hashOf(index ^ index<<16 ^ rank_me, seed);
 610 |   // make sure we never return 0 in products
 611 |   ans = std::is_same<ReduceFn, ReduceProd>::value && bits == 0 ? T(1) : tmp;
 612 | }
 613 | }
 614 | 
 615 | ////////////////////////////////////////////////////////////////////////////////
 616 | // Dumb/generic case for genOutput just reduces results of genInput
 617 | 
 618 | namespace {
 619 | template<typename T, typename ReduceFn, bool IsIntegral>
 620 | __host__ __device__ void genOutput(
 621 |     T &ans, ReduceFn op, int rank_n, uint64_t seed, intptr_t index,
 622 |     std::integral_constant<bool, IsIntegral>
 623 |   ) {
 624 |   T acc = genInput<T>(op, rank_n, 0, seed, index);
 625 |   acc = op.preOp(acc, 0);
 626 |   for(int r=1; r < rank_n; r++)
 627 |     acc = op(acc, op.preOp(genInput<T>(op, rank_n, r, seed, index), r));
 628 |   ans = op.postOp(acc);
 629 | }
 630 | }
 631 | 
 632 | ////////////////////////////////////////////////////////////////////////////////
 633 | // Nil reduction (byte copy functions). Optimized to assume rank_n=1
 634 | 
 635 | namespace {
 636 | template<typename T, bool IsIntegral>
 637 | __host__ __device__ void genInput(
 638 |     T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 639 |     std::integral_constant<bool, IsIntegral>
 640 |   ) {
 641 |   (void)rank_n, (void)rank_me; // silence unused warnings
 642 |   union { uint64_t bits; T tmp; };
 643 |   bits = mixBits(seed ^ index);
 644 |   bits >>= 64 - 8*sizeof(T);
 645 |   bits &= uint64_t(-1)>>(64 - 8*sizeof(T));
 646 |   ans = tmp;
 647 | }
 648 | 
 649 | template<typename T, typename ReduceFn, bool IsIntegral>
 650 | __host__ __device__ void genOutput(
 651 |     T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
 652 |     std::integral_constant<bool, IsIntegral>
 653 |   ) {
 654 |   ans = genInput<T>(op, rank_n, 0, seed, index);
 655 | }
 656 | }
 657 | 
 658 | ////////////////////////////////////////////////////////////////////////////////
 659 | // Sum of float
 660 | 
 661 | namespace {
 662 | template<typename T>
 663 | __host__ __device__ void genInput(
 664 |     T &ans, ReduceSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 665 |     std::false_type /*integral*/
 666 |   ) {
 667 |   ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/false);
 668 | }
 669 | 
 670 | template<typename T>
 671 | __host__ __device__ void genOutput(
 672 |     T &ans, ReduceSum, int rank_n, uint64_t seed, intptr_t index,
 673 |     std::false_type /*integral*/
 674 |   ) {
 675 |   ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/false);
 676 | }
 677 | }
 678 | 
 679 | ////////////////////////////////////////////////////////////////////////////////
 680 | // Product of float
 681 | 
 682 | namespace {
 683 | template<typename T>
 684 | __host__ __device__ void genInput(
 685 |     T &ans, ReduceProd, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 686 |     std::false_type /*integral*/
 687 |   ) {
 688 |   ans = genInOutFloatProd<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
 689 | }
 690 | 
 691 | template<typename T>
 692 | __host__ __device__ void genOutput(
 693 |     T &ans, ReduceProd, int rank_n, uint64_t seed, intptr_t index,
 694 |     std::false_type /*integral*/
 695 |   ) {
 696 |   ans = genInOutFloatProd<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
 697 | }
 698 | }
 699 | 
 700 | ////////////////////////////////////////////////////////////////////////////////
 701 | // PreMulSum of int/float
 702 | 
 703 | namespace {
 704 | template<typename T>
 705 | __host__ __device__ void genInput(
 706 |     T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 707 |     std::true_type integral
 708 |   ) {
 709 |   genInput(ans, ReduceSum(), rank_n, rank_me, seed, index, integral);
 710 | }
 711 | 
 712 | // No genOutput overload specific to premulsum(int), just use generic case.
 713 | 
 714 | template<typename T>
 715 | __host__ __device__ void genInput(
 716 |     T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 717 |     std::false_type /*integral*/
 718 |   ) {
 719 |   ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
 720 | }
 721 | 
 722 | template<typename T>
 723 | __host__ __device__ void genOutput(
 724 |     T &ans, ReducePreMulSum, int rank_n, uint64_t seed, intptr_t index,
 725 |     std::false_type /*integral*/
 726 |   ) {
 727 |   ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
 728 | }
 729 | }
 730 | 
 731 | /////////////////////////////////////////////////////////////////////////////////
 732 | // Average of float
 733 | 
 734 | namespace {
 735 | template<typename T>
 736 | __host__ __device__ void genInput(
 737 |     T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 738 |     std::false_type /*integral*/
 739 |   ) {
 740 |   ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
 741 | }
 742 | 
 743 | template<typename T>
 744 | __host__ __device__ void genOutput(
 745 |     T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
 746 |     std::false_type /*integral*/
 747 |   ) {
 748 |   ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
 749 |   using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
 750 |   ans = ReduceProd()(ans, T1(1)/T1(rank_n));
 751 | }
 752 | }
 753 | 
 754 | /////////////////////////////////////////////////////////////////////////////////
 755 | // min/max of float
 756 | 
 757 | namespace {
 758 | template<typename T>
 759 | __host__ __device__ void genInput(
 760 |     T &ans, ReduceMin, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 761 |     std::false_type integral
 762 |   ) {
 763 |   genInput<T>(ans, ReduceMax(), rank_n, rank_me, seed, index, integral);
 764 | }
 765 | template<typename T>
 766 | __host__ __device__ void genInput(
 767 |     T &ans, ReduceMax, int rank_n, int rank_me, uint64_t seed, intptr_t index,
 768 |     std::false_type /*integral*/
 769 |   ) {
 770 |   (void)rank_n; // silence unused warnings
 771 |   constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
 772 |   uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
 773 |   int sign = rng & 1;
 774 |   rng ^= rng>>1;
 775 |   int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
 776 |   exp += 1<<(FloatLayout<T>::exponent_bits-2);
 777 |   rng ^= rng >> FloatLayout<T>::exponent_bits;
 778 |   uint64_t mant = rng & mant_mask;
 779 |   ans = makeFloat<T>(sign, exp, mant);
 780 | }
 781 | 
 782 | // No genOutput overload specific to floating point min/max, just use generic case.
 783 | }
 784 | 
 785 | ///////////////////////////////////////////////////////////////////////////////
 786 | // Entry API for genInput/genOutput
 787 | 
 788 | namespace {
 789 | template<typename T, typename ReduceFn>
 790 | __host__ __device__ T genInput(
 791 |     ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t index
 792 |   ) {
 793 |   T ans;
 794 |   genInput(ans, op, rank_n, rank_me, seed, index,
 795 |     std::integral_constant<bool, IsIntegral<T>::value>());
 796 |   return ans;
 797 | }
 798 | 
 799 | template<typename T, typename ReduceFn>
 800 | __host__ __device__ T genOutput(
 801 |     ReduceFn op, int rank_n, uint64_t seed, intptr_t index
 802 |   ) {
 803 |   T ans;
 804 |   genOutput(ans, op, rank_n, seed, index,
 805 |     std::integral_constant<bool, IsIntegral<T>::value>());
 806 |   return ans;
 807 | }
 808 | }
 809 | 
 810 | ////////////////////////////////////////////////////////////////////////////////
 811 | 
 812 | #if !SELF_TEST
 813 | namespace {
 814 | template<typename T, typename ReduceFn>
 815 | __global__ void prepareInput2(
 816 |     T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
 817 |     uint64_t seed, intptr_t elt_ix0
 818 |   ) {
 819 |   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
 820 |   i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
 821 |   intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
 822 |   i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
 823 |   intptr_t i = i0 + threadIdx.x;
 824 |   while(i < i1) {
 825 |     elts[i] = genInput<T>(op, rank_n, rank_me, seed, elt_ix0+i);
 826 |     #if 0
 827 |     T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
 828 |     printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
 829 |       std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
 830 |     #endif
 831 |     i += blockDim.x;
 832 |   }
 833 | }
 834 | 
 835 | template<typename ReduceOp>
 836 | void prepareInput1(
 837 |     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
 838 |     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 839 |   ) {
 840 |   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
 841 |   #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
 842 |   switch(elt_ty) {
 843 |   case ncclInt8: CASE_TY(int8_t)
 844 |   case ncclUint8: CASE_TY(uint8_t)
 845 |   case ncclInt32: CASE_TY(int32_t)
 846 |   case ncclUint32: CASE_TY(uint32_t)
 847 |   case ncclInt64: CASE_TY(int64_t)
 848 |   case ncclUint64: CASE_TY(uint64_t)
 849 |   case ncclFloat16: CASE_TY(half)
 850 |   #if HAVE_ncclBfloat16
 851 |   case ncclBfloat16: CASE_TY(__nv_bfloat16)
 852 |   #endif
 853 |   case ncclFloat32: CASE_TY(float)
 854 |   case ncclFloat64: CASE_TY(double)
 855 |   default: assert(0);
 856 |   }
 857 |   #undef CASE_TY
 858 | }
 859 | }
 860 | 
 861 | void ncclVerifiablePrepareInput(
 862 |     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
 863 |     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 864 |   ) {
 865 |   #define CASE_OP(op) \
 866 |     if(rank_n == 1) \
 867 |       prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
 868 |     else \
 869 |       prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
 870 |     break;
 871 |   switch(red_op) {
 872 |   case ncclSum: CASE_OP(ReduceSum())
 873 |   case ncclMin: CASE_OP(ReduceMin())
 874 |   case ncclMax: CASE_OP(ReduceMax())
 875 |   case ncclProd: CASE_OP(ReduceProd())
 876 |   #if HAVE_ncclAvg
 877 |   case ncclAvg: CASE_OP(ReduceAvg{rank_n})
 878 |   #endif
 879 |   #if HAVE_ncclPreMulSum
 880 |   default: CASE_OP(ReducePreMulSum())
 881 |   #endif
 882 |   }
 883 |   #undef CASE_OP
 884 | }
 885 | #endif
 886 | 
 887 | ////////////////////////////////////////////////////////////////////////////////
 888 | 
 889 | #if !SELF_TEST
 890 | namespace {
 891 | template<typename T, typename ReduceFn>
 892 | __global__ void prepareExpected2(
 893 |     T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
 894 |     uint64_t seed, intptr_t elt_ix0
 895 |   ) {
 896 |   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
 897 |   i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
 898 |   intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
 899 |   i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
 900 |   intptr_t i = i0 + threadIdx.x;
 901 |   while(i < i1) {
 902 |     elts[i] = genOutput<T>(op, rank_n, seed, elt_ix0+i);
 903 |     #if 0
 904 |     printf("prepareExpected2 seed=0x%llx ix=%lld x=%g elts=%p\n",
 905 |       (long long)seed, (long long)(elt_ix0+i), (float)elts[i], elts);
 906 |     #endif
 907 |     i += blockDim.x;
 908 |   }
 909 | }
 910 | 
 911 | template<typename ReduceOp>
 912 | void prepareExpected1(
 913 |     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
 914 |     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 915 |   ) {
 916 |   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
 917 |   #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
 918 |   switch(elt_ty) {
 919 |   case ncclInt8: CASE_TY(int8_t)
 920 |   case ncclUint8: CASE_TY(uint8_t)
 921 |   case ncclInt32: CASE_TY(int32_t)
 922 |   case ncclUint32: CASE_TY(uint32_t)
 923 |   case ncclInt64: CASE_TY(int64_t)
 924 |   case ncclUint64: CASE_TY(uint64_t)
 925 |   case ncclFloat16: CASE_TY(half)
 926 |   #if HAVE_ncclBfloat16
 927 |   case ncclBfloat16: CASE_TY(__nv_bfloat16)
 928 |   #endif
 929 |   case ncclFloat32: CASE_TY(float)
 930 |   case ncclFloat64: CASE_TY(double)
 931 |   default: assert(0);
 932 |   }
 933 |   #undef CASE_TY
 934 | }
 935 | }
 936 | 
 937 | void ncclVerifiablePrepareExpected(
 938 |     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
 939 |     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 940 |   ) {
 941 |   #define CASE_OP(op) \
 942 |     if(rank_n == 1) \
 943 |       prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
 944 |     else \
 945 |       prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
 946 |     break;
 947 |   switch(red_op) {
 948 |   case ncclSum: CASE_OP(ReduceSum())
 949 |   case ncclMin: CASE_OP(ReduceMin())
 950 |   case ncclMax: CASE_OP(ReduceMax())
 951 |   case ncclProd: CASE_OP(ReduceProd())
 952 |   #if HAVE_ncclAvg
 953 |   case ncclAvg: CASE_OP(ReduceAvg{rank_n})
 954 |   #endif
 955 |   #if HAVE_ncclPreMulSum
 956 |   default: CASE_OP(ReducePreMulSum())
 957 |   #endif
 958 |   }
 959 |   #undef CASE_OP
 960 | }
 961 | #endif
 962 | 
 963 | ////////////////////////////////////////////////////////////////////////////////
 964 | 
 965 | namespace {
 966 | /* How we compare floating point values when exactness is impossible is interesting.
 967 |  * First, we take note that simply reinterpreting integer bits as floating point
 968 |  * gives us a monotonic mapping which exponentially spaces out floats. Thus
 969 |  * consecutive integers encode consecutive floats. In general, using integer
 970 |  * subraction on the bitpatterns of two floats gives us an integer which is the
 971 |  * logarithm of their relative difference. But, if the floats always have similar
 972 |  * exponents, than the integer difference is actually proportional to the
 973 |  * relative error (this is because we are counting hops in the mantissa bits only,
 974 |  * not the exponent bits). So a cheap way to compare if two floats are relatively
 975 |  * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula
 976 |  * calculates such a tolerance for a summation of n floats. This formula
 977 |  * was derived by inspecting the maximum observed integer difference over many
 978 |  * random runs of summation. The parameter values were computed by the
 979 |  * companion program "inexact_regress.cu".
 980 |  */
 981 | __host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
 982 |   float power, coef;
 983 |   switch(elt_ty) {
 984 |   case ncclFloat32:
 985 |   case ncclFloat64:
 986 |     power = .51f;
 987 |     coef = 1.25f;
 988 |     break;
 989 |   case ncclFloat16:
 990 |     power = .91f;
 991 |     coef = .75f;
 992 |     break;
 993 |   #if HAVE_ncclBfloat16
 994 |   case ncclBfloat16:
 995 |     power = .91f;
 996 |     coef = .66f;
 997 |     break;
 998 |   #endif
 999 |   }
1000 |   #if __CUDA_ARCH__
1001 |     return 1 + unsigned(coef*powf(float(rank_n), power));
1002 |   #else
1003 |     return 1 + unsigned(coef*std::pow(float(rank_n), power));
1004 |   #endif
1005 | }
1006 | 
1007 | template<typename T>
1008 | __host__ __device__  uint64_t calcDelta(T a, T b) {
1009 |   union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
1010 |   x.t = a;
1011 |   y.t = b;
1012 |   switch(sizeof(T)) {
1013 |   case 1:  return x.i1 < y.i1 ? y.i1 - x.i1 : x.i1 - y.i1;
1014 |   case 2:  return x.i2 < y.i2 ? y.i2 - x.i2 : x.i2 - y.i2;
1015 |   case 4:  return x.i4 < y.i4 ? y.i4 - x.i4 : x.i4 - y.i4;
1016 |   default: return x.i8 < y.i8 ? y.i8 - x.i8 : x.i8 - y.i8;
1017 |   }
1018 | }
1019 | }
1020 | 
1021 | ////////////////////////////////////////////////////////////////////////////////
1022 | 
1023 | #if !SELF_TEST
1024 | namespace {
1025 | template<typename T>
1026 | __global__ void verifyPrepared(
1027 |     T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
1028 |   ) {
1029 |   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
1030 |   i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
1031 |   intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
1032 |   i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
1033 |   intptr_t i = i0 + threadIdx.x;
1034 |   int64_t bad = 0;
1035 | 
1036 |   while(i < i1) {
1037 |     T a = results[i], b = expected[i];
1038 |     T delta = a < b ? b - a : a - b;
1039 |     bad += tolerance < delta ? 1 : 0;
1040 |     #if 0
1041 |       if(tolerance < delta) {
1042 |         printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
1043 |       }
1044 |     #endif
1045 |     i += blockDim.x;
1046 |   }
1047 |   asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
1048 | }
1049 | 
1050 | template<typename T, typename Uint, typename ReduceFn>
1051 | __global__ void verifyInline2(
1052 |     T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
1053 |     intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
1054 |   ) {
1055 |   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
1056 |   i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
1057 |   intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
1058 |   i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
1059 |   intptr_t i = i0 + threadIdx.x;
1060 |   int64_t bad = 0;
1061 | 
1062 |   while(i < i1) {
1063 |     union { T t; Uint u; } a, b;
1064 |     a.t = results[i];
1065 |     b.t = genOutput<T>(op, rank_n, seed, elt_ix0+i);
1066 |     Uint delta = a.u < b.u ? b.u - a.u : a.u - b.u;
1067 |     bad += tolerance < delta ? 1 : 0;
1068 |     #if 0
1069 |       T input = genInput<T>(op, rank_n, 0, seed, elt_ix0+i);
1070 |       if(tolerance < delta) {
1071 |         printf("verifyInline2 fail T=%d ix=%lld got=%g exp=%g input=%g\n",
1072 |           std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
1073 |       } else {
1074 |         printf("verifyInline2 pass T=%d ix=%lld got=%g exp=%g input=%g\n",
1075 |           std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
1076 |       }
1077 |     #endif
1078 |     i += blockDim.x;
1079 |   }
1080 |   asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
1081 | }
1082 | 
1083 | template<typename T, typename Uint>
1084 | void verifyInline1(
1085 |     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
1086 |     unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
1087 |   ) {
1088 |   #define CASE_OP(op) \
1089 |     if(rank_n == 1) \
1090 |     verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
1091 |       ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
1092 |     else \
1093 |     verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
1094 |       ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
1095 |     break;
1096 |   switch(red_op) {
1097 |   case ncclSum: CASE_OP(ReduceSum())
1098 |   case ncclMin: CASE_OP(ReduceMin())
1099 |   case ncclMax: CASE_OP(ReduceMax())
1100 |   case ncclProd: CASE_OP(ReduceProd())
1101 |   #if HAVE_ncclAvg
1102 |   case ncclAvg: CASE_OP(ReduceAvg{rank_n})
1103 |   #endif
1104 |   #if HAVE_ncclPreMulSum
1105 |   default: CASE_OP(ReducePreMulSum())
1106 |   #endif
1107 |   }
1108 |   #undef CASE_OP
1109 | }
1110 | }
1111 | 
1112 | void ncclVerifiableVerify(
1113 |     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
1114 |     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
1115 |     int64_t *bad_elt_n, cudaStream_t stream
1116 |   ) {
1117 |   bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
1118 |   #if HAVE_ncclBfloat16
1119 |     floating |= elt_ty == ncclBfloat16;
1120 |   #endif
1121 | 
1122 |   unsigned tolerance = 0;
1123 |   #if HAVE_ncclAvg
1124 |   if (floating && red_op == ncclAvg)
1125 |     tolerance = calcSumFloatTolerance(rank_n, elt_ty);
1126 |   #endif
1127 | 
1128 |   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
1129 | 
1130 |   *bad_elt_n = 0;
1131 |   #define CASE_TY(T, Uint) { \
1132 |       if(expected != nullptr) { \
1133 |         verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
1134 |       } else { \
1135 |         verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
1136 |       } \
1137 |     } break;
1138 |   switch(elt_ty) {
1139 |   case ncclInt8: CASE_TY(int8_t, uint8_t)
1140 |   case ncclUint8: CASE_TY(uint8_t, uint8_t)
1141 |   case ncclInt32: CASE_TY(int32_t, uint32_t)
1142 |   case ncclUint32: CASE_TY(uint32_t, uint32_t)
1143 |   case ncclInt64: CASE_TY(int64_t, uint64_t)
1144 |   case ncclUint64: CASE_TY(uint64_t, uint64_t)
1145 |   case ncclFloat16: CASE_TY(half, uint16_t)
1146 |   #if HAVE_ncclBfloat16
1147 |   case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
1148 |   #endif
1149 |   case ncclFloat32: CASE_TY(float, uint32_t)
1150 |   case ncclFloat64: CASE_TY(double, uint64_t)
1151 |   default: assert(0);
1152 |   }
1153 |   #undef CASE_TY
1154 | }
1155 | #endif
1156 | 
1157 | ////////////////////////////////////////////////////////////////////////////////
1158 | 
1159 | #if SELF_TEST
1160 | #include <iostream>
1161 | 
1162 | template<typename T, typename Op>
1163 | __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
1164 |   //if(!std::is_same<T,half>::value) return;
1165 |   //if(!std::is_same<Op,ReduceProd>::value) return;
1166 |   //if(rank_n!=3) return;
1167 | 
1168 |   unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
1169 |   uint64_t seed = 0xc8e2bed69766d533;
1170 | 
1171 |   for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
1172 |     //if(ix!=387) continue;
1173 |     T y = genOutput<T>(op, rank_n, seed, ix);
1174 |     T sum;
1175 |     for(int r=0; r < rank_n; r++) {
1176 |       T x = genInput<T>(op, rank_n, r, seed, ix);
1177 |       x = op.preOp(x, r);
1178 |       sum = r==0 ? x : op(sum, inhibit(x));
1179 |       //std::printf("x = %llx, sum = %llx\n", bitsOf(x), bitsOf(sum));
1180 |     }
1181 |     sum = op.postOp(sum);
1182 |     if(tolerance < calcDelta(sum, y)) {
1183 |       std::printf(
1184 |         //"%10g != %10g  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
1185 |         "%llx != %llx  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
1186 |         *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
1187 |       );
1188 |     }
1189 |   }
1190 | }
1191 | 
1192 | template<typename T>
1193 | __device__ void sweep1(int ty, char const *tyname) {
1194 |   for(int i=0; i < 10; i++) {
1195 |     int rank_n = (1<<i) + i;
1196 |     sweep2<T>(ty, tyname, ReduceSum(), "sum", rank_n);
1197 |     sweep2<T>(ty, tyname, ReduceProd(), "prod", rank_n);
1198 |     sweep2<T>(ty, tyname, ReduceMin(), "min", rank_n);
1199 |     sweep2<T>(ty, tyname, ReduceMax(), "max", rank_n);
1200 |     sweep2<T>(ty, tyname, ReducePreMulSum(), "premulsum", rank_n);
1201 |     sweep2<T>(ty, tyname, ReduceAvg{rank_n}, "avg", rank_n);
1202 |   }
1203 | }
1204 | 
1205 | __global__ void sweep() {
1206 |   sweep1<int8_t>(ncclInt8, "int8");
1207 |   sweep1<uint8_t>(ncclUint8, "uint8");
1208 |   sweep1<int32_t>(ncclInt32, "int32");
1209 |   sweep1<uint32_t>(ncclUint32, "uint32");
1210 |   sweep1<int64_t>(ncclInt64, "int64");
1211 |   sweep1<uint64_t>(ncclUint64, "uint64");
1212 |   sweep1<half>(ncclFloat16, "half");
1213 |   #if HAVE_ncclBfloat16
1214 |     sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
1215 |   #endif
1216 |   sweep1<float>(ncclFloat32, "float");
1217 |   sweep1<double>(ncclFloat64, "double");
1218 | }
1219 | 
1220 | int main(int arg_n, char **args) {
1221 |   std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
1222 |   cudaSetDevice(0);
1223 |   sweep<<<1,512>>>();
1224 |   cudaDeviceSynchronize();
1225 |   return 0;
1226 | }
1227 | #endif
1228 | 


--------------------------------------------------------------------------------
/nccl-tests/verifiable/verifiable.h:
--------------------------------------------------------------------------------
 1 | #ifndef _d41d8cd98f00b204e9800998ecf8427e
 2 | #define _d41d8cd98f00b204e9800998ecf8427e
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | #include <stdint.h>
 7 | 
 8 | /* Routines for launching kernels that verify reduction results. A significant
 9 |  * feature of these routines is they carefully craft floating point input
10 |  * to produce exactly predictable output.
11 |  *
12 |  * int elt_ty: actually just a ncclDataType_t
13 |  *
14 |  * int red_op: mostly just a  ncclRedOp_t. Since PreMulSum ops are dynamically
15 |  * created, these are encoded as the value ncclNumOps and their scalar is
16 |  * assumed to be `ncclVerifiablePremulScalar(rank_me)`
17 |  *
18 |  * uint64_t seed: arbitrary 64-bits to use in seeding the random values
19 |  *
20 |  * intptr_t elt_ix0: index of first element pointed to by elts when generating
21 |  * random values. This makes it possible to generate subsequences independently
22 |  * as well as in aggregate.
23 |  *
24 |  * int rank_n: Number of contributions into the reduction. Non-reduction
25 |  * collectives like broadcast, gather, etc will always set this to one.
26 |  *
27 |  * int rank_me: Index of this contribution
28 |  */
29 | 
30 | // Use this as the local scalar for PreMulSum ops
31 | template<typename T>
32 | __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
33 |   return T(rank_me%2 == 0 ? 1.0f : 2.0f);
34 | }
35 | 
36 | // Enqueue kernel to generate data which is to be reduced.
37 | void ncclVerifiablePrepareInput(
38 |   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
39 |   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
40 | );
41 | 
42 | // Enqueue kernel to generate expected results of reduction.
43 | void ncclVerifiablePrepareExpected(
44 |   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
45 |   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
46 | );
47 | 
48 | // Enqueue kernel to verify reduced data matches expectation. The number of
49 | // failed elements is written to bad_elt_n which must be in cudaHost memory.
50 | // If `expected == nullptr` then the expected results are generated on-the-fly
51 | // which can be costly. Thus if you plan to run the same reduction multiple
52 | // times it is advantageous to precompute the expected values with
53 | // ncclVerifiablePrepareExpected and pass them as `expected` here.
54 | void ncclVerifiableVerify(
55 |   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
56 |   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
57 |   int64_t *bad_elt_n, cudaStream_t stream
58 | );
59 | #endif
60 | 


--------------------------------------------------------------------------------
/nccl-tests/verifiable/verifiable.mk:
--------------------------------------------------------------------------------
 1 | # We requires both of the following paths to be set upon including this makefile
 2 | # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 3 | # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
 4 | 
 5 | TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 6 | TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 7 | 
 8 | $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
 9 | 	@printf "Compiling %s\n" $@
10 | 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
11 | 	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
12 | 


--------------------------------------------------------------------------------
/sources.list:
--------------------------------------------------------------------------------
 1 | # 默认注释了源码镜像以提高 apt update 速度，如有需要可自行取消注释
 2 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy main restricted universe multiverse
 3 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy main restricted universe multiverse
 4 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy-updates main restricted universe multiverse
 5 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy-updates main restricted universe multiverse
 6 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy-backports main restricted universe multiverse
 7 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy-backports main restricted universe multiverse
 8 | 
 9 | deb http://security.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse
10 | # deb-src http://security.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse
11 | 
12 | # 预发布软件源，不建议启用
13 | # deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy-proposed main restricted universe multiverse
14 | # # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ jammy-proposed main restricted universe multiverse


--------------------------------------------------------------------------------
/sshd_config:
--------------------------------------------------------------------------------
  1 | # This is the sshd server system-wide configuration file.  See
  2 | # sshd_config(5) for more information.
  3 | 
  4 | # This sshd was compiled with PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games
  5 | 
  6 | # The strategy used for options in the default sshd_config shipped with
  7 | # OpenSSH is to specify options with their default value where
  8 | # possible, but leave them commented.  Uncommented options override the
  9 | # default value.
 10 | 
 11 | Include /etc/ssh/sshd_config.d/*.conf
 12 | 
 13 | Port 12345
 14 | #AddressFamily any
 15 | #ListenAddress 0.0.0.0
 16 | #ListenAddress ::
 17 | 
 18 | #HostKey /etc/ssh/ssh_host_rsa_key
 19 | #HostKey /etc/ssh/ssh_host_ecdsa_key
 20 | #HostKey /etc/ssh/ssh_host_ed25519_key
 21 | 
 22 | # Ciphers and keying
 23 | #RekeyLimit default none
 24 | 
 25 | # Logging
 26 | #SyslogFacility AUTH
 27 | #LogLevel INFO
 28 | 
 29 | # Authentication:
 30 | 
 31 | #LoginGraceTime 2m
 32 | PermitRootLogin yes
 33 | #StrictModes yes
 34 | #MaxAuthTries 6
 35 | #MaxSessions 10
 36 | 
 37 | PubkeyAuthentication yes
 38 | 
 39 | # Expect .ssh/authorized_keys2 to be disregarded by default in future.
 40 | #AuthorizedKeysFile	.ssh/authorized_keys .ssh/authorized_keys2
 41 | 
 42 | #AuthorizedPrincipalsFile none
 43 | 
 44 | #AuthorizedKeysCommand none
 45 | #AuthorizedKeysCommandUser nobody
 46 | 
 47 | # For this to work you will also need host keys in /etc/ssh/ssh_known_hosts
 48 | #HostbasedAuthentication no
 49 | # Change to yes if you don't trust ~/.ssh/known_hosts for
 50 | # HostbasedAuthentication
 51 | #IgnoreUserKnownHosts no
 52 | # Don't read the user's ~/.rhosts and ~/.shosts files
 53 | #IgnoreRhosts yes
 54 | 
 55 | # To disable tunneled clear text passwords, change to no here!
 56 | #PasswordAuthentication yes
 57 | #PermitEmptyPasswords no
 58 | 
 59 | # Change to yes to enable challenge-response passwords (beware issues with
 60 | # some PAM modules and threads)
 61 | KbdInteractiveAuthentication no
 62 | 
 63 | # Kerberos options
 64 | #KerberosAuthentication no
 65 | #KerberosOrLocalPasswd yes
 66 | #KerberosTicketCleanup yes
 67 | #KerberosGetAFSToken no
 68 | 
 69 | # GSSAPI options
 70 | #GSSAPIAuthentication no
 71 | #GSSAPICleanupCredentials yes
 72 | #GSSAPIStrictAcceptorCheck yes
 73 | #GSSAPIKeyExchange no
 74 | 
 75 | # Set this to 'yes' to enable PAM authentication, account processing,
 76 | # and session processing. If this is enabled, PAM authentication will
 77 | # be allowed through the KbdInteractiveAuthentication and
 78 | # PasswordAuthentication.  Depending on your PAM configuration,
 79 | # PAM authentication via KbdInteractiveAuthentication may bypass
 80 | # the setting of "PermitRootLogin without-password".
 81 | # If you just want the PAM account and session checks to run without
 82 | # PAM authentication, then enable this but set PasswordAuthentication
 83 | # and KbdInteractiveAuthentication to 'no'.
 84 | UsePAM yes
 85 | 
 86 | #AllowAgentForwarding yes
 87 | #AllowTcpForwarding yes
 88 | #GatewayPorts no
 89 | X11Forwarding yes
 90 | #X11DisplayOffset 10
 91 | #X11UseLocalhost yes
 92 | #PermitTTY yes
 93 | PrintMotd no
 94 | #PrintLastLog yes
 95 | #TCPKeepAlive yes
 96 | #PermitUserEnvironment no
 97 | #Compression delayed
 98 | #ClientAliveInterval 0
 99 | #ClientAliveCountMax 3
100 | #UseDNS no
101 | #PidFile /run/sshd.pid
102 | #MaxStartups 10:30:100
103 | #PermitTunnel no
104 | #ChrootDirectory none
105 | #VersionAddendum none
106 | 
107 | # no default banner path
108 | #Banner none
109 | 
110 | # Allow client to pass locale environment variables
111 | AcceptEnv LANG LC_*
112 | 
113 | # override default of no subsystems
114 | Subsystem sftp /usr/lib/openssh/sftp-server
115 | 
116 | # Example of overriding settings on a per-user basis
117 | #Match User anoncvs
118 | #	X11Forwarding no
119 | #	AllowTcpForwarding no
120 | #	PermitTTY no
121 | #	ForceCommand cvs server


--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -o errexit
 4 | set -o nounset
 5 | set -o pipefail
 6 | 
 7 | # Check if PORT is set, otherwise set default value
 8 | if [ -z "${PORT:-}" ]; then
 9 |     PORT=12345
10 | fi
11 | 
12 | # Check if PASS is set, otherwise set default value
13 | if [ -z "${PASS:-}" ]; then
14 |     PASS=12345
15 | fi
16 | 
17 | # change the sshd port
18 | sed -i "s/12345/$PORT/" /etc/ssh/sshd_config
19 | 
20 | # change the root password
21 | echo "root:$PASS" | chpasswd
22 | 
23 | # start sshd
24 | service ssh start
25 | 
26 | tail -f /dev/null


--------------------------------------------------------------------------------