├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE ├── Licenses.txt ├── README.md ├── common.h ├── debian_install.sh ├── detect_cuda_arch.cmake ├── diagrams ├── DtoDBidir.png ├── DtoHBidir.png ├── HtoDBidir.png └── measurement.png ├── error_handling.h ├── inline_common.h ├── json ├── json-forwards.h ├── json.h └── jsoncpp.cpp ├── json_output.cpp ├── json_output.h ├── kernels.cu ├── kernels.cuh ├── memcpy.cpp ├── memcpy.h ├── multinode_memcpy.cpp ├── multinode_memcpy.h ├── multinode_testcases.cpp ├── nvbandwidth.cpp ├── output.cpp ├── output.h ├── testcase.cpp ├── testcase.h ├── testcases_ce.cpp ├── testcases_sm.cpp └── version.h /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Changelog 2 | 3 | ### nvbandwidth 0.8 4 | Bug Fixes: 5 | * Device Latency Test Accuracy: 6 | * Fixed an issue where the device_to_device_latency test was incorrectly 7 | reporting host-device latency instead of device-to-device latency. 8 | * Host-device latency reports now correctly reflect C2C or PCIe latency 9 | depending on the system, while device-to-device latency reports focus on 10 | NVLINK or equivalent inter-device connections. 11 | * Adjusted the buffer size threshold used to select the copy kernel, for 12 | more accurate measurements. 13 | * Added the host name to the JSON output. 14 | * Updated the README. 15 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(nvbandwidth 4 | LANGUAGES CUDA CXX) 5 | 6 | set(CMAKE_CXX_STANDARD 17) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | set(CMAKE_CUDA_STANDARD 17) 9 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 10 | 11 | if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") 12 | # The 5.2 architecture is not supported since CUDA 13.0 13 | set(supported_archs "70" "75" "80" "86" "89" "90" "100") 14 | else () 15 | set(supported_archs "52" "70" "75" "80" "86" "89" "90" "100") 16 | endif() 17 | 18 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) 19 | message(STATUS "Detecting underlying CUDA Arch to set CMAKE_CUDA_ARCHITECTURES") 20 | include(detect_cuda_arch.cmake) 21 | # Set CMAKE_CUDA_ARCHITECTURES based on the underlying device 22 | cuda_detect_architectures(supported_archs CMAKE_CUDA_ARCHITECTURES) 23 | endif() 24 | 25 | if(NOT CMAKE_BUILD_TYPE) 26 | set(CMAKE_BUILD_TYPE "Release") 27 | endif() 28 | 29 | if(CMAKE_SYSTEM_NAME STREQUAL "Linux") 30 | file(READ "/etc/os-release" OS_RELEASE_CONTENT) 31 | # Skip static libs on Fedora - https://github.com/NVIDIA/nvbandwidth/issues/4 32 | if(NOT OS_RELEASE_CONTENT MATCHES "ID=.*fedora") 33 | set(Boost_USE_STATIC_LIBS ON) 34 | endif() 35 | else() 36 | set(Boost_USE_STATIC_LIBS ON) 37 | endif() 38 | find_package(Boost COMPONENTS program_options REQUIRED) 39 | 40 | set(src 41 | testcase.cpp 42 | testcases_ce.cpp 43 | testcases_sm.cpp 44 | kernels.cu 45 | memcpy.cpp 46 | nvbandwidth.cpp 47 | multinode_memcpy.cpp 48 | multinode_testcases.cpp 49 | output.cpp 50 | json_output.cpp 51 | json/jsoncpp.cpp 52 | ) 53 | 54 | execute_process( 55 | COMMAND git describe --always --tags 56 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} 57 | OUTPUT_VARIABLE GIT_VERSION 58 | OUTPUT_STRIP_TRAILING_WHITESPACE 59 | ) 60 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGIT_VERSION=\\\"\"${GIT_VERSION}\"\\\"") 61 | 62 | if(WIN32) 63 | set(NVML_LIB_NAME "nvml") 64 | else() 65 | set(NVML_LIB_NAME "nvidia-ml") 66 | endif() 67 | 68 |
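# Example: if the target GPUs are known in advance, the architecture
# auto-detection above can be bypassed by defining the list explicitly
# at configure time, e.g.:
#   cmake -DCMAKE_CUDA_ARCHITECTURES="80;90" .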
add_executable(nvbandwidth ${src}) 69 | target_include_directories(nvbandwidth PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} .) 70 | target_link_libraries(nvbandwidth Boost::program_options ${NVML_LIB_NAME} cuda) 71 | 72 | if (MULTINODE) 73 | find_package(MPI REQUIRED) 74 | include_directories(SYSTEM ${MPI_INCLUDE_PATH}) 75 | target_link_libraries(nvbandwidth MPI::MPI_CXX) 76 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMULTINODE") 77 | endif() 78 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /Licenses.txt: -------------------------------------------------------------------------------- 1 | JsonCpp: 2 | Copyright Baptiste Lepilleur - Public domain and MIT licenses 3 | Attribution statements: Nvidia actively chooses to accept jsoncpp as public domain where acceptable and MIT licensed where public domain is not accepted. 4 | License text ( https://github.com/open-source-parsers/jsoncpp/blob/master/LICENSE ) 5 | 6 | /*! 
7 | * The JsonCpp library's source code, including accompanying documentation, 8 | * tests and demonstration applications, are licensed under the following 9 | * conditions... 10 | * 11 | * Baptiste Lepilleur and The JsonCpp Authors explicitly disclaim copyright in all 12 | * jurisdictions which recognize such a disclaimer. In such jurisdictions, 13 | * this software is released into the Public Domain. 14 | * 15 | * In jurisdictions which do not recognize Public Domain property (e.g. Germany as of 16 | * 2010), this software is Copyright (c) 2007-2010 by Baptiste Lepilleur and 17 | * The JsonCpp Authors, and is released under the terms of the MIT License (see below). 18 | * 19 | * In jurisdictions which recognize Public Domain property, the user of this 20 | * software may choose to accept it either as 1) Public Domain, 2) under the 21 | * conditions of the MIT License (see below), or 3) under the terms of dual 22 | * Public Domain/MIT License conditions described here, as they choose. 23 | * 24 | * The MIT License is about as close to Public Domain as a license can get, and is 25 | * described in clear, concise terms at: 26 | * 27 | * http://en.wikipedia.org/wiki/MIT_License 28 | * 29 | * The full text of the MIT License follows: 30 | * 31 | * ======================================================================== 32 | * Copyright (c) 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 33 | * 34 | * Permission is hereby granted, free of charge, to any person 35 | * obtaining a copy of this software and associated documentation 36 | * files (the "Software"), to deal in the Software without 37 | * restriction, including without limitation the rights to use, copy, 38 | * modify, merge, publish, distribute, sublicense, and/or sell copies 39 | * of the Software, and to permit persons to whom the Software is 40 | * furnished to do so, subject to the following conditions: 41 | * 42 | * The above copyright notice and this permission notice shall be 43 | * included in all copies or substantial portions of the Software. 44 | * 45 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 46 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 47 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 48 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 49 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 50 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 51 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | * SOFTWARE. 53 | * ======================================================================== 54 | * (END LICENSE TEXT) 55 | * 56 | * The MIT license is compatible with both the GPL and commercial 57 | * software, affording one all of the rights of Public Domain with the 58 | * minor nuisance of being required to keep the above copyright notice 59 | * and license text in the source code. Note also that by accepting the 60 | * Public Domain "license" you can re-license your copy using whatever 61 | * license you like. 62 | */ 63 | 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nvbandwidth 2 | A tool for bandwidth measurements on NVIDIA GPUs. 3 | 4 | Measures bandwidth for various memcpy patterns across different links using copy engine or kernel copy methods. 5 | nvbandwidth reports current measured bandwidth on your system. 
Additional system-specific tuning may be required to achieve maximal peak bandwidth. 6 | 7 | ## Requirements 8 | nvbandwidth requires the installation of a CUDA toolkit and some additional Linux software components to be built and run. This section provides the relevant details. 9 | Install a CUDA toolkit (version 11.x or above). The multinode version requires CUDA toolkit 12.3 and driver 550 or above. 10 | 11 | Install a compiler package which supports C++17; GCC 7.x or above is one option. 12 | 13 | Install CMake (version 3.20 or above). 14 | CMake version 3.24 or newer is encouraged. 15 | 16 | Install the Boost program_options library (more details in the next section). 17 | 18 | Ensure that the path to the nvcc binary (installed via the toolkit) is available in the $PATH variable on Linux systems. 19 | To run nvbandwidth, the system must have a CUDA-enabled GPU and an NVIDIA display driver that is compatible with the CUDA toolkit used to build nvbandwidth. 20 | For more information, refer to https://docs.nvidia.com/deploy/cuda-compatibility/ 21 | 22 | ## Dependencies 23 | To build and run nvbandwidth, please install the Boost program_options library (https://www.boost.org/doc/libs/1_66_0/doc/html/program_options.html). 24 | 25 | Ubuntu/Debian users can run the following to install: 26 | ``` 27 | apt install libboost-program-options-dev 28 | ``` 29 | On Ubuntu/Debian, we have provided a utility script (debian_install.sh) which installs some generic software components needed for the build. 30 | The script also builds the nvbandwidth project. 31 | ``` 32 | sudo ./debian_install.sh 33 | ``` 34 | 35 | Fedora users can run the following to install: 36 | ``` 37 | sudo dnf -y install boost-devel 38 | ``` 39 | 40 | ## Build 41 | To build the `nvbandwidth` executable for single-node use: 42 | ``` 43 | cmake . 44 | make 45 | ``` 46 | You may need to set the BOOST_ROOT environment variable on Windows to tell CMake where to find your Boost installation. 47 | 48 | ## Usage 49 | ``` 50 | ./nvbandwidth -h 51 | 52 | nvbandwidth CLI: 53 | -h [ --help ] Produce help message 54 | -b [ --bufferSize ] arg (=512) Memcpy buffer size in MiB 55 | -l [ --list ] List available testcases 56 | -t [ --testcase ] arg Testcase(s) to run (by name or index) 57 | -p [ --testcasePrefixes ] arg Testcase(s) to run (by prefix) 58 | -v [ --verbose ] Verbose output 59 | -s [ --skipVerification ] Skips data verification after copy 60 | -d [ --disableAffinity ] Disable automatic CPU affinity control 61 | -i [ --testSamples ] arg (=3) Iterations of the benchmark 62 | -m [ --useMean ] Use mean instead of median for results 63 | -j [ --json ] Print output in json format instead of plain 64 | text. 65 | ``` 66 | To run all testcases: 67 | ``` 68 | ./nvbandwidth 69 | ``` 70 | 71 | To run a specific testcase: 72 | ``` 73 | ./nvbandwidth -t device_to_device_memcpy_read_ce 74 | ``` 75 | Example output: 76 | ``` 77 | Running device_to_device_memcpy_write_ce.
78 | memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s) 79 | 0 1 2 3 4 5 6 7 80 | 0 0.00 276.07 276.36 276.14 276.29 276.48 276.55 276.33 81 | 1 276.19 0.00 276.29 276.29 276.57 276.48 276.38 276.24 82 | 2 276.33 276.29 0.00 276.38 276.50 276.50 276.29 276.31 83 | 3 276.19 276.62 276.24 0.00 276.29 276.60 276.29 276.55 84 | 4 276.03 276.55 276.45 276.76 0.00 276.45 276.36 276.62 85 | 5 276.17 276.57 276.19 276.50 276.31 0.00 276.31 276.15 86 | 6 274.89 276.41 276.38 276.67 276.41 276.26 0.00 276.33 87 | 7 276.12 276.45 276.12 276.36 276.00 276.57 276.45 0.00 88 | ``` 89 | 90 | Set the number of iterations and the buffer size for copies with --testSamples and --bufferSize. 91 | 92 | ## Multinode benchmarks 93 | 94 | To build the multinode version of nvbandwidth, execute: 95 | 96 | ``` 97 | cmake -DMULTINODE=1 . 98 | make 99 | ``` 100 | 101 | Multinode nvbandwidth requires MPI; CMake will find a local installation of MPI to build and link against. Multinode operation also requires installing and setting up the IMEX service and creating the IMEX channels. IMEX is the NVIDIA Internode Memory Exchange service; it runs on each compute tray to support GPU memory export and import operations across OS domains in an NVLink multi-node deployment. To start the IMEX service, run the following command: 102 | 103 | `sudo systemctl start nvidia-imex.service` 104 | Specify the IP addresses of the cluster nodes in the /etc/nvidia-imex/nodes_config.cfg file. 105 | 106 | For example, to run multinode bandwidth tests on a system with 2 nodes and 4 GPUs per node, run: 107 | `mpirun --allow-run-as-root --map-by ppr:4:node --bind-to core -np 8 --report-bindings -q -mca btl_tcp_if_include enP5p9s0 --hostfile /etc/nvidia-imex/nodes_config.cfg ./nvbandwidth -p multinode` 108 | 109 | ### Local testing 110 | 111 | You can test it on a single-node machine (Ampere+ GPU required): 112 | 113 | ``` 114 | mpirun -n 4 ./nvbandwidth -p multinode 115 | ``` 116 | This command spawns 4 processes and runs all testcases with the "multinode" prefix. 117 | 118 | ### Running it on a cluster 119 | 120 | To run it on a cluster, submit a job to a workload scheduler that has MPI integration. Run one process per GPU. 121 | 122 | Running fewer processes than the GPU count is valid; processes will take consecutive GPUs, starting from GPU 0. 123 | 124 | Running more processes than the GPU count is not valid. 125 | 126 | All ranks in the MPI batch must be part of one multinode clique. Run one instance of nvbandwidth per node/GPU. 127 | 128 | When running under MPI, only MPI rank 0 will write its stdout to the console. Stderr, if needed, will be output by all processes. 129 | 130 | It is recommended to run only "multinode*" testcases under MPI. While any testcase will succeed, results for non-multinode testcases will only come from MPI rank 0. 131 | 132 | ## Test Details 133 | There are two types of copies implemented: Copy Engine (CE) and Streaming Multiprocessor (SM). 134 | 135 | CE copies use memcpy APIs. SM copies use kernels. 136 | 137 | SM copies will truncate the copy size to fit uniformly on the target device so that bandwidth is reported correctly. The actual byte size for the copy is: 138 | ``` 139 | (threadsPerBlock * deviceSMCount) * floor(copySize / (threadsPerBlock * deviceSMCount)) 140 | ``` 141 | 142 | threadsPerBlock is set to 512; a short sketch of this calculation follows.
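The sketch below illustrates the truncation formula using the CUDA runtime API. It is a minimal illustration, not nvbandwidth's actual source: `truncatedCopySize` is a hypothetical helper, and error checking is omitted.
```
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Largest size <= copySize that divides evenly across
// (threadsPerBlock * smCount) threads, mirroring the formula above.
static size_t truncatedCopySize(size_t copySize, int device) {
    const size_t threadsPerBlock = 512;  // value quoted above
    int smCount = 0;
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);
    const size_t unit = threadsPerBlock * (size_t)smCount;
    return (copySize / unit) * unit;  // integer division implements floor()
}

int main() {
    const size_t request = 512ull * 1024 * 1024;  // default 512 MiB buffer
    printf("actual SM copy size: %zu bytes\n", truncatedCopySize(request, 0));
    return 0;
}
```
On a GPU with 108 SMs, for instance, `unit` is 512 × 108 = 55296, so the requested copy size is rounded down to the nearest multiple of 55296 bytes.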
143 | 144 | ### Measurement Details 145 | ![](diagrams/measurement.png) 146 | 147 | A blocking kernel and CUDA events are used to measure the time to perform copies via SM or CE, and bandwidth is calculated from a series of copies. 148 | 149 | First, we enqueue a spin kernel that spins on a flag in host memory. The spin kernel spins on the device until all events for measurement have been fully enqueued into the measurement streams. This ensures that the overhead of enqueuing operations is excluded from the measurement of the actual transfer over the interconnect. Next, we enqueue a start event, a fixed count of memcpy iterations, and a stop event. Finally, we release the flag to start the measurement. 150 | 151 | This process is repeated 3 times, and the median bandwidth across the trials is reported. 152 | 153 | The number of repetitions can be overridden using the --testSamples option; to use the arithmetic mean instead of the median, specify the --useMean option. 154 | 155 | ### Unidirectional Bandwidth Tests 156 | ``` 157 | Running host_to_device_memcpy_ce. 158 | memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s) 159 | 0 1 2 3 4 5 6 7 160 | 0 26.03 25.94 25.97 26.00 26.19 25.95 26.00 25.97 161 | ``` 162 | 163 | Unidirectional tests measure the bandwidth between each pair in the output matrix individually. Traffic is not sent simultaneously. 164 | 165 | ### Bidirectional Host <-> Device Bandwidth Tests 166 | ``` 167 | Running host_to_device_bidirectional_memcpy_ce. 168 | memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s) 169 | 0 1 2 3 4 5 6 7 170 | 0 18.56 18.37 19.37 19.59 18.71 18.79 18.46 18.61 171 | ``` 172 | 173 | The setup for bidirectional host-to-device bandwidth transfers is shown below: 174 | ![](diagrams/HtoDBidir.png) 175 | 176 | **CE copies** 177 | Stream 0 (the measured stream) performs writes to the device, while the interfering stream produces reads in the opposite direction. This pattern is reversed for measuring bidirectional device-to-host bandwidth, as shown below. 178 | 179 | 180 | ![](diagrams/DtoHBidir.png) 181 | 182 | **SM copies** 183 | The test launches a kernel copy where alternating thread warps copy data in alternating directions. 184 | 185 | ### Bidirectional Device <-> Device Bandwidth Tests 186 | The setup for bidirectional device-to-device transfers is shown below: 187 | 188 | ![](diagrams/DtoDBidir.png) 189 | 190 | **CE copies** 191 | Stream 0 (the measured stream) performs writes to the peer device, while the interfering stream produces reads in the opposite direction. 192 | 193 | **SM Copies** 194 | Similar to the HtoDBidir test above, the test launches a kernel where alternating thread warps copy data in alternating directions. 195 | 196 | **Bandwidth calculation** 197 | CE bidirectional bandwidth tests calculate bandwidth on the measured stream: 198 | ``` 199 | CE bidir. bandwidth = (size of data on measured stream) / (time on measured stream) 200 | ``` 201 | The SM bidirectional test, however, launches a kernel where odd and even warps copy data in different directions, so bandwidth is calculated over the whole kernel: 202 | ``` 203 | SM bidir. bandwidth = size / (kernel time) 204 | ``` 205 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef COMMON_H_ 19 | #define COMMON_H_ 20 | 21 | #include <cuda.h> 22 | #include <nvml.h> 23 | #include <algorithm> 24 | #include <cmath> 25 | #include <cstdlib> 26 | #include <cstring> 27 | #include <iomanip> 28 | #include <iostream> 29 | #include <limits> 30 | #include <map> 31 | #include <optional> 32 | #include <sstream> 33 | #include <string> 34 | #include <thread> 35 | #include <unordered_set> 36 | #include <vector> 37 | 38 | #define STRING_LENGTH 256 39 | 40 | // Default constants 41 | const unsigned long long defaultLoopCount = 16; 42 | const unsigned long long smallBufferThreshold = 64; 43 | const unsigned long long defaultBufferSize = 512; // 512 MiB 44 | const unsigned int defaultAverageLoopCount = 3; 45 | const unsigned int _MiB = 1024 * 1024; 46 | const unsigned int _2MiB = 2 * _MiB; 47 | const unsigned int numThreadPerBlock = 512; 48 | const unsigned int strideLen = 16; /* cacheLine size 128 Bytes, 16 words */ 49 | const unsigned long latencyMemAccessCnt = 1000000; /* 1M total read accesses to gauge latency */ 50 | extern int deviceCount; 51 | extern unsigned int averageLoopCount; 52 | extern bool disableAffinity; 53 | extern bool skipVerification; 54 | extern bool useMean; 55 | extern bool jsonOutput; 56 | // Verbosity 57 | extern bool verbose; 58 | extern bool perfFormatter; 59 | 60 | #ifdef MULTINODE 61 | extern int localDevice; 62 | extern int localRank; 63 | extern int worldRank; 64 | extern int worldSize; 65 | #endif 66 | extern char localHostname[STRING_LENGTH]; 67 | 68 | class Verbosity { 69 | public: 70 | bool &controlVariable; 71 | 72 | Verbosity(bool &controlVariable): controlVariable(controlVariable) {} 73 | 74 | template <typename T> 75 | Verbosity& operator<<(T input) { 76 | if (!jsonOutput && controlVariable) std::cout << input; 77 | return *this; 78 | } 79 | 80 | using StreamType = decltype(std::cout); 81 | Verbosity &operator<<(StreamType &(*func)(StreamType &)) { 82 | if (!jsonOutput && controlVariable) { 83 | func(std::cout); 84 | } 85 | return *this; 86 | } 87 | }; 88 | extern Verbosity VERBOSE; 89 | extern Verbosity OUTPUT; 90 | 91 | #ifdef _MSC_VER 92 | #define __PRETTY_FUNCTION__ __FUNCTION__ 93 | #endif 94 | 95 | // Rounds n up to the nearest multiple of "multiple". 96 | // if n is already a multiple of "multiple", n is returned unchanged. 97 | // works for arbitrary value of "multiple". 98 | #define ROUND_UP(n, multiple) \ 99 | (((n) + ((multiple)-1)) - (((n) + ((multiple)-1)) % (multiple))) 100 | 101 | #define PROC_MASK_WORD_BITS (8 * sizeof(size_t)) 102 | 103 | #define PROC_MASK_SIZE \ 104 | ROUND_UP(std::thread::hardware_concurrency(), PROC_MASK_WORD_BITS) / 8 105 | 106 | #define PROC_MASK_QUERY_BIT(mask, proc) \ 107 | (mask[proc / PROC_MASK_WORD_BITS] & \ 108 | ((size_t)1 << (proc % PROC_MASK_WORD_BITS))) \ 109 | ?
1 \ 110 | : 0 111 | 112 | /* Set a bit in an affinity mask */ 113 | #define PROC_MASK_SET(mask, proc) \ 114 | do { \ 115 | size_t _proc = (proc); \ 116 | (mask)[_proc / PROC_MASK_WORD_BITS] |= (size_t)1 \ 117 | << (_proc % PROC_MASK_WORD_BITS); \ 118 | } while (0) 119 | 120 | /* Clear a bit in an affinity mask */ 121 | #define PROC_MASK_CLEAR(mask, proc) \ 122 | do { \ 123 | size_t _proc = (proc); \ 124 | (mask)[_proc / PROC_MASK_WORD_BITS] &= \ 125 | ~((size_t)1 << (_proc % PROC_MASK_WORD_BITS)); \ 126 | } while (0) 127 | 128 | inline size_t getFirstEnabledCPU() { 129 | size_t firstEnabledCPU = 0; 130 | size_t *procMask = (size_t *)calloc(1, PROC_MASK_SIZE); 131 | for (size_t i = 0; i < PROC_MASK_SIZE * 8; ++i) { 132 | if (PROC_MASK_QUERY_BIT(procMask, i)) { 133 | firstEnabledCPU = i; 134 | break; 135 | } 136 | } 137 | free(procMask); 138 | return firstEnabledCPU; 139 | } 140 | 141 | // Calculation and display of performance statistics 142 | // Basic online running statistics calculator, modeled after a less templated 143 | // version of boost::accumulators. 144 | class PerformanceStatistic { 145 | std::vector<double> values; 146 | 147 | public: 148 | void operator()(const double &sample) { recordSample(sample); } 149 | 150 | void recordSample(const double &sample) { 151 | auto it = std::lower_bound(values.begin(), values.end(), sample); 152 | values.insert(it, sample); 153 | } 154 | 155 | void reset(void) { values.clear(); } 156 | 157 | double sum(void) const { 158 | double total = 0.0; 159 | for (double val : values) { 160 | total += val; 161 | } 162 | return total; 163 | } 164 | 165 | size_t count(void) const { return values.size(); } 166 | 167 | double mean(void) const { 168 | return sum() / count(); 169 | } 170 | 171 | double variance(void) const { 172 | double calculated_mean = mean(); 173 | double sum_diff_squared = 0.0; 174 | for (double val : values) { 175 | double diff = val - calculated_mean; 176 | sum_diff_squared += diff * diff; 177 | } 178 | return (values.size() > 1 ? sum_diff_squared / (values.size() - 1) : 0.0); 179 | } 180 | 181 | double stddev(void) const { 182 | return (variance() > 0.0 ? std::sqrt(variance()) : 0.0); 183 | } 184 | 185 | double largest(void) const { return values.size() > 0 ? values[values.size() - 1] : 0.0; } 186 | 187 | double smallest(void) const { return values.size() > 0 ?
values[0] : 0.0; } 188 | 189 | double median(void) const { 190 | if (values.size() == 0) { 191 | return 0.0; 192 | } else if (values.size() % 2 == 0) { 193 | int idx = values.size() / 2; 194 | return (values[idx] + values[idx - 1]) / 2.0; 195 | } else { 196 | return values[values.size() / 2]; 197 | } 198 | } 199 | 200 | double returnAppropriateMetric(void) const { 201 | if (useMean) { 202 | return mean(); 203 | } else { 204 | return median(); 205 | } 206 | } 207 | }; 208 | 209 | #ifdef MULTINODE 210 | inline std::string getPaddedProcessId(int id) { 211 | // max printed number will be worldSize - 1 212 | int paddingSize = (int) log10(worldSize - 1) + 1; 213 | std::stringstream s; 214 | s << std::setfill(' ') << std::setw(paddingSize) << id; 215 | return s.str(); 216 | } 217 | #endif 218 | 219 | struct LatencyNode { 220 | struct LatencyNode *next; 221 | }; 222 | 223 | enum UnitType { 224 | BANDWIDTH, 225 | LATENCY 226 | }; 227 | 228 | inline std::string getUnitString(UnitType unitType) { 229 | switch (unitType) { 230 | case BANDWIDTH: 231 | return " +GB/s"; 232 | case LATENCY: 233 | return " -ns"; 234 | default: 235 | return ""; 236 | } 237 | } 238 | 239 | // Describe attributes of a single memcpy operation 240 | class MemcpyDescriptor { 241 | public: 242 | CUdeviceptr dst; 243 | CUdeviceptr src; 244 | CUstream stream; 245 | size_t copySize; 246 | unsigned long long loopCount; 247 | 248 | MemcpyDescriptor(CUdeviceptr dst, CUdeviceptr src, CUstream stream, size_t copySize, unsigned long long loopCount); 249 | }; 250 | 251 | 252 | #endif // COMMON_H_ 253 | -------------------------------------------------------------------------------- /debian_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Utility script that attempts to install 3 | # necessary software components needed to 4 | # build nvbandwidth 5 | 6 | apt install -y build-essential 7 | apt install -y libboost-program-options-dev 8 | apt install -y cmake 9 | output=$(cmake --version | sed -n 1p | sed 's/[^0-9]*//g') 10 | if [ $output -lt 3200 ]; then 11 | echo "Upgrade cmake version to 3.20 or above to build nvbandwidth" 12 | exit 1 13 | fi 14 | cmake . 
15 | make 16 | -------------------------------------------------------------------------------- /detect_cuda_arch.cmake: -------------------------------------------------------------------------------- 1 | include_guard(GLOBAL) 2 | 3 | # Function uses the CUDA runtime API to query the compute capability of the device, so if a user 4 | # doesn't pass any architecture options to CMake we only build the current architecture 5 | 6 | # Adapted from https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/detail/detect_architectures.cmake 7 | 8 | function(cuda_detect_architectures possible_archs_var gpu_archs) 9 | 10 | set(__gpu_archs ${${possible_archs_var}}) 11 | 12 | set(eval_file eval_gpu_archs.cu) 13 | set(eval_exe eval_gpu_archs) 14 | set(error_file eval_gpu_archs.stderr.log) 15 | 16 | if(NOT DEFINED CMAKE_CUDA_COMPILER) 17 | message(FATAL_ERROR "No CUDA compiler specified, unable to determine machine's GPUs.") 18 | endif() 19 | 20 | if(NOT EXISTS "${eval_exe}") 21 | file(WRITE ${eval_file} 22 | " 23 | #include <cstdio> 24 | #include <set> 25 | #include <string> 26 | using namespace std; 27 | int main(int argc, char** argv) { 28 | set<string> archs; 29 | int nDevices; 30 | if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) { 31 | for(int dev=0;dev #include <mpi.h> 31 | #define MPI_ABORT MPI_Abort(MPI_COMM_WORLD, 1) 32 | #else 33 | #define MPI_ABORT 34 | #endif 35 | 36 | // CUDA Error handling 37 | #define CUDA_ASSERT(x) do { \ 38 | cudaError_t cudaErr = (x); \ 39 | if ((cudaErr) != cudaSuccess) { \ 40 | std::stringstream errmsg; \ 41 | errmsg << "[" << cudaGetErrorName(cudaErr) << "] " << cudaGetErrorString(cudaErr) << " in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 42 | RecordError(errmsg); \ 43 | MPI_ABORT; \ 44 | std::exit(1); \ 45 | } \ 46 | } while ( 0 ) 47 | 48 | #define CU_ASSERT(x) do { \ 49 | CUresult cuResult = (x); \ 50 | if ((cuResult) != CUDA_SUCCESS) { \ 51 | const char *errDescStr, *errNameStr; \ 52 | cuGetErrorString(cuResult, &errDescStr); \ 53 | cuGetErrorName(cuResult, &errNameStr); \ 54 | std::stringstream errmsg; \ 55 | errmsg << "[" << errNameStr << "] " << errDescStr << " in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 56 | RecordError(errmsg); \ 57 | MPI_ABORT; \ 58 | std::exit(1); \ 59 | } \ 60 | } while ( 0 ) 61 | 62 | // NVML Error handling 63 | #define NVML_ASSERT(x) do { \ 64 | nvmlReturn_t nvmlResult = (x); \ 65 | if ((nvmlResult) != NVML_SUCCESS) { \ 66 | std::stringstream errmsg; \ 67 | errmsg << "NVML_ERROR: [" << nvmlErrorString(nvmlResult) << "] in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 68 | RecordError(errmsg); \ 69 | MPI_ABORT; \ 70 | std::exit(1); \ 71 | } \ 72 | } while ( 0 ) 73 | 74 | // Generic Error handling 75 | #define ASSERT(x) do { \ 76 | if (!(x)) { \ 77 | std::stringstream errmsg; \ 78 | errmsg << "ASSERT in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 79 | RecordError(errmsg); \ 80 | MPI_ABORT; \ 81 | std::exit(1); \ 82 | } \ 83 | } while ( 0 ) 84 | 85 | #endif // ERROR_HANDLING_H_ 86 | -------------------------------------------------------------------------------- /inline_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION &
AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef INLINE_COMMON_H_ 19 | #define INLINE_COMMON_H_ 20 | 21 | #include "common.h" 22 | #include "error_handling.h" 23 | 24 | template <typename T> struct PeerValueMatrix { 25 | std::vector<std::optional<T>> m_matrix; 26 | int m_rows, m_columns; 27 | std::string key; 28 | std::vector<std::string> column_labels; 29 | std::vector<std::string> row_labels; 30 | bool pFormatter; 31 | UnitType uType; 32 | 33 | PeerValueMatrix(int rows, int columns, std::string key = "", bool pFormatter = perfFormatter, UnitType uType = BANDWIDTH): m_matrix(rows * columns), m_rows(rows), m_columns(columns), key(key), pFormatter(perfFormatter), uType(uType) {} 34 | 35 | std::optional<T> &value(int src, int dst) { 36 | ASSERT(src >= 0 && src < m_rows); 37 | ASSERT(dst >= 0 && dst < m_columns); 38 | return m_matrix[src * m_columns + dst]; 39 | } 40 | const std::optional<T> &value(int src, int dst) const { 41 | ASSERT(src >= 0 && src < m_rows); 42 | ASSERT(dst >= 0 && dst < m_columns); 43 | return m_matrix[src * m_columns + dst]; 44 | } 45 | 46 | void setRowLabels(std::vector<std::string> _row_labels) { 47 | row_labels = _row_labels; 48 | } 49 | 50 | void setColumnLabels(std::vector<std::string> _column_labels) { 51 | column_labels = _column_labels; 52 | } 53 | }; 54 | 55 | template <typename T> 56 | std::ostream &operator<<(std::ostream &o, const PeerValueMatrix<T> &matrix) { 57 | // This assumes T is numeric 58 | T maxVal = std::numeric_limits<T>::min(); 59 | T minVal = std::numeric_limits<T>::max(); 60 | T sum = 0; 61 | int count = 0; 62 | 63 | // First square of the table should be blank, calculate and print appropriately many spaces 64 | int columnIdWidth = 2; 65 | for (auto s : matrix.row_labels) { 66 | columnIdWidth = std::max(columnIdWidth, (int) s.size()); 67 | } 68 | 69 | for (int i = 0; i < columnIdWidth; i++) { 70 | o << " "; 71 | } 72 | 73 | for (int currentDevice = 0; currentDevice < matrix.m_columns; currentDevice++) { 74 | if (matrix.column_labels.size() > 0) { 75 | o << std::setw(10) << matrix.column_labels[currentDevice]; 76 | } else { 77 | o << std::setw(10) << currentDevice; 78 | } 79 | } 80 | o << std::endl; 81 | 82 | for (int currentDevice = 0; currentDevice < matrix.m_rows; currentDevice++) { 83 | if (matrix.row_labels.size() > 0) { 84 | o << std::setw(columnIdWidth) << matrix.row_labels[currentDevice]; 85 | } else { 86 | o << std::setw(2) << currentDevice; 87 | } 88 | 89 | for (int peer = 0; peer < matrix.m_columns; peer++) { 90 | std::optional<T> val = matrix.value(currentDevice, peer); 91 | if (val) { 92 | o << std::setw(10) << val.value(); 93 | } else { 94 | o << std::setw(10) << "N/A"; 95 | } 96 | sum += val.value_or(0.0); 97 | maxVal = std::max(maxVal, val.value_or(0.0)); 98 | minVal = std::min(minVal, val.value_or(0.0)); 99 | if (val.value_or(0.0) > 0) count++; 100 | } 101 | o << std::endl; 102 | } 103 | o << std::endl; 104 | if (matrix.pFormatter) { 105 | o << "&&&& PERF " << matrix.key << " " << sum <<
getUnitString(matrix.uType) << std::endl; 106 | } else { 107 | o << "SUM " << matrix.key << " " << sum << std::endl; 108 | } 109 | 110 | VERBOSE << "MIN " << matrix.key << " " << minVal << '\n'; 111 | VERBOSE << "MAX " << matrix.key << " " << maxVal << '\n'; 112 | VERBOSE << "AVG " << matrix.key << " " << sum / count << '\n'; 113 | return o; 114 | } 115 | 116 | // NUMA optimal affinity 117 | inline void setOptimalCpuAffinity(int cudaDeviceID) { 118 | #ifdef _WIN32 119 | // NVML doesn't support setting affinity on Windows 120 | return; 121 | #endif 122 | if (disableAffinity) { 123 | return; 124 | } 125 | 126 | nvmlDevice_t device; 127 | CUuuid dev_uuid; 128 | 129 | std::stringstream s; 130 | std::unordered_set<int> dashPos {0, 4, 6, 8, 10}; 131 | 132 | CU_ASSERT(cuDeviceGetUuid(&dev_uuid, cudaDeviceID)); 133 | 134 | s << "GPU"; 135 | for (int i = 0; i < 16; i++) { 136 | if (dashPos.count(i)) { 137 | s << '-'; 138 | } 139 | s << std::hex << std::setfill('0') << std::setw(2) << (0xFF & (int)dev_uuid.bytes[i]); 140 | } 141 | 142 | NVML_ASSERT(nvmlDeviceGetHandleByUUID(s.str().c_str(), &device)); 143 | nvmlReturn_t result = nvmlDeviceSetCpuAffinity(device); 144 | if (result != NVML_ERROR_NOT_SUPPORTED) { 145 | NVML_ASSERT(result); 146 | } 147 | } 148 | 149 | inline bool isMemoryOwnedByCUDA(void *memory) { 150 | CUmemorytype memorytype; 151 | CUresult status = cuPointerGetAttribute(&memorytype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)memory); 152 | if (status == CUDA_ERROR_INVALID_VALUE) { 153 | return false; 154 | } else { 155 | CU_ASSERT(status); 156 | return true; 157 | } 158 | } 159 | 160 | #endif // INLINE_COMMON_H_ 161 | -------------------------------------------------------------------------------- /json/json-forwards.h: -------------------------------------------------------------------------------- 1 | /// Json-cpp amalgamated forward header (http://jsoncpp.sourceforge.net/). 2 | /// It is intended to be used with #include "json/json-forwards.h" 3 | /// This header provides forward declaration for all JsonCpp types. 4 | 5 | // ////////////////////////////////////////////////////////////////////// 6 | // Beginning of content of file: LICENSE 7 | // ////////////////////////////////////////////////////////////////////// 8 | 9 | /* 10 | The JsonCpp library's source code, including accompanying documentation, 11 | tests and demonstration applications, are licensed under the following 12 | conditions... 13 | 14 | Baptiste Lepilleur and The JsonCpp Authors explicitly disclaim copyright in all 15 | jurisdictions which recognize such a disclaimer. In such jurisdictions, 16 | this software is released into the Public Domain. 17 | 18 | In jurisdictions which do not recognize Public Domain property (e.g. Germany as of 19 | 2010), this software is Copyright (c) 2007-2010 by Baptiste Lepilleur and 20 | The JsonCpp Authors, and is released under the terms of the MIT License (see below). 21 | 22 | In jurisdictions which recognize Public Domain property, the user of this 23 | software may choose to accept it either as 1) Public Domain, 2) under the 24 | conditions of the MIT License (see below), or 3) under the terms of dual 25 | Public Domain/MIT License conditions described here, as they choose.
26 | 27 | The MIT License is about as close to Public Domain as a license can get, and is 28 | described in clear, concise terms at: 29 | 30 | http://en.wikipedia.org/wiki/MIT_License 31 | 32 | The full text of the MIT License follows: 33 | 34 | ======================================================================== 35 | Copyright (c) 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 36 | 37 | Permission is hereby granted, free of charge, to any person 38 | obtaining a copy of this software and associated documentation 39 | files (the "Software"), to deal in the Software without 40 | restriction, including without limitation the rights to use, copy, 41 | modify, merge, publish, distribute, sublicense, and/or sell copies 42 | of the Software, and to permit persons to whom the Software is 43 | furnished to do so, subject to the following conditions: 44 | 45 | The above copyright notice and this permission notice shall be 46 | included in all copies or substantial portions of the Software. 47 | 48 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 49 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 50 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 51 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 52 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 53 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 54 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 55 | SOFTWARE. 56 | ======================================================================== 57 | (END LICENSE TEXT) 58 | 59 | The MIT license is compatible with both the GPL and commercial 60 | software, affording one all of the rights of Public Domain with the 61 | minor nuisance of being required to keep the above copyright notice 62 | and license text in the source code. Note also that by accepting the 63 | Public Domain "license" you can re-license your copy using whatever 64 | license you like. 65 | 66 | */ 67 | 68 | // ////////////////////////////////////////////////////////////////////// 69 | // End of content of file: LICENSE 70 | // ////////////////////////////////////////////////////////////////////// 71 | 72 | 73 | 74 | 75 | 76 | #ifndef JSON_FORWARD_AMALGAMATED_H_INCLUDED 77 | # define JSON_FORWARD_AMALGAMATED_H_INCLUDED 78 | /// If defined, indicates that the source file is amalgamated 79 | /// to prevent private header inclusion. 80 | #define JSON_IS_AMALGAMATION 81 | 82 | // ////////////////////////////////////////////////////////////////////// 83 | // Beginning of content of file: include/json/version.h 84 | // ////////////////////////////////////////////////////////////////////// 85 | 86 | #ifndef JSON_VERSION_H_INCLUDED 87 | #define JSON_VERSION_H_INCLUDED 88 | 89 | // Note: version must be updated in three places when doing a release. This 90 | // annoying process ensures that amalgamate, CMake, and meson all report the 91 | // correct version. 92 | // 1. /meson.build 93 | // 2. /include/json/version.h 94 | // 3. /CMakeLists.txt 95 | // IMPORTANT: also update the SOVERSION!! 
96 | 97 | #define JSONCPP_VERSION_STRING "1.9.5" 98 | #define JSONCPP_VERSION_MAJOR 1 99 | #define JSONCPP_VERSION_MINOR 9 100 | #define JSONCPP_VERSION_PATCH 5 101 | #define JSONCPP_VERSION_QUALIFIER 102 | #define JSONCPP_VERSION_HEXA \ 103 | ((JSONCPP_VERSION_MAJOR << 24) | (JSONCPP_VERSION_MINOR << 16) | \ 104 | (JSONCPP_VERSION_PATCH << 8)) 105 | 106 | #ifdef JSONCPP_USING_SECURE_MEMORY 107 | #undef JSONCPP_USING_SECURE_MEMORY 108 | #endif 109 | #define JSONCPP_USING_SECURE_MEMORY 0 110 | // If non-zero, the library zeroes any memory that it has allocated before 111 | // it frees its memory. 112 | 113 | #endif // JSON_VERSION_H_INCLUDED 114 | 115 | // ////////////////////////////////////////////////////////////////////// 116 | // End of content of file: include/json/version.h 117 | // ////////////////////////////////////////////////////////////////////// 118 | 119 | 120 | 121 | 122 | 123 | 124 | // ////////////////////////////////////////////////////////////////////// 125 | // Beginning of content of file: include/json/allocator.h 126 | // ////////////////////////////////////////////////////////////////////// 127 | 128 | // Copyright 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 129 | // Distributed under MIT license, or public domain if desired and 130 | // recognized in your jurisdiction. 131 | // See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE 132 | 133 | #ifndef JSON_ALLOCATOR_H_INCLUDED 134 | #define JSON_ALLOCATOR_H_INCLUDED 135 | 136 | #include <cstring> 137 | #include <memory> 138 | 139 | #pragma pack(push) 140 | #pragma pack() 141 | 142 | namespace Json { 143 | template <typename T> class SecureAllocator { 144 | public: 145 | // Type definitions 146 | using value_type = T; 147 | using pointer = T*; 148 | using const_pointer = const T*; 149 | using reference = T&; 150 | using const_reference = const T&; 151 | using size_type = std::size_t; 152 | using difference_type = std::ptrdiff_t; 153 | 154 | /** 155 | * Allocate memory for N items using the standard allocator. 156 | */ 157 | pointer allocate(size_type n) { 158 | // allocate using "global operator new" 159 | return static_cast<pointer>(::operator new(n * sizeof(T))); 160 | } 161 | 162 | /** 163 | * Release memory which was allocated for N items at pointer P. 164 | * 165 | * The memory block is filled with zeroes before being released. 166 | */ 167 | void deallocate(pointer p, size_type n) { 168 | // memset_s is used because memset may be optimized away by the compiler 169 | memset_s(p, n * sizeof(T), 0, n * sizeof(T)); 170 | // free using "global operator delete" 171 | ::operator delete(p); 172 | } 173 | 174 | /** 175 | * Construct an item in-place at pointer P. 176 | */ 177 | template <typename... Args> void construct(pointer p, Args&&... args) { 178 | // construct using "placement new" and "perfect forwarding" 179 | ::new (static_cast<void*>(p)) T(std::forward<Args>(args)...); 180 | } 181 | 182 | size_type max_size() const { return size_t(-1) / sizeof(T); } 183 | 184 | pointer address(reference x) const { return std::addressof(x); } 185 | 186 | const_pointer address(const_reference x) const { return std::addressof(x); } 187 | 188 | /** 189 | * Destroy an item in-place at pointer P.
190 | */ 191 | void destroy(pointer p) { 192 | // destroy using "explicit destructor" 193 | p->~T(); 194 | } 195 | 196 | // Boilerplate 197 | SecureAllocator() {} 198 | template <typename U> SecureAllocator(const SecureAllocator<U>&) {} 199 | template <typename U> struct rebind { using other = SecureAllocator<U>; }; 200 | }; 201 | 202 | template <typename T, typename U> 203 | bool operator==(const SecureAllocator<T>&, const SecureAllocator<U>&) { 204 | return true; 205 | } 206 | 207 | template <typename T, typename U> 208 | bool operator!=(const SecureAllocator<T>&, const SecureAllocator<U>&) { 209 | return false; 210 | } 211 | 212 | } // namespace Json 213 | 214 | #pragma pack(pop) 215 | 216 | #endif // JSON_ALLOCATOR_H_INCLUDED 217 | 218 | // ////////////////////////////////////////////////////////////////////// 219 | // End of content of file: include/json/allocator.h 220 | // ////////////////////////////////////////////////////////////////////// 221 | 222 | 223 | 224 | 225 | 226 | 227 | // ////////////////////////////////////////////////////////////////////// 228 | // Beginning of content of file: include/json/config.h 229 | // ////////////////////////////////////////////////////////////////////// 230 | 231 | // Copyright 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 232 | // Distributed under MIT license, or public domain if desired and 233 | // recognized in your jurisdiction. 234 | // See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE 235 | 236 | #ifndef JSON_CONFIG_H_INCLUDED 237 | #define JSON_CONFIG_H_INCLUDED 238 | #include <cstddef> 239 | #include <cstdint> 240 | #include <istream> 241 | #include <memory> 242 | #include <ostream> 243 | #include <sstream> 244 | #include <string> 245 | #include <type_traits> 246 | 247 | // If non-zero, the library uses exceptions to report bad input instead of C 248 | // assertion macros. The default is to use exceptions. 249 | #ifndef JSON_USE_EXCEPTION 250 | #define JSON_USE_EXCEPTION 1 251 | #endif 252 | 253 | // Temporary, tracked for removal with issue #982. 254 | #ifndef JSON_USE_NULLREF 255 | #define JSON_USE_NULLREF 1 256 | #endif 257 | 258 | /// If defined, indicates that the source file is amalgamated 259 | /// to prevent private header inclusion. 260 | /// Remarks: it is automatically defined in the generated amalgamated header.
261 | // #define JSON_IS_AMALGAMATION 262 | 263 | // Export macros for DLL visibility 264 | #if defined(JSON_DLL_BUILD) 265 | #if defined(_MSC_VER) || defined(__MINGW32__) 266 | #define JSON_API __declspec(dllexport) 267 | #define JSONCPP_DISABLE_DLL_INTERFACE_WARNING 268 | #elif defined(__GNUC__) || defined(__clang__) 269 | #define JSON_API __attribute__((visibility("default"))) 270 | #endif // if defined(_MSC_VER) 271 | 272 | #elif defined(JSON_DLL) 273 | #if defined(_MSC_VER) || defined(__MINGW32__) 274 | #define JSON_API __declspec(dllimport) 275 | #define JSONCPP_DISABLE_DLL_INTERFACE_WARNING 276 | #endif // if defined(_MSC_VER) 277 | #endif // ifdef JSON_DLL_BUILD 278 | 279 | #if !defined(JSON_API) 280 | #define JSON_API 281 | #endif 282 | 283 | #if defined(_MSC_VER) && _MSC_VER < 1800 284 | #error \ 285 | "ERROR: Visual Studio 12 (2013) with _MSC_VER=1800 is the oldest supported compiler with sufficient C++11 capabilities" 286 | #endif 287 | 288 | #if defined(_MSC_VER) && _MSC_VER < 1900 289 | // As recommended at 290 | // https://stackoverflow.com/questions/2915672/snprintf-and-visual-studio-2010 291 | extern JSON_API int msvc_pre1900_c99_snprintf(char* outBuf, size_t size, 292 | const char* format, ...); 293 | #define jsoncpp_snprintf msvc_pre1900_c99_snprintf 294 | #else 295 | #define jsoncpp_snprintf std::snprintf 296 | #endif 297 | 298 | // If JSON_NO_INT64 is defined, then Json only support C++ "int" type for 299 | // integer 300 | // Storages, and 64 bits integer support is disabled. 301 | // #define JSON_NO_INT64 1 302 | 303 | // JSONCPP_OVERRIDE is maintained for backwards compatibility of external tools. 304 | // C++11 should be used directly in JSONCPP. 305 | #define JSONCPP_OVERRIDE override 306 | 307 | #ifdef __clang__ 308 | #if __has_extension(attribute_deprecated_with_message) 309 | #define JSONCPP_DEPRECATED(message) __attribute__((deprecated(message))) 310 | #endif 311 | #elif defined(__GNUC__) // not clang (gcc comes later since clang emulates gcc) 312 | #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) 313 | #define JSONCPP_DEPRECATED(message) __attribute__((deprecated(message))) 314 | #elif (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) 315 | #define JSONCPP_DEPRECATED(message) __attribute__((__deprecated__)) 316 | #endif // GNUC version 317 | #elif defined(_MSC_VER) // MSVC (after clang because clang on Windows emulates 318 | // MSVC) 319 | #define JSONCPP_DEPRECATED(message) __declspec(deprecated(message)) 320 | #endif // __clang__ || __GNUC__ || _MSC_VER 321 | 322 | #if !defined(JSONCPP_DEPRECATED) 323 | #define JSONCPP_DEPRECATED(message) 324 | #endif // if !defined(JSONCPP_DEPRECATED) 325 | 326 | #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 6)) 327 | #define JSON_USE_INT64_DOUBLE_CONVERSION 1 328 | #endif 329 | 330 | #if !defined(JSON_IS_AMALGAMATION) 331 | 332 | #include "allocator.h" 333 | #include "version.h" 334 | 335 | #endif // if !defined(JSON_IS_AMALGAMATION) 336 | 337 | namespace Json { 338 | using Int = int; 339 | using UInt = unsigned int; 340 | #if defined(JSON_NO_INT64) 341 | using LargestInt = int; 342 | using LargestUInt = unsigned int; 343 | #undef JSON_HAS_INT64 344 | #else // if defined(JSON_NO_INT64) 345 | // For Microsoft Visual use specific types as long long is not supported 346 | #if defined(_MSC_VER) // Microsoft Visual Studio 347 | using Int64 = __int64; 348 | using UInt64 = unsigned __int64; 349 | #else // if defined(_MSC_VER) // Other platforms, use long long 350 | using Int64 = int64_t; 351 | 
using UInt64 = uint64_t; 352 | #endif // if defined(_MSC_VER) 353 | using LargestInt = Int64; 354 | using LargestUInt = UInt64; 355 | #define JSON_HAS_INT64 356 | #endif // if defined(JSON_NO_INT64) 357 | 358 | template <typename T> 359 | using Allocator = 360 | typename std::conditional<JSONCPP_USING_SECURE_MEMORY, SecureAllocator<T>, 361 | std::allocator<T>>::type; 362 | using String = std::basic_string<char, std::char_traits<char>, Allocator<char>>; 363 | using IStringStream = 364 | std::basic_istringstream<String::value_type, String::traits_type, 365 | String::allocator_type>; 366 | using OStringStream = 367 | std::basic_ostringstream<String::value_type, String::traits_type, 368 | String::allocator_type>; 369 | using IStream = std::istream; 370 | using OStream = std::ostream; 371 | } // namespace Json 372 | 373 | // Legacy names (formerly macros). 374 | using JSONCPP_STRING = Json::String; 375 | using JSONCPP_ISTRINGSTREAM = Json::IStringStream; 376 | using JSONCPP_OSTRINGSTREAM = Json::OStringStream; 377 | using JSONCPP_ISTREAM = Json::IStream; 378 | using JSONCPP_OSTREAM = Json::OStream; 379 | 380 | #endif // JSON_CONFIG_H_INCLUDED 381 | 382 | // ////////////////////////////////////////////////////////////////////// 383 | // End of content of file: include/json/config.h 384 | // ////////////////////////////////////////////////////////////////////// 385 | 386 | 387 | 388 | 389 | 390 | 391 | // ////////////////////////////////////////////////////////////////////// 392 | // Beginning of content of file: include/json/forwards.h 393 | // ////////////////////////////////////////////////////////////////////// 394 | 395 | // Copyright 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 396 | // Distributed under MIT license, or public domain if desired and 397 | // recognized in your jurisdiction. 398 | // See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE 399 | 400 | #ifndef JSON_FORWARDS_H_INCLUDED 401 | #define JSON_FORWARDS_H_INCLUDED 402 | 403 | #if !defined(JSON_IS_AMALGAMATION) 404 | #include "config.h" 405 | #endif // if !defined(JSON_IS_AMALGAMATION) 406 | 407 | namespace Json { 408 | 409 | // writer.h 410 | class StreamWriter; 411 | class StreamWriterBuilder; 412 | class Writer; 413 | class FastWriter; 414 | class StyledWriter; 415 | class StyledStreamWriter; 416 | 417 | // reader.h 418 | class Reader; 419 | class CharReader; 420 | class CharReaderBuilder; 421 | 422 | // json_features.h 423 | class Features; 424 | 425 | // value.h 426 | using ArrayIndex = unsigned int; 427 | class StaticString; 428 | class Path; 429 | class PathArgument; 430 | class Value; 431 | class ValueIteratorBase; 432 | class ValueIterator; 433 | class ValueConstIterator; 434 | 435 | } // namespace Json 436 | 437 | #endif // JSON_FORWARDS_H_INCLUDED 438 | 439 | // ////////////////////////////////////////////////////////////////////// 440 | // End of content of file: include/json/forwards.h 441 | // ////////////////////////////////////////////////////////////////////// 442 | 443 | 444 | 445 | 446 | 447 | #endif //ifndef JSON_FORWARD_AMALGAMATED_H_INCLUDED 448 | -------------------------------------------------------------------------------- /json_output.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | #include "common.h" 22 | #include "json_output.h" 23 | #include "version.h" 24 | 25 | const std::string NVB_TITLE("nvbandwidth"); 26 | const std::string NVB_HOST_NAME("Hostname"); 27 | const std::string NVB_CUDA_RUNTIME_VERSION("CUDA Runtime Version"); 28 | const std::string NVB_DEVICE_INFO("GPU Device info"); 29 | const std::string NVB_DEVICE_LIST("GPU Device list"); 30 | const std::string NVB_DRIVER_VERSION("Driver Version"); 31 | const std::string NVB_GIT_VERSION("git_version"); 32 | const std::string NVB_VERSION("version"); 33 | const std::string NVB_ERROR("error"); 34 | const std::string NVB_WARNING("warning"); 35 | const std::string NVB_TESTCASES("testcases"); 36 | const std::string NVB_TESTCASE_NAME("name"); 37 | const std::string NVB_TESTCASE_ERROR(NVB_ERROR); 38 | const std::string NVB_STATUS("status"); 39 | const std::string NVB_BW_DESCRIPTION("bandwidth_description"); 40 | const std::string NVB_BW_MATRIX("bandwidth_matrix"); 41 | const std::string NVB_BW_SUM("sum"); 42 | const std::string NVB_BW_MAX("max"); 43 | const std::string NVB_BW_MIN("min"); 44 | const std::string NVB_BW_AVG("average"); 45 | const std::string NVB_BUFFER_SIZE("bufferSize"); 46 | const std::string NVB_TEST_SAMPLES("testSamples"); 47 | const std::string NVB_USE_MEAN("useMean"); 48 | const std::string NVB_PASSED("Passed"); 49 | const std::string NVB_RUNNING("Running"); 50 | const std::string NVB_WAIVED("Waived"); 51 | const std::string NVB_NOT_FOUND("Not Found"); 52 | const std::string NVB_ERROR_STATUS("Error"); 53 | 54 | JsonOutput::JsonOutput(bool _shouldOutput) { 55 | shouldOutput = _shouldOutput; 56 | } 57 | 58 | void JsonOutput::addTestcaseResults(const PeerValueMatrix<double> &matrix, const std::string &description) { 59 | assert(m_root[NVB_TITLE][NVB_TESTCASES].isArray() && m_root[NVB_TITLE][NVB_TESTCASES].size() > 0); 60 | 61 | unsigned int size = m_root[NVB_TITLE][NVB_TESTCASES].size(); 62 | Json::Value &testcase = m_root[NVB_TITLE][NVB_TESTCASES][size-1]; 63 | 64 | double maxVal = std::numeric_limits<double>::min(); 65 | double minVal = std::numeric_limits<double>::max(); 66 | double sum = 0; 67 | int count = 0; 68 | 69 | for (int currentDevice = 0; currentDevice < matrix.m_rows; currentDevice++) { 70 | Json::Value row; 71 | for (int peer = 0; peer < matrix.m_columns; peer++) { 72 | std::optional<double> val = matrix.value(currentDevice, peer); 73 | if (val) { 74 | std::stringstream buf; 75 | buf << val.value(); 76 | row.append(buf.str()); 77 | } else { 78 | row.append("N/A"); 79 | } 80 | sum += val.value_or(0.0); 81 | maxVal = std::max(maxVal, val.value_or(0.0)); 82 | minVal = std::min(minVal, val.value_or(0.0)); 83 | if (val.value_or(0.0) > 0) count++; 84 | } 85 | testcase[NVB_BW_MATRIX].append(row); 86 | } 87 | 88 | testcase[NVB_BW_SUM] = sum; 89 | testcase[NVB_BW_DESCRIPTION] = description; 90 | testcase[NVB_STATUS] = NVB_PASSED; 91 | 92 | if (verbose) { 93 | testcase[NVB_BW_MIN] = minVal; 94 | testcase[NVB_BW_MAX] = maxVal; 95 | testcase[NVB_BW_AVG] = sum/count; 96 | } 97 | } 98 | 99 | void JsonOutput::addTestcase(const std::string
&name, const std::string &status, const std::string &msg) { 100 | Json::Value testcase; 101 | testcase[NVB_TESTCASE_NAME] = name; 102 | testcase[NVB_STATUS] = status; 103 | m_root[NVB_TITLE][NVB_TESTCASES].append(testcase); 104 | } 105 | 106 | void JsonOutput::recordErrorCurrentTest(const std::string &errorPart1, const std::string &errorPart2) { 107 | bool testCaseExists = false; 108 | if (m_root[NVB_TITLE][NVB_TESTCASES].isArray()) { 109 | Json::Value &testcases = m_root[NVB_TITLE][NVB_TESTCASES]; 110 | unsigned int size = testcases.size(); 111 | if (size > 0) { 112 | testcases[size-1][NVB_TESTCASE_ERROR] = errorPart1 + " " + errorPart2; 113 | testCaseExists = true; 114 | } 115 | } 116 | 117 | if (!testCaseExists) { 118 | std::vector<std::string> errors; 119 | errors.emplace_back(errorPart1); 120 | errors.emplace_back(errorPart2); 121 | recordError(errors); 122 | } 123 | } 124 | 125 | void JsonOutput::setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg) { 126 | bool testCaseExists = false; 127 | if (m_root[NVB_TITLE][NVB_TESTCASES].isArray()) { 128 | Json::Value &testcases = m_root[NVB_TITLE][NVB_TESTCASES]; 129 | unsigned int size = testcases.size(); 130 | if (size > 0 && testcases[size-1][NVB_TESTCASE_NAME].asString() == name) { 131 | testcases[size-1][NVB_STATUS] = status; 132 | testCaseExists = true; 133 | } 134 | } 135 | 136 | if (!testCaseExists) { 137 | addTestcase(name, status); 138 | } 139 | } 140 | 141 | void JsonOutput::recordError(const std::string &error) { 142 | m_root[NVB_TITLE][NVB_ERROR] = error; 143 | print(); 144 | } 145 | 146 | void JsonOutput::recordError(const std::vector<std::string> &errorParts) { 147 | std::stringstream buf; 148 | bool first = true; 149 | 150 | for (auto &part : errorParts) { 151 | if (first) { 152 | buf << part << ":"; 153 | first = false; 154 | } else { 155 | buf << " " << part; 156 | } 157 | } 158 | m_root[NVB_TITLE][NVB_ERROR] = buf.str(); 159 | } 160 | 161 | void JsonOutput::recordWarning(const std::string &warning) { 162 | m_root[NVB_TITLE][NVB_WARNING] = warning; 163 | } 164 | 165 | void JsonOutput::addVersionInfo() { 166 | m_root[NVB_TITLE][NVB_VERSION] = NVBANDWIDTH_VERSION; 167 | m_root[NVB_TITLE][NVB_GIT_VERSION] = GIT_VERSION; 168 | } 169 | 170 | void JsonOutput::addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion) { 171 | m_root[NVB_TITLE][NVB_CUDA_RUNTIME_VERSION] = cudaVersion; 172 | m_root[NVB_TITLE][NVB_DRIVER_VERSION] = driverVersion; 173 | } 174 | 175 | void JsonOutput::recordDevices(int deviceCount) { 176 | Json::Value deviceList; 177 | 178 | for (int iDev = 0; iDev < deviceCount; iDev++) { 179 | std::stringstream buf; 180 | buf << iDev << ": " << getDeviceDisplayInfo(iDev) << ": (" << localHostname << ")"; 181 | deviceList.append(buf.str()); 182 | } 183 | m_root[NVB_TITLE][NVB_DEVICE_LIST] = deviceList; 184 | } 185 | 186 | void JsonOutput::print() { 187 | if (shouldOutput) { 188 | std::cout << m_root.toStyledString() << std::endl; 189 | } 190 | } 191 | 192 | void JsonOutput::printInfo() { 193 | // NO-OP 194 | } 195 | -------------------------------------------------------------------------------- /json_output.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef JSON_OUTPUT_H_ 19 | #define JSON_OUTPUT_H_ 20 | 21 | #include 22 | #include 23 | 24 | #include "common.h" 25 | #include "output.h" 26 | 27 | class JsonOutput : public Output { 28 | public: 29 | JsonOutput(bool shouldOutput); 30 | 31 | void addTestcase(const std::string &name, const std::string &status, const std::string &msg = ""); 32 | 33 | void setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg = ""); 34 | 35 | void print(); 36 | 37 | void recordError(const std::string &error); 38 | 39 | void recordError(const std::vector<std::string> &errorParts); 40 | 41 | void recordErrorCurrentTest(const std::string &errorPart1, const std::string &errorPart2); 42 | 43 | void recordWarning(const std::string &warning); 44 | 45 | void addVersionInfo(); 46 | 47 | void addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion); 48 | 49 | void addTestcaseResults(const PeerValueMatrix<double> &matrix, const std::string &description); 50 | 51 | void printInfo(); 52 | 53 | void recordDevices(int deviceCount); 54 | 55 | private: 56 | bool shouldOutput; 57 | Json::Value m_root; 58 | }; 59 | 60 | #endif // JSON_OUTPUT_H_ 61 | -------------------------------------------------------------------------------- /kernels.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #include "kernels.cuh" 19 | 20 | __global__ void simpleCopyKernel(unsigned long long loopCount, uint4 *dst, uint4 *src) { 21 | for (unsigned int i = 0; i < loopCount; i++) { 22 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 23 | size_t offset = idx * sizeof(uint4); 24 | uint4* dst_uint4 = reinterpret_cast<uint4*>((char*)dst + offset); 25 | uint4* src_uint4 = reinterpret_cast<uint4*>((char*)src + offset); 26 | __stcg(dst_uint4, __ldcg(src_uint4)); 27 | } 28 | } 29 | 30 | __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, uint4* dst, uint4* src, size_t chunkSizeInElement) { 31 | unsigned long long from = blockDim.x * blockIdx.x + threadIdx.x; 32 | unsigned long long bigChunkSizeInElement = chunkSizeInElement / 12; 33 | dst += from; 34 | src += from; 35 | uint4* dstBigEnd = dst + (bigChunkSizeInElement * 12) * totalThreadCount; 36 | uint4* dstEnd = dst + chunkSizeInElement * totalThreadCount; 37 | 38 | for (unsigned int i = 0; i < loopCount; i++) { 39 | uint4* cdst = dst; 40 | uint4* csrc = src; 41 | 42 | while (cdst < dstBigEnd) { 43 | uint4 pipe_0 = *csrc; csrc += totalThreadCount; 44 | uint4 pipe_1 = *csrc; csrc += totalThreadCount; 45 | uint4 pipe_2 = *csrc; csrc += totalThreadCount; 46 | uint4 pipe_3 = *csrc; csrc += totalThreadCount; 47 | uint4 pipe_4 = *csrc; csrc += totalThreadCount; 48 | uint4 pipe_5 = *csrc; csrc += totalThreadCount; 49 | uint4 pipe_6 = *csrc; csrc += totalThreadCount; 50 | uint4 pipe_7 = *csrc; csrc += totalThreadCount; 51 | uint4 pipe_8 = *csrc; csrc += totalThreadCount; 52 | uint4 pipe_9 = *csrc; csrc += totalThreadCount; 53 | uint4 pipe_10 = *csrc; csrc += totalThreadCount; 54 | uint4 pipe_11 = *csrc; csrc += totalThreadCount; 55 | 56 | *cdst = pipe_0; cdst += totalThreadCount; 57 | *cdst = pipe_1; cdst += totalThreadCount; 58 | *cdst = pipe_2; cdst += totalThreadCount; 59 | *cdst = pipe_3; cdst += totalThreadCount; 60 | *cdst = pipe_4; cdst += totalThreadCount; 61 | *cdst = pipe_5; cdst += totalThreadCount; 62 | *cdst = pipe_6; cdst += totalThreadCount; 63 | *cdst = pipe_7; cdst += totalThreadCount; 64 | *cdst = pipe_8; cdst += totalThreadCount; 65 | *cdst = pipe_9; cdst += totalThreadCount; 66 | *cdst = pipe_10; cdst += totalThreadCount; 67 | *cdst = pipe_11; cdst += totalThreadCount; 68 | } 69 | 70 | while (cdst < dstEnd) { 71 | *cdst = *csrc; cdst += totalThreadCount; csrc += totalThreadCount; 72 | } 73 | } 74 | } 75 | 76 | // This kernel performs a split warp copy, alternating copy directions across warps.
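// Editor's note (illustrative, not in the original source): with warpSize == 32, odd-numbered warps stream src -> dst while even-numbered warps stream dst -> src; e.g. the lanes of warp 1 read src[32..63] and write dst[32..63], while the lanes of warp 0 read dst[0..31] and write src[0..31]. A single launch therefore drives traffic in both directions over the same link.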
77 | __global__ void splitWarpCopyKernel(unsigned long long loopCount, uint4 *dst, uint4 *src) { 78 | for (unsigned int i = 0; i < loopCount; i++) { 79 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 80 | unsigned int globalWarpId = idx / warpSize; 81 | unsigned int warpLaneId = idx % warpSize; 82 | uint4* dst_uint4; 83 | uint4* src_uint4; 84 | 85 | // alternate copy directions across warps 86 | if (globalWarpId & 0x1) { 87 | // odd warp 88 | dst_uint4 = dst + (globalWarpId * warpSize + warpLaneId); 89 | src_uint4 = src + (globalWarpId * warpSize + warpLaneId); 90 | } else { 91 | // even warp 92 | dst_uint4 = src + (globalWarpId * warpSize + warpLaneId); 93 | src_uint4 = dst + (globalWarpId * warpSize + warpLaneId); 94 | } 95 | 96 | __stcg(dst_uint4, __ldcg(src_uint4)); 97 | } 98 | } 99 | 100 | __global__ void ptrChasingKernel(struct LatencyNode *data, size_t size, unsigned int accesses, unsigned int targetBlock) { 101 | struct LatencyNode *p = data; 102 | if (blockIdx.x != targetBlock) return; 103 | for (auto i = 0; i < accesses; ++i) { 104 | p = p->next; 105 | } 106 | 107 | // avoid compiler optimization 108 | if (p == nullptr) { 109 | __trap(); 110 | } 111 | } 112 | 113 | static __device__ __noinline__ 114 | void mc_st_u32(unsigned int *dst, unsigned int v) { 115 | #if __CUDA_ARCH__ >= 900 116 | asm volatile ("multimem.st.u32 [%0], %1;" :: "l"(dst), "r" (v)); 117 | #endif 118 | } 119 | 120 | static __device__ __noinline__ 121 | void mc_ld_u32(unsigned int *dst, const unsigned int *src) { 122 | #if __CUDA_ARCH__ >= 900 123 | asm volatile ("multimem.ld_reduce.and.b32 %0, [%1];" : "=r"((*dst)) : "l" (src)); 124 | #endif 125 | } 126 | 127 | // Writes from regular memory to multicast memory 128 | __global__ void multicastCopyKernel(unsigned long long loopCount, unsigned int* __restrict__ dst, unsigned int* __restrict__ src, size_t nElems) { 129 | const size_t totalThreadCount = blockDim.x * gridDim.x; 130 | const size_t offset = blockDim.x * blockIdx.x + threadIdx.x; 131 | unsigned int* const enddst = dst + nElems; 132 | dst += offset; 133 | src += offset; 134 | 135 | for (unsigned int i = 0; i < loopCount; i++) { 136 | // Reset pointers to src and dst chunks. 
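// Editorial note (based on PTX multimem semantics; not in the original source): each thread then walks its chunk with a stride of totalThreadCount, and every mc_st_u32() issues a multimem.st, so the fabric replicates each store to all devices bound to the multicast object instead of copying device-by-device.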
137 | unsigned int* cur_src_ptr = src; 138 | unsigned int* cur_dst_ptr = dst; 139 | #pragma unroll(12) 140 | while (cur_dst_ptr < enddst) { 141 | mc_st_u32(cur_dst_ptr, *cur_src_ptr); 142 | cur_dst_ptr += totalThreadCount; 143 | cur_src_ptr += totalThreadCount; 144 | } 145 | } 146 | } 147 | 148 | double latencyPtrChaseKernel(const int srcId, void* data, size_t size, unsigned long long latencyMemAccessCnt, unsigned smCount) { 149 | CUstream stream; 150 | int device, clock_rate_khz; 151 | double latencySum = 0.0, finalLatencyPerAccessNs = 0.0; 152 | CUcontext srcCtx; 153 | cudaEvent_t start, end; 154 | float latencyMs = 0; 155 | 156 | CUDA_ASSERT(cudaEventCreate(&start)); 157 | CUDA_ASSERT(cudaEventCreate(&end)); 158 | 159 | CU_ASSERT(cuDevicePrimaryCtxRetain(&srcCtx, srcId)); 160 | CU_ASSERT(cuCtxSetCurrent(srcCtx)); 161 | 162 | CU_ASSERT(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); 163 | CU_ASSERT(cuCtxGetDevice(&device)); 164 | CU_ASSERT(cuDeviceGetAttribute(&clock_rate_khz, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); 165 | 166 | for (int targetBlock = 0; targetBlock < smCount; ++targetBlock) { 167 | CUDA_ASSERT(cudaEventRecord(start, stream)); 168 | ptrChasingKernel <<< smCount, 1, 0, stream>>> ((struct LatencyNode*) data, size, latencyMemAccessCnt / smCount, targetBlock); 169 | CUDA_ASSERT(cudaEventRecord(end, stream)); 170 | CUDA_ASSERT(cudaGetLastError()); 171 | CU_ASSERT(cuStreamSynchronize(stream)); 172 | cudaEventElapsedTime(&latencyMs, start, end); 173 | latencySum += (latencyMs / 1000); 174 | } 175 | finalLatencyPerAccessNs = (latencySum * 1.0E9) / (latencyMemAccessCnt); 176 | 177 | CUDA_ASSERT(cudaEventDestroy(start)); 178 | CUDA_ASSERT(cudaEventDestroy(end)); 179 | 180 | return finalLatencyPerAccessNs; 181 | } 182 | 183 | size_t copyKernel(MemcpyDescriptor &desc) { 184 | CUdevice dev; 185 | CUcontext ctx; 186 | 187 | CU_ASSERT(cuStreamGetCtx(desc.stream, &ctx)); 188 | CU_ASSERT(cuCtxGetDevice(&dev)); 189 | 190 | int numSm; 191 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 192 | unsigned int totalThreadCount = numSm * numThreadPerBlock; 193 | 194 | // If the user-provided buffer size is smaller than the default buffer size, 195 | // we use the simple copy kernel for our bandwidth test. 196 | // This is done so that no truncation of the buffer size occurs. 197 | // Please note that to achieve peak bandwidth, it is suggested to use the 198 | // default buffer size, which in turn triggers the use of the optimized 199 | // kernel.
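// Illustrative arithmetic (editor's example, not from the original source): assuming smallBufferThreshold * _MiB works out to 16 MiB, a 4 MiB copy takes the branch below with numUint4 = 4 MiB / sizeof(uint4) = 262144, giving block = 1024 threads and grid = 262144 / 1024 = 256 blocks.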
200 | if (desc.copySize < (smallBufferThreshold * _MiB)) { 201 | // copy size is rounded down to 16 bytes 202 | unsigned int numUint4 = desc.copySize / sizeof(uint4); 203 | // we allow max 1024 threads per block, and then scale out the copy across multiple blocks 204 | dim3 block(std::min(numUint4, static_cast<unsigned int>(1024))); 205 | dim3 grid(numUint4/block.x); 206 | simpleCopyKernel <<<grid, block, 0, desc.stream>>> (desc.loopCount, (uint4 *)desc.dst, (uint4 *)desc.src); 207 | return numUint4 * sizeof(uint4); 208 | } 209 | 210 | // adjust size to elements (size is multiple of MB, so no truncation here) 211 | size_t sizeInElement = desc.copySize / sizeof(uint4); 212 | // this truncates the copy 213 | sizeInElement = totalThreadCount * (sizeInElement / totalThreadCount); 214 | 215 | size_t chunkSizeInElement = sizeInElement / totalThreadCount; 216 | 217 | dim3 gridDim(numSm, 1, 1); 218 | dim3 blockDim(numThreadPerBlock, 1, 1); 219 | stridingMemcpyKernel<<<gridDim, blockDim, 0, desc.stream>>> (totalThreadCount, desc.loopCount, (uint4 *)desc.dst, (uint4 *)desc.src, chunkSizeInElement); 220 | 221 | return sizeInElement * sizeof(uint4); 222 | } 223 | 224 | size_t copyKernelSplitWarp(MemcpyDescriptor &desc) { 225 | CUdevice dev; 226 | CUcontext ctx; 227 | 228 | CU_ASSERT(cuStreamGetCtx(desc.stream, &ctx)); 229 | CU_ASSERT(cuCtxGetDevice(&dev)); 230 | 231 | int numSm; 232 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 233 | 234 | // copy size is rounded down to 16 bytes 235 | unsigned int numUint4 = desc.copySize / sizeof(uint4); 236 | 237 | // we allow max 1024 threads per block, and then scale out the copy across multiple blocks 238 | dim3 block(std::min(numUint4, static_cast<unsigned int>(1024))); 239 | dim3 grid(numUint4/block.x); 240 | splitWarpCopyKernel <<<grid, block, 0, desc.stream>>> (desc.loopCount, (uint4 *)desc.dst, (uint4 *)desc.src); 241 | return numUint4 * sizeof(uint4); 242 | } 243 | 244 | size_t multicastCopy(CUdeviceptr dstBuffer, CUdeviceptr srcBuffer, size_t size, CUstream stream, unsigned long long loopCount) { 245 | CUdevice dev; 246 | CUcontext ctx; 247 | 248 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 249 | CU_ASSERT(cuCtxGetDevice(&dev)); 250 | 251 | int numSm; 252 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 253 | // adjust size to elements (size is multiple of MB, so no truncation here) 254 | size_t sizeInElement = size / sizeof(unsigned); 255 | dim3 gridDim(numSm, 1, 1); 256 | dim3 blockDim(numThreadPerBlock, 1, 1); 257 | multicastCopyKernel<<<gridDim, blockDim, 0, stream>>> (loopCount, (unsigned *)dstBuffer, (unsigned *)srcBuffer, sizeInElement); 258 | return sizeInElement * sizeof(unsigned); 259 | } 260 | 261 | __global__ void spinKernelDevice(volatile int *latch, const unsigned long long timeoutClocks) { 262 | unsigned long long endTime = clock64() + timeoutClocks; 263 | while (!*latch) { 264 | if (timeoutClocks != ~0ULL && clock64() > endTime) { 265 | break; 266 | } 267 | } 268 | } 269 | 270 | CUresult spinKernel(volatile int *latch, CUstream stream, unsigned long long timeoutMs) { 271 | int clocksPerMs = 0; 272 | CUcontext ctx; 273 | CUdevice dev; 274 | 275 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 276 | CU_ASSERT(cuCtxGetDevice(&dev)); 277 | 278 | CU_ASSERT(cuDeviceGetAttribute(&clocksPerMs, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev)); 279 | 280 | unsigned long long timeoutClocks = clocksPerMs * timeoutMs; 281 | 282 | spinKernelDevice<<<1, 1, 0, stream>>>(latch, timeoutClocks); 283 | 284 | return CUDA_SUCCESS; 285 | } 286 | 287 | __global__ void spinKernelDeviceMultistage(volatile int *latch1, volatile int *latch2,
const unsigned long long timeoutClocks) { 288 | if (latch1) { 289 | unsigned long long endTime = clock64() + timeoutClocks; 290 | while (!*latch1) { 291 | if (timeoutClocks != ~0ULL && clock64() > endTime) { 292 | return; 293 | } 294 | } 295 | 296 | *latch2 = 1; 297 | } 298 | 299 | unsigned long long endTime = clock64() + timeoutClocks; 300 | while (!*latch2) { 301 | if (timeoutClocks != ~0ULL && clock64() > endTime) { 302 | break; 303 | } 304 | } 305 | } 306 | 307 | // Implements a 2-stage spin kernel for multi-node synchronization. 308 | // One of the host nodes releases the first latch; the kernel then 309 | // releases the second latch, which is polled by all other devices. 310 | // The latch1 argument is optional: if non-null, the kernel spins on it until it is released, and then releases latch2. 311 | // The latch2 argument is mandatory; the kernel spins on it until it is released. 312 | // The timeoutMs argument applies to each stage separately. 313 | // However, since each kernel spins on only one stage, total runtime is still bounded by timeoutMs. 314 | CUresult spinKernelMultistage(volatile int *latch1, volatile int *latch2, CUstream stream, unsigned long long timeoutMs) { 315 | int clocksPerMs = 0; 316 | CUcontext ctx; 317 | CUdevice dev; 318 | 319 | ASSERT(latch2 != nullptr); 320 | 321 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 322 | CU_ASSERT(cuCtxGetDevice(&dev)); 323 | 324 | CU_ASSERT(cuDeviceGetAttribute(&clocksPerMs, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev)); 325 | 326 | unsigned long long timeoutClocks = clocksPerMs * timeoutMs; 327 | 328 | spinKernelDeviceMultistage<<<1, 1, 0, stream>>>(latch1, latch2, timeoutClocks); 329 | 330 | return CUDA_SUCCESS; 331 | } 332 | 333 | __global__ void memsetKernelDevice(CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements) { 334 | unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; 335 | unsigned int* buf = reinterpret_cast<unsigned int*>(buffer); 336 | unsigned int* pat = reinterpret_cast<unsigned int*>(pattern); 337 | 338 | if (idx < num_elements) { 339 | buf[idx] = pat[idx % num_pattern_elements]; 340 | } 341 | } 342 | 343 | // This kernel clears memory locations in the buffer based on warp parity. 344 | // If clearOddWarpIndexed is true, it clears buffer locations indexed by odd warps. 345 | // Otherwise, it clears buffer locations indexed by even warps.
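// Editorial example (not in the original source): with warpSize == 32 and clearOddWarpIndexed == true, warp 1 zeroes buf[32..63] and warp 3 zeroes buf[96..127], while even-indexed warps leave their uint4 elements intact; a second call with clearOddWarpIndexed == false clears the complement.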
346 | __global__ void memclearKernelByWarpParityDevice(CUdeviceptr buffer, bool clearOddWarpIndexed) { 347 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 348 | uint4* buf = reinterpret_cast<uint4*>(buffer); 349 | unsigned int globalWarpId = idx / warpSize; 350 | unsigned int thread_idx_in_warp = idx % warpSize; 351 | 352 | if (clearOddWarpIndexed) { 353 | // clear memory locations in buffer indexed by odd warps 354 | if (globalWarpId & 0x1) { 355 | buf[globalWarpId * warpSize + thread_idx_in_warp] = make_uint4(0x0, 0x0, 0x0, 0x0); 356 | } 357 | } else { 358 | // clear memory locations in buffer indexed by even warps 359 | if (!(globalWarpId & 0x1)) { 360 | buf[globalWarpId * warpSize + thread_idx_in_warp] = make_uint4(0x0, 0x0, 0x0, 0x0); 361 | } 362 | } 363 | } 364 | 365 | __global__ void memcmpKernelDevice(CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 366 | unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; 367 | unsigned int* buf = reinterpret_cast<unsigned int*>(buffer); 368 | unsigned int* pat = reinterpret_cast<unsigned int*>(pattern); 369 | 370 | if (idx < num_elements) { 371 | if (buf[idx] != pat[idx % num_pattern_elements]) { 372 | if (atomicCAS((int*)errorFlag, 0, 1) == 0) { 373 | // have the first thread that detects a mismatch print the error message 374 | printf(" Invalid value when checking the pattern at %p\n", (void*)((char*)buffer)); 375 | printf(" Current offset : %llu \n", idx); 376 | return; 377 | } 378 | } 379 | } 380 | } 381 | 382 | __global__ void multicastMemcmpKernelDevice(CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 383 | unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; 384 | unsigned int* buf = reinterpret_cast<unsigned int*>(buffer); 385 | unsigned int* pat = reinterpret_cast<unsigned int*>(pattern); 386 | 387 | if (idx < num_elements) { 388 | unsigned buf_val; 389 | mc_ld_u32(&buf_val, &buf[idx]); 390 | if (buf_val != pat[idx % num_pattern_elements]) { 391 | if (atomicCAS((int*)errorFlag, 0, 1) == 0) { 392 | // have the first thread that detects a mismatch print the error message 393 | printf(" Invalid value when checking the pattern at %p\n", (void*)((char*)buffer)); 394 | printf(" Current offset : %llu \n", idx); 395 | return; 396 | } 397 | } 398 | } 399 | } 400 | 401 | CUresult memsetKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements) { 402 | unsigned threadsPerBlock = 1024; 403 | unsigned long long blocks = (num_elements + threadsPerBlock - 1) / threadsPerBlock; 404 | 405 | memsetKernelDevice<<<blocks, threadsPerBlock, 0, stream>>>(buffer, pattern, num_elements, num_pattern_elements); 406 | CUDA_ASSERT(cudaGetLastError()); 407 | return CUDA_SUCCESS; 408 | } 409 | 410 | CUresult memclearKernelByWarpParity(CUstream stream, CUdeviceptr buffer, size_t size, bool clearOddWarpIndexed) { 411 | CUdevice dev; 412 | CUcontext ctx; 413 | 414 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 415 | CU_ASSERT(cuCtxGetDevice(&dev)); 416 | 417 | int numSm; 418 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 419 | // copy size is rounded down to 16 bytes 420 | unsigned int numUint4 = size / sizeof(uint4); 421 | 422 | // we allow max 1024 threads per block, and then scale out the copy across multiple blocks 423 | dim3 block(std::min(numUint4, static_cast<unsigned int>(1024))); 424 | 425 | dim3 grid(numUint4/block.x); 426 | memclearKernelByWarpParityDevice <<<grid, block, 0, stream>>>
(buffer, clearOddWarpIndexed); 427 | CUDA_ASSERT(cudaGetLastError()); 428 | return CUDA_SUCCESS; 429 | } 430 | 431 | CUresult memcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 432 | unsigned threadsPerBlock = 1024; 433 | unsigned long long blocks = (num_elements + threadsPerBlock - 1) / threadsPerBlock; 434 | 435 | memcmpKernelDevice<<<blocks, threadsPerBlock, 0, stream>>>(buffer, pattern, num_elements, num_pattern_elements, errorFlag); 436 | CUDA_ASSERT(cudaGetLastError()); 437 | return CUDA_SUCCESS; 438 | } 439 | 440 | CUresult multicastMemcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 441 | unsigned threadsPerBlock = 1024; 442 | unsigned long long blocks = (num_elements + threadsPerBlock - 1) / threadsPerBlock; 443 | 444 | multicastMemcmpKernelDevice<<<blocks, threadsPerBlock, 0, stream>>>(buffer, pattern, num_elements, num_pattern_elements, errorFlag); 445 | CUDA_ASSERT(cudaGetLastError()); 446 | return CUDA_SUCCESS; 447 | } 448 | 449 | void preloadKernels(int deviceCount) { 450 | cudaFuncAttributes unused; 451 | for (int iDev = 0; iDev < deviceCount; iDev++) { 452 | cudaSetDevice(iDev); 453 | cudaFuncGetAttributes(&unused, &stridingMemcpyKernel); 454 | cudaFuncGetAttributes(&unused, &spinKernelDevice); 455 | cudaFuncGetAttributes(&unused, &spinKernelDeviceMultistage); 456 | cudaFuncGetAttributes(&unused, &simpleCopyKernel); 457 | cudaFuncGetAttributes(&unused, &splitWarpCopyKernel); 458 | cudaFuncGetAttributes(&unused, &multicastCopyKernel); 459 | cudaFuncGetAttributes(&unused, &ptrChasingKernel); 460 | cudaFuncGetAttributes(&unused, &multicastCopyKernel); 461 | cudaFuncGetAttributes(&unused, &memsetKernelDevice); 462 | cudaFuncGetAttributes(&unused, &memcmpKernelDevice); 463 | cudaFuncGetAttributes(&unused, &multicastMemcmpKernelDevice); 464 | } 465 | } 466 | 467 | 468 | 469 | -------------------------------------------------------------------------------- /kernels.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #ifndef KERNELS_CUH_ 19 | #define KERNELS_CUH_ 20 | 21 | #include 22 | #include "common.h" 23 | #include "inline_common.h" 24 | 25 | const unsigned long long DEFAULT_SPIN_KERNEL_TIMEOUT_MS = 10000ULL; // 10 seconds 26 | 27 | size_t copyKernel(MemcpyDescriptor &desc); 28 | size_t copyKernelSplitWarp(MemcpyDescriptor &desc); 29 | size_t multicastCopy(CUdeviceptr dstBuffer, CUdeviceptr srcBuffer, size_t size, CUstream stream, unsigned long long loopCount); 30 | CUresult spinKernel(volatile int *latch, CUstream stream, unsigned long long timeoutMs = DEFAULT_SPIN_KERNEL_TIMEOUT_MS); 31 | CUresult spinKernelMultistage(volatile int *latch1, volatile int *latch2, CUstream stream, unsigned long long timeoutMs = DEFAULT_SPIN_KERNEL_TIMEOUT_MS); 32 | void preloadKernels(int deviceCount); 33 | double latencyPtrChaseKernel(const int srcId, void* data, size_t size, unsigned long long latencyMemAccessCnt, unsigned smCount); 34 | CUresult memsetKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements); 35 | CUresult memcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag); 36 | CUresult multicastMemcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag); 37 | 38 | CUresult memclearKernelByWarpParity(CUstream stream, CUdeviceptr buffer, size_t size, bool clearOddWarpIndexed); 39 | #endif // KERNELS_CUH_ 40 | -------------------------------------------------------------------------------- /memcpy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #ifndef MEMCPY_H_ 19 | #define MEMCPY_H_ 20 | 21 | #include 22 | #include "common.h" 23 | 24 | class MemcpyBuffer { 25 | protected: 26 | void* buffer{}; 27 | size_t bufferSize; 28 | public: 29 | MemcpyBuffer(size_t bufferSize); 30 | virtual ~MemcpyBuffer() {} 31 | CUdeviceptr getBuffer() const; 32 | size_t getBufferSize() const; 33 | 34 | virtual int getBufferIdx() const = 0; 35 | virtual CUcontext getPrimaryCtx() const = 0; 36 | virtual std::string getBufferString() const = 0; 37 | // In MPI configuration we want to avoid using blocking functions such as cuStreamSynchronize to adhere to MPI notion of progress 38 | // For more details see https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#mpi-progress 39 | virtual CUresult streamSynchronizeWrapper(CUstream stream) const; 40 | virtual int getMPIRank() const; 41 | }; 42 | 43 | // Represents the host buffer abstraction 44 | class HostBuffer : public MemcpyBuffer { 45 | public: 46 | // NUMA affinity is set here through allocation of memory in the socket group where `targetDeviceId` resides 47 | HostBuffer(size_t bufferSize, int targetDeviceId); 48 | ~HostBuffer(); 49 | 50 | int getBufferIdx() const override; 51 | CUcontext getPrimaryCtx() const override; 52 | virtual std::string getBufferString() const override; 53 | }; 54 | 55 | // Represents the device buffer and context abstraction 56 | class DeviceBuffer : public MemcpyBuffer { 57 | private: 58 | int deviceIdx; 59 | CUcontext primaryCtx{}; 60 | public: 61 | DeviceBuffer(size_t bufferSize, int deviceIdx); 62 | ~DeviceBuffer(); 63 | 64 | int getBufferIdx() const override; 65 | CUcontext getPrimaryCtx() const override; 66 | virtual std::string getBufferString() const override; 67 | 68 | bool enablePeerAcess(const DeviceBuffer &peerBuffer); 69 | }; 70 | 71 | // Specifies the preferred node's context to do the operation from 72 | // It's only a preference because if the preferred node is a HostBuffer, it has no context and will fall back to the other node 73 | enum ContextPreference { 74 | PREFER_SRC_CONTEXT, // Prefer the source buffer's context if available 75 | PREFER_DST_CONTEXT // Prefer the destination buffer's context if available 76 | }; 77 | 78 | class MemcpyOperation; 79 | 80 | // forward declaration 81 | class NodeHelper; 82 | 83 | class MemcpyDispatchInfo { 84 | public: 85 | std::vector<CUcontext> contexts; 86 | std::vector<CUstream> streams; 87 | std::vector<const MemcpyBuffer*> srcBuffers; 88 | std::vector<const MemcpyBuffer*> dstBuffers; 89 | std::vector<int> originalRanks; 90 | std::vector<size_t> adjustedCopySizes; 91 | std::shared_ptr<NodeHelper> nodeHelper; 92 | MemcpyDispatchInfo(std::vector<const MemcpyBuffer*> srcBuffers, std::vector<const MemcpyBuffer*> dstBuffers, std::vector<CUcontext> contexts, std::vector<int> originalRanks = {}); 93 | }; 94 | 95 | class NodeHelper { 96 | public: 97 | virtual MemcpyDispatchInfo dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers, ContextPreference ctxPreference) = 0; 98 | virtual double calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount) = 0; 99 | virtual double calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) = 0; 100 | virtual double calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) = 0; 101 | virtual std::vector<double> calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks) = 0; 102 | virtual void synchronizeProcess() = 0; 103 | // In MPI configuration we want to avoid using blocking functions such as cuStreamSynchronize to adhere to MPI notion of progress 104 | // For more details see https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#mpi-progress 105 |
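// (Editorial note: a typical non-blocking implementation busy-polls cuStreamQuery() while interleaving MPI_Iprobe() so the MPI runtime can make progress; see MPIstreamSyncHelper() in multinode_memcpy.cpp below.)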
virtual CUresult streamSynchronizeWrapper(CUstream stream) const = 0; 106 | 107 | // stream blocking functions 108 | virtual void streamBlockerReset() = 0; 109 | virtual void streamBlockerRelease() = 0; 110 | virtual void streamBlockerBlock(CUstream stream) = 0; 111 | }; 112 | 113 | class NodeHelperSingle : public NodeHelper { 114 | private: 115 | volatile int* blockingVarHost; 116 | public: 117 | NodeHelperSingle(); 118 | ~NodeHelperSingle(); 119 | MemcpyDispatchInfo dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers, ContextPreference ctxPreference); 120 | double calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount); 121 | double calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 122 | double calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 123 | std::vector<double> calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks); 124 | void synchronizeProcess(); 125 | CUresult streamSynchronizeWrapper(CUstream stream) const; 126 | 127 | // stream blocking functions 128 | void streamBlockerReset(); 129 | void streamBlockerRelease(); 130 | void streamBlockerBlock(CUstream stream); 131 | }; 132 | 133 | class MemcpyInitiator { 134 | public: 135 | // Pure virtual function for implementation of the actual memcpy function 136 | // return actual bytes copied 137 | // This can vary from copySize because SM copies truncate the copy to achieve max bandwidth 138 | virtual size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor) = 0; 139 | // Calculate the truncated sizes used by copy kernels 140 | virtual size_t getAdjustedCopySize(size_t size, CUstream stream) = 0; 141 | // Fill buffer with a pattern 142 | virtual void memsetPattern(MemcpyDispatchInfo &info) const = 0; 143 | // Compare buffer with a pattern 144 | virtual void memcmpPattern(MemcpyDispatchInfo &info) const = 0; 145 | // Adjust the bandwidth before final reporting 146 | virtual unsigned long long getAdjustedBandwidth(unsigned long long bandwidth) = 0; 147 | }; 148 | 149 | class MemcpyInitiatorSM : public MemcpyInitiator { 150 | public: 151 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 152 | // Calculate the truncated sizes used by copy kernels 153 | size_t getAdjustedCopySize(size_t size, CUstream stream); 154 | // Fill buffer with a pattern 155 | void memsetPattern(MemcpyDispatchInfo &info) const; 156 | // Compare buffer with a pattern 157 | void memcmpPattern(MemcpyDispatchInfo &info) const; 158 | // Adjust the bandwidth before final reporting 159 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 160 | }; 161 | 162 | class MemcpyInitiatorCE : public MemcpyInitiator { 163 | public: 164 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 165 | // Calculate the truncated sizes used by copy kernels 166 | size_t getAdjustedCopySize(size_t size, CUstream stream); 167 | // Fill buffer with a pattern 168 | void memsetPattern(MemcpyDispatchInfo &info) const; 169 | // Compare buffer with a pattern 170 | void memcmpPattern(MemcpyDispatchInfo &info) const; 171 | // Adjust the bandwidth before final reporting 172 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 173 | }; 174 | 175 | class MemcpyInitiatorMulticastWrite : public MemcpyInitiator { 176 | public: 177 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 178 | // Calculate the truncated sizes used by copy kernels 179 | size_t getAdjustedCopySize(size_t size, CUstream stream); 180 | // Fill buffer with a pattern 181 | void memsetPattern(MemcpyDispatchInfo &info) const;
182 | // Compare buffer with a pattern 183 | void memcmpPattern(MemcpyDispatchInfo &info) const; 184 | // Adjust the bandwidth before final reporting 185 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 186 | }; 187 | 188 | class MemcpyInitiatorSMSplitWarp : public MemcpyInitiatorSM { 189 | public: 190 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 191 | // Fill buffer with a pattern 192 | void memsetPattern(MemcpyDispatchInfo &info) const; 193 | // Compare buffer with a pattern 194 | void memcmpPattern(MemcpyDispatchInfo &info) const; 195 | // Adjust the bandwidth before final reporting 196 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 197 | }; 198 | 199 | // Abstraction of a memory Operation. 200 | class MemoryOperation { 201 | public: 202 | MemoryOperation() = default; 203 | ~MemoryOperation() = default; 204 | }; 205 | 206 | // Abstraction of a memcpy operation 207 | class MemcpyOperation : public MemoryOperation { 208 | public: 209 | // Specifies which bandwidths to use for the final result of simultaneous copies 210 | enum BandwidthValue { 211 | USE_FIRST_BW, // Use the bandwidth of the first copy in the simultaneous copy list 212 | SUM_BW, // Use the sum of all bandwidths from the simultaneous copy list 213 | TOTAL_BW, // Use the total bandwidth of all copies, based on total time and total bytes copied 214 | VECTOR_BW, // Return bandwidths of each copy separately 215 | }; 216 | 217 | ContextPreference ctxPreference; 218 | 219 | private: 220 | unsigned long long loopCount; 221 | 222 | protected: 223 | size_t *procMask; 224 | BandwidthValue bandwidthValue; 225 | 226 | std::shared_ptr<NodeHelper> nodeHelper; 227 | std::shared_ptr<MemcpyInitiator> memcpyInitiator; 228 | 229 | public: 230 | MemcpyOperation(unsigned long long loopCount, MemcpyInitiator *_memcpyInitiator, ContextPreference ctxPreference = ContextPreference::PREFER_SRC_CONTEXT, BandwidthValue bandwidthValue = BandwidthValue::USE_FIRST_BW); 231 | MemcpyOperation(unsigned long long loopCount, MemcpyInitiator *_memcpyInitiator, NodeHelper *_nodeHelper, ContextPreference ctxPreference = ContextPreference::PREFER_SRC_CONTEXT, BandwidthValue bandwidthValue = BandwidthValue::USE_FIRST_BW); 232 | virtual ~MemcpyOperation(); 233 | 234 | // Lists of paired nodes will be executed simultaneously 235 | // context of srcBuffers is preferred (if not host) unless otherwise specified 236 | std::vector<double> doMemcpyCore(MemcpyDispatchInfo &info); 237 | std::vector<double> doMemcpyVector(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers); 238 | double doMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers); 239 | double doMemcpy(const MemcpyBuffer &srcBuffer, const MemcpyBuffer &dstBuffer); 240 | }; 241 | 242 | class MemPtrChaseOperation : public MemoryOperation { 243 | public: 244 | MemPtrChaseOperation(unsigned long long loopCount); 245 | ~MemPtrChaseOperation() = default; 246 | double doPtrChase(const int srcId, const MemcpyBuffer &peerBuffer); 247 | private: 248 | unsigned long long loopCount; 249 | unsigned int smCount; 250 | }; 251 | 252 | #endif // MEMCPY_H_ 253 | -------------------------------------------------------------------------------- /multinode_memcpy.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifdef MULTINODE 19 | #include 20 | #include 21 | 22 | #include "kernels.cuh" 23 | #include "multinode_memcpy.h" 24 | 25 | MultinodeMemoryAllocation::MultinodeMemoryAllocation(size_t bufferSize, int MPI_rank): bufferSize(bufferSize), MPI_rank(MPI_rank) { 26 | cudaSetDevice(localDevice); 27 | } 28 | 29 | static CUresult MPIstreamSyncHelper(CUstream stream) { 30 | CUresult err = CUDA_ERROR_NOT_READY; 31 | int flag; 32 | while (err == CUDA_ERROR_NOT_READY) { 33 | err = cuStreamQuery(stream); 34 | MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE); 35 | } 36 | return err; 37 | } 38 | 39 | CUresult MultinodeMemoryAllocation::streamSynchronizeWrapper(CUstream stream) const { 40 | return MPIstreamSyncHelper(stream); 41 | } 42 | 43 | MultinodeMemoryAllocationUnicast::MultinodeMemoryAllocationUnicast(size_t bufferSize, int MPI_rank): MultinodeMemoryAllocation(bufferSize, MPI_rank) { 44 | handleType = CU_MEM_HANDLE_TYPE_FABRIC; 45 | prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 46 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 47 | prop.location.id = localDevice; 48 | prop.requestedHandleTypes = handleType; 49 | 50 | size_t granularity = 0; 51 | CU_ASSERT(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); 52 | 53 | roundedUpAllocationSize = ROUND_UP(bufferSize, granularity); 54 | 55 | if (MPI_rank == worldRank) { 56 | // Allocate the memory 57 | CU_ASSERT(cuMemCreate(&handle, roundedUpAllocationSize, &prop, 0 /*flags*/)); 58 | 59 | // Export the allocation to the importing process 60 | CU_ASSERT(cuMemExportToShareableHandle(&fh, handle, handleType, 0 /*flags*/)); 61 | } 62 | 63 | MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, MPI_rank, MPI_COMM_WORLD); 64 | 65 | if (MPI_rank != worldRank) { 66 | CU_ASSERT(cuMemImportFromShareableHandle(&handle, (void *)&fh, handleType)); 67 | } 68 | 69 | // Map the memory 70 | CU_ASSERT(cuMemAddressReserve((CUdeviceptr *) &buffer, roundedUpAllocationSize, 0, 0 /*baseVA*/, 0 /*flags*/)); 71 | 72 | CU_ASSERT(cuMemMap((CUdeviceptr) buffer, roundedUpAllocationSize, 0 /*offset*/, handle, 0 /*flags*/)); 73 | desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 74 | desc.location.id = localDevice; 75 | desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; 76 | CU_ASSERT(cuMemSetAccess((CUdeviceptr) buffer, roundedUpAllocationSize, &desc, 1 /*count*/)); 77 | 78 | // Make sure that everyone is done with mapping the fabric allocation 79 | MPI_Barrier(MPI_COMM_WORLD); 80 | } 81 | 82 | MultinodeMemoryAllocationUnicast::~MultinodeMemoryAllocationUnicast() { 83 | // Make sure that everyone is done using the memory 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | 86 | CU_ASSERT(cuMemUnmap((CUdeviceptr) buffer, roundedUpAllocationSize)); 87 | CU_ASSERT(cuMemRelease(handle)); 88 | CU_ASSERT(cuMemAddressFree((CUdeviceptr) buffer, roundedUpAllocationSize)); 89 | } 90 | 91 | 
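// --------------------------------------------------------------------------- // Editor's sketch (illustrative only, not part of the original sources): // condensed per-rank control flow of the unicast fabric handshake above, // assuming the same worldRank/owner convention and the driver APIs used there: // // if (worldRank == owner) { // owner materializes the memory // CU_ASSERT(cuMemCreate(&handle, size, &prop, 0)); // CU_ASSERT(cuMemExportToShareableHandle(&fh, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); // } // MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, owner, MPI_COMM_WORLD); // if (worldRank != owner) { // other ranks import it // CU_ASSERT(cuMemImportFromShareableHandle(&handle, (void *)&fh, CU_MEM_HANDLE_TYPE_FABRIC)); // } // cuMemAddressReserve(...); cuMemMap(...); cuMemSetAccess(...); // every rank maps // MPI_Barrier(MPI_COMM_WORLD); // all ranks mapped before first use // ---------------------------------------------------------------------------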
MultinodeMemoryAllocationMulticast::MultinodeMemoryAllocationMulticast(size_t bufferSize, int MPI_rank): MultinodeMemoryAllocation(bufferSize, MPI_rank) { 92 | handleType = CU_MEM_HANDLE_TYPE_FABRIC; 93 | multicastProp.numDevices = worldSize; 94 | multicastProp.handleTypes = handleType; 95 | size_t gran; 96 | CU_ASSERT(cuMulticastGetGranularity(&gran, &multicastProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); 97 | roundedUpAllocationSize = ROUND_UP(bufferSize, gran); 98 | multicastProp.size = roundedUpAllocationSize; 99 | 100 | if (MPI_rank == worldRank) { 101 | // Allocate the memory 102 | CU_ASSERT(cuMulticastCreate(&multicastHandle, &multicastProp)); 103 | 104 | // Export the allocation to the importing process 105 | CU_ASSERT(cuMemExportToShareableHandle(&fh, multicastHandle, handleType, 0 /*flags*/)); 106 | } 107 | 108 | MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, MPI_rank, MPI_COMM_WORLD); 109 | 110 | if (MPI_rank != worldRank) { 111 | CU_ASSERT(cuMemImportFromShareableHandle(&multicastHandle, (void *)&fh, handleType)); 112 | } 113 | 114 | CUdevice dev; 115 | CU_ASSERT(cuDeviceGet(&dev, localDevice)); 116 | CU_ASSERT(cuMulticastAddDevice(multicastHandle, dev)); 117 | 118 | // Ensure all devices in this process are added BEFORE binding mem on any device 119 | MPI_Barrier(MPI_COMM_WORLD); 120 | 121 | // Allocate the memory (same as unicast) and bind to MC handle 122 | CUmemAllocationProp prop = {}; 123 | prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 124 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 125 | prop.location.id = localDevice; 126 | prop.requestedHandleTypes = handleType; 127 | CU_ASSERT(cuMemCreate(&handle, roundedUpAllocationSize, &prop, 0 /*flags*/)); 128 | CU_ASSERT(cuMulticastBindMem(multicastHandle, 0, handle, 0, roundedUpAllocationSize, 0)); 129 | 130 | // Map the memory 131 | CU_ASSERT(cuMemAddressReserve((CUdeviceptr *) &buffer, roundedUpAllocationSize, 0, 0 /*baseVA*/, 0 /*flags*/)); 132 | 133 | CU_ASSERT(cuMemMap((CUdeviceptr) buffer, roundedUpAllocationSize, 0 /*offset*/, multicastHandle, 0 /*flags*/)); 134 | desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 135 | desc.location.id = localDevice; 136 | desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; 137 | CU_ASSERT(cuMemSetAccess((CUdeviceptr) buffer, roundedUpAllocationSize, &desc, 1 /*count*/)); 138 | 139 | // Make sure that everyone is done with mapping the fabric allocation 140 | MPI_Barrier(MPI_COMM_WORLD); 141 | } 142 | 143 | MultinodeMemoryAllocationMulticast::~MultinodeMemoryAllocationMulticast() { 144 | // Make sure that everyone is done using the memory 145 | MPI_Barrier(MPI_COMM_WORLD); 146 | 147 | CUdevice dev; 148 | CU_ASSERT(cuDeviceGet(&dev, localDevice)); 149 | CU_ASSERT(cuMulticastUnbind(multicastHandle, dev, 0, roundedUpAllocationSize)); 150 | CU_ASSERT(cuMemRelease(handle)); 151 | 152 | CU_ASSERT(cuMemUnmap((CUdeviceptr) buffer, roundedUpAllocationSize)); 153 | CU_ASSERT(cuMemRelease(multicastHandle)); 154 | CU_ASSERT(cuMemAddressFree((CUdeviceptr) buffer, roundedUpAllocationSize)); 155 | } 156 | 157 | MultinodeDeviceBuffer::MultinodeDeviceBuffer(size_t bufferSize, int MPI_rank): 158 | MPI_rank(MPI_rank), 159 | MemcpyBuffer(bufferSize) { 160 | } 161 | 162 | int MultinodeDeviceBuffer::getBufferIdx() const { 163 | // only single-GPU supported for now 164 | return 0; 165 | } 166 | 167 | std::string MultinodeDeviceBuffer::getBufferString() const { 168 | return "Multinode node " + std::to_string(MPI_rank); 169 | } 170 | 171 | CUcontext MultinodeDeviceBuffer::getPrimaryCtx() const { 172 | CUcontext 
primaryCtx; 173 | CU_ASSERT(cuDevicePrimaryCtxRetain(&primaryCtx, localDevice)); 174 | return primaryCtx; 175 | } 176 | 177 | int MultinodeDeviceBuffer::getMPIRank() const { 178 | return MPI_rank; 179 | } 180 | 181 | MultinodeDeviceBufferUnicast::MultinodeDeviceBufferUnicast(size_t bufferSize, int MPI_rank): 182 | MultinodeDeviceBuffer(bufferSize, MPI_rank), 183 | MemoryAllocation(bufferSize, MPI_rank) { 184 | buffer = MemoryAllocation.getBuffer(); 185 | } 186 | 187 | MultinodeDeviceBufferMulticast::MultinodeDeviceBufferMulticast(size_t bufferSize, int MPI_rank): 188 | MultinodeDeviceBuffer(bufferSize, MPI_rank), 189 | MemoryAllocation(bufferSize, MPI_rank) { 190 | buffer = MemoryAllocation.getBuffer(); 191 | } 192 | 193 | MultinodeDeviceBufferLocal::MultinodeDeviceBufferLocal(size_t bufferSize, int MPI_rank): 194 | MultinodeDeviceBuffer(bufferSize, MPI_rank) { 195 | buffer = nullptr; 196 | if (worldRank == MPI_rank) { 197 | CU_ASSERT(cuDevicePrimaryCtxRetain(&primaryCtx, localDevice)); 198 | CU_ASSERT(cuCtxSetCurrent(primaryCtx)); 199 | if (bufferSize) { 200 | CU_ASSERT(cuMemAlloc((CUdeviceptr*)&buffer, bufferSize)); 201 | } 202 | } 203 | } 204 | 205 | MultinodeDeviceBufferLocal::~MultinodeDeviceBufferLocal() { 206 | if (buffer) { 207 | CU_ASSERT(cuCtxSetCurrent(primaryCtx)); 208 | CU_ASSERT(cuMemFree((CUdeviceptr)buffer)); 209 | CU_ASSERT(cuDevicePrimaryCtxRelease(localDevice)); 210 | } 211 | } 212 | 213 | NodeHelperMulti::NodeHelperMulti() : blockingVarDeviceAllocation(sizeof(*blockingVarDevice), 0) { 214 | CU_ASSERT(cuMemHostAlloc((void **)&blockingVarHost, sizeof(*blockingVarHost), CU_MEMHOSTALLOC_PORTABLE)); 215 | blockingVarDevice = (volatile int*) blockingVarDeviceAllocation.getBuffer(); 216 | } 217 | 218 | NodeHelperMulti::~NodeHelperMulti() { 219 | CU_ASSERT(cuMemFreeHost((void*)blockingVarHost)); 220 | } 221 | 222 | MemcpyDispatchInfo NodeHelperMulti::dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcNodesUnfiltered, const std::vector<const MemcpyBuffer*> &dstNodesUnfiltered, ContextPreference ctxPreference) { 223 | std::vector<int> ranksUnfiltered(srcNodesUnfiltered.size(), -1); 224 | std::vector<CUcontext> contextsUnfiltered(srcNodesUnfiltered.size()); 225 | std::vector<const MemcpyBuffer*> srcNodes; 226 | std::vector<const MemcpyBuffer*> dstNodes; 227 | std::vector<CUcontext> contexts; 228 | 229 | for (int i = 0; i < srcNodesUnfiltered.size(); i++) { 230 | // prefer source context 231 | // determine which rank executes a given operation 232 | if (ctxPreference == PREFER_SRC_CONTEXT && srcNodesUnfiltered[i]->getPrimaryCtx() != nullptr) { 233 | contextsUnfiltered[i] = srcNodesUnfiltered[i]->getPrimaryCtx(); 234 | ranksUnfiltered[i] = srcNodesUnfiltered[i]->getMPIRank(); 235 | } else if (dstNodesUnfiltered[i]->getPrimaryCtx() != nullptr) { 236 | contextsUnfiltered[i] = dstNodesUnfiltered[i]->getPrimaryCtx(); 237 | ranksUnfiltered[i] = dstNodesUnfiltered[i]->getMPIRank(); 238 | } 239 | } 240 | 241 | for (int i = 0; i < srcNodesUnfiltered.size(); i++) { 242 | if (ranksUnfiltered[i] == worldRank) { 243 | srcNodes.push_back(srcNodesUnfiltered[i]); 244 | dstNodes.push_back(dstNodesUnfiltered[i]); 245 | contexts.push_back(contextsUnfiltered[i]); 246 | } 247 | } 248 | 249 | // Don't crash if there are no memcopies to do 250 | if (ranksUnfiltered.size() > 0) { 251 | rankOfFirstMemcpy = ranksUnfiltered[0]; 252 | } 253 | 254 | return MemcpyDispatchInfo(srcNodes, dstNodes, contexts, ranksUnfiltered); 255 | } 256 | 257 | double NodeHelperMulti::calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount) { 258 | double totalMax = 0; 259 | MPI_Allreduce(&totalTime, &totalMax,
1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 260 | totalTime = totalMax; 261 | 262 | double totalSum = 0; 263 | MPI_Allreduce(&totalSize, &totalSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 264 | totalSize = totalSum; 265 | 266 | return (totalSize * loopCount * 1000ull * 1000ull) / totalTime; 267 | } 268 | 269 | double NodeHelperMulti::calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) { 270 | double sum = 0.0; 271 | for (auto stat : bandwidthStats) { 272 | sum += stat.returnAppropriateMetric() * 1e-9; 273 | } 274 | // Calculate total BW sum across all nodes and memcopies 275 | double totalSum = 0; 276 | MPI_Allreduce(&sum, &totalSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 277 | return totalSum; 278 | } 279 | 280 | double NodeHelperMulti::calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) { 281 | // Broadcast bandwidth of "first" memcopy to other nodes 282 | double retval = 0; 283 | if (worldRank == rankOfFirstMemcpy) { 284 | retval = bandwidthStats[0].returnAppropriateMetric() * 1e-9; 285 | } 286 | MPI_Bcast(&retval, 1, MPI_DOUBLE, rankOfFirstMemcpy, MPI_COMM_WORLD); 287 | return retval; 288 | } 289 | 290 | std::vector<double> NodeHelperMulti::calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks) { 291 | std::vector<double> retval; 292 | int current_local_elem = 0; 293 | for (int i = 0; i < originalRanks.size(); i++) { 294 | double tmp = 0; 295 | if (worldRank == originalRanks[i]) { 296 | tmp = results[current_local_elem]; 297 | current_local_elem++; 298 | } 299 | MPI_Bcast(&tmp, 1, MPI_DOUBLE, originalRanks[i], MPI_COMM_WORLD); 300 | retval.push_back(tmp); 301 | } 302 | return retval; 303 | } 304 | 305 | void NodeHelperMulti::synchronizeProcess() { 306 | MPI_Barrier(MPI_COMM_WORLD); 307 | } 308 | 309 | CUresult NodeHelperMulti::streamSynchronizeWrapper(CUstream stream) const { 310 | return MPIstreamSyncHelper(stream); 311 | } 312 | 313 | void NodeHelperMulti::streamBlockerReset() { 314 | *blockingVarHost = 0; 315 | CU_ASSERT(cuMemsetD32((CUdeviceptr) blockingVarDevice, 0, 1)); 316 | } 317 | 318 | void NodeHelperMulti::streamBlockerRelease() { 319 | *blockingVarHost = 1; 320 | } 321 | 322 | void NodeHelperMulti::streamBlockerBlock(CUstream stream) { 323 | // The rank owning the first memcpy is released via blockingVarHost; it then writes to blockingVarDevice, releasing the remaining ranks 324 | CU_ASSERT(spinKernelMultistage((worldRank == rankOfFirstMemcpy) ? blockingVarHost : nullptr, blockingVarDevice, stream)); 325 | } 326 | 327 | #endif // MULTINODE 328 | -------------------------------------------------------------------------------- /multinode_memcpy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #ifndef MULTINODE_MEMCPY_H_ 19 | #define MULTINODE_MEMCPY_H_ 20 | #ifdef MULTINODE 21 | 22 | #include 23 | #include 24 | 25 | #include "common.h" 26 | #include "memcpy.h" 27 | 28 | class MultinodeMemoryAllocation { 29 | protected: 30 | void* buffer = nullptr; 31 | size_t bufferSize; 32 | int MPI_rank; 33 | 34 | public: 35 | MultinodeMemoryAllocation(size_t bufferSize, int MPI_rank); 36 | void *getBuffer() { return (void *) buffer; } 37 | CUresult streamSynchronizeWrapper(CUstream stream) const; 38 | }; 39 | 40 | // Class responsible for allocating memory that is shareable on a NVLink system, following RAII principles 41 | // Constructor takes as parameters: 42 | // - bufferSize: size of requested allocation 43 | // - MPI_rank: node on which the allocation physically resides. 44 | // All other nodes will have this allocation mapped and accessible remotely. 45 | class MultinodeMemoryAllocationUnicast : public MultinodeMemoryAllocation { 46 | private: 47 | CUmemGenericAllocationHandle handle = {}; 48 | CUmemFabricHandle fh = {}; 49 | CUmemAllocationHandleType handleType = {}; 50 | CUmemAllocationProp prop = {}; 51 | CUmemAccessDesc desc = {}; 52 | size_t roundedUpAllocationSize; 53 | 54 | public: 55 | MultinodeMemoryAllocationUnicast(size_t bufferSize, int MPI_rank); 56 | ~MultinodeMemoryAllocationUnicast(); 57 | }; 58 | 59 | // Class responsible for allocating multicast object, following RAII principles 60 | // Constructor takes as parameters: 61 | // - bufferSize: size of requested allocation 62 | // - MPI_rank: node driving the allocation process and exporting memory handle. 63 | // All nodes will have this allocation mapped and accessible. 64 | class MultinodeMemoryAllocationMulticast : public MultinodeMemoryAllocation { 65 | private: 66 | CUmemGenericAllocationHandle handle = {}; 67 | CUmemGenericAllocationHandle multicastHandle = {}; 68 | CUmemFabricHandle fh = {}; 69 | CUmemAllocationHandleType handleType = {}; 70 | CUmulticastObjectProp multicastProp = {}; 71 | CUmemAccessDesc desc = {}; 72 | size_t roundedUpAllocationSize; 73 | public: 74 | MultinodeMemoryAllocationMulticast(size_t bufferSize, int MPI_rank); 75 | ~MultinodeMemoryAllocationMulticast(); 76 | }; 77 | 78 | // Class responsible for implementing Multinode MemcpyBuffer 79 | // Each instance has information about which node owns the memory 80 | class MultinodeDeviceBuffer : public MemcpyBuffer { 81 | private: 82 | int MPI_rank; 83 | public: 84 | MultinodeDeviceBuffer(size_t bufferSize, int MPI_rank); 85 | 86 | virtual CUcontext getPrimaryCtx() const override; 87 | virtual int getBufferIdx() const override; 88 | virtual std::string getBufferString() const override; 89 | virtual int getMPIRank() const override; 90 | }; 91 | 92 | // MemcpyBuffer containing memory accessible from a different node in a multi-node NVLink connected system 93 | // MPI_rank node owns the memory allocation, other nodes have it mapped 94 | // Writes/reads to that memory from other nodes happen over NVLink 95 | class MultinodeDeviceBufferUnicast : public MultinodeDeviceBuffer { 96 | private: 97 | MultinodeMemoryAllocationUnicast MemoryAllocation; 98 | public: 99 | MultinodeDeviceBufferUnicast(size_t bufferSize, int MPI_rank); 100 | }; 101 | 102 | // MemcpyBuffer containing memory bound to multicast object 103 | // Each node has its own copy of the memory, and the copies are the same 104 | // Writes to this memory are instantly propagated to other nodes (conforming to P2P writes memory model) 105 | class MultinodeDeviceBufferMulticast : 
public MultinodeDeviceBuffer { 106 | private: 107 | MultinodeMemoryAllocationMulticast MemoryAllocation; 108 | public: 109 | MultinodeDeviceBufferMulticast(size_t bufferSize, int MPI_rank); 110 | }; 111 | 112 | // MemcpyBuffer containing regular device memory 113 | // Only available on one node, exists primarily to simplify writing testcases 114 | class MultinodeDeviceBufferLocal : public MultinodeDeviceBuffer { 115 | private: 116 | CUcontext primaryCtx {}; 117 | public: 118 | MultinodeDeviceBufferLocal(size_t bufferSize, int MPI_rank); 119 | ~MultinodeDeviceBufferLocal(); 120 | }; 121 | 122 | class NodeHelperMulti : public NodeHelper { 123 | private: 124 | int rankOfFirstMemcpy; 125 | 126 | // streamBlocker 127 | volatile int* blockingVarHost; 128 | volatile int* blockingVarDevice; 129 | MultinodeMemoryAllocationUnicast blockingVarDeviceAllocation; 130 | public: 131 | NodeHelperMulti(); 132 | ~NodeHelperMulti(); 133 | MemcpyDispatchInfo dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers, ContextPreference ctxPreference); 134 | double calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount); 135 | double calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 136 | double calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 137 | std::vector<double> calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks); 138 | void synchronizeProcess(); 139 | CUresult streamSynchronizeWrapper(CUstream stream) const; 140 | 141 | // stream blocking functions 142 | void streamBlockerReset(); 143 | void streamBlockerRelease(); 144 | void streamBlockerBlock(CUstream stream); 145 | }; 146 | 147 | #endif // MULTINODE 148 | #endif // MULTINODE_MEMCPY_H_ 149 | -------------------------------------------------------------------------------- /multinode_testcases.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #include 19 | 20 | #include "testcase.h" 21 | #include "memcpy.h" 22 | #include "common.h" 23 | #include "output.h" 24 | #ifdef MULTINODE 25 | #include 26 | #include "multinode_memcpy.h" 27 | 28 | // DtoD Read test - copy from dst to src (backwards) using src context 29 | void MultinodeDeviceToDeviceReadCE::run(unsigned long long size, unsigned long long loopCount) { 30 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 31 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_DST_CONTEXT); 32 | 33 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 34 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 35 | if (peerDeviceId == srcDeviceId) { 36 | continue; 37 | } 38 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 39 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 40 | 41 | // swap src and peer nodes, but use srcNode's (the copy's destination) context 42 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerNode, srcNode); 43 | } 44 | } 45 | 46 | output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)"); 47 | } 48 | 49 | 50 | // DtoD Write test - copy from src to dst using src context 51 | void MultinodeDeviceToDeviceWriteCE::run(unsigned long long size, unsigned long long loopCount) { 52 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 53 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti()); 54 | 55 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 56 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 57 | if (peerDeviceId == srcDeviceId) { 58 | continue; 59 | } 60 | 61 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 62 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 63 | 64 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcNode, peerNode); 65 | } 66 | } 67 | 68 | output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)"); 69 | } 70 | 71 | // DtoD Bidir Read test - copy from dst to src (backwards) using src context 72 | void MultinodeDeviceToDeviceBidirReadCE::run(unsigned long long size, unsigned long long loopCount) { 73 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW); 74 | PeerValueMatrix<double> bandwidthValuesRead1(worldSize, worldSize, key + "_read1"); 75 | PeerValueMatrix<double> bandwidthValuesRead2(worldSize, worldSize, key + "_read2"); 76 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 77 | 78 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 79 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 80 | if (peerDeviceId == srcDeviceId) { 81 | continue; 82 | } 83 | 84 | // Allocate a separate buffer pair for each direction so the two copies run concurrently 85 | MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 86 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 87 | 88 | // swap src and peer nodes, so each copy is a read performed by its destination device 89 | std::vector<const MemcpyBuffer*> srcNodes = {&peer1, &src2}; 90 | std::vector<const MemcpyBuffer*> peerNodes = {&src1, &peer2}; 91 | 92 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 93 | bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0]; 94 |
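// doMemcpyVector returns one bandwidth per submitted copy, in the order the
// buffers were passed: results[0] is the peer1 -> src1 read and results[1] is
// the concurrent src2 -> peer2 read.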
bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1]; 95 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 96 | } 97 | } 98 | 99 | output->addTestcaseResults(bandwidthValuesRead1, "memcpy CE GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)"); 100 | output->addTestcaseResults(bandwidthValuesRead2, "memcpy CE GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)"); 101 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 102 | } 103 | 104 | // DtoD Bidir Write test - copy from src to dst using src context 105 | void MultinodeDeviceToDeviceBidirWriteCE::run(unsigned long long size, unsigned long long loopCount) { 106 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); 107 | PeerValueMatrix<double> bandwidthValuesWrite1(worldSize, worldSize, key + "_write1"); 108 | PeerValueMatrix<double> bandwidthValuesWrite2(worldSize, worldSize, key + "_write2"); 109 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 110 | 111 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 112 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 113 | if (peerDeviceId == srcDeviceId) { 114 | continue; 115 | } 116 | 117 | // Allocate a separate buffer pair for each direction so the two copies run concurrently 118 | MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 119 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 120 | 121 | std::vector<const MemcpyBuffer*> srcNodes = {&src1, &peer2}; 122 | std::vector<const MemcpyBuffer*> peerNodes = {&peer1, &src2}; 123 | 124 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 125 | bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0]; 126 | bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1]; 127 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 128 | } 129 | } 130 | 131 | output->addTestcaseResults(bandwidthValuesWrite1, "memcpy CE GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)"); 132 | output->addTestcaseResults(bandwidthValuesWrite2, "memcpy CE GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)"); 133 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 134 | } 135 | 136 | 137 | // DtoD Read test - copy from dst to src (backwards) using src context 138 | void MultinodeDeviceToDeviceReadSM::run(unsigned long long size, unsigned long long loopCount) { 139 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 140 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_DST_CONTEXT); 141 | 142 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 143 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 144 | if (peerDeviceId == srcDeviceId) { 145 | continue; 146 | } 147 | 148 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 149 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 150 | 151 | // swap src and peer nodes, but use srcNode's (the copy's destination) context 152 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerNode, srcNode); 153 | } 154 | } 155 | 156 | output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)"); 157 | } 158 | 159 | // DtoD Write test - copy from src to dst using src context 160 | void
MultinodeDeviceToDeviceWriteSM::run(unsigned long long size, unsigned long long loopCount) { 161 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 162 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti()); 163 | 164 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 165 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 166 | if (peerDeviceId == srcDeviceId) { 167 | continue; 168 | } 169 | 170 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 171 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 172 | 173 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcNode, peerNode); 174 | } 175 | } 176 | 177 | output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)"); 178 | } 179 | 180 | // DtoD Bidir Read test - copy from dst to src (backwards) using src context 181 | void MultinodeDeviceToDeviceBidirReadSM::run(unsigned long long size, unsigned long long loopCount) { 182 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW); 183 | PeerValueMatrix<double> bandwidthValuesRead1(worldSize, worldSize, key + "_read1"); 184 | PeerValueMatrix<double> bandwidthValuesRead2(worldSize, worldSize, key + "_read2"); 185 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 186 | 187 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 188 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 189 | if (peerDeviceId == srcDeviceId) { 190 | continue; 191 | } 192 | 193 | MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 194 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 195 | 196 | // swap src and peer nodes, so each copy is a read performed by its destination device 197 | std::vector<const MemcpyBuffer*> srcNodes = {&peer1, &src2}; 198 | std::vector<const MemcpyBuffer*> peerNodes = {&src1, &peer2}; 199 | 200 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 201 | bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0]; 202 | bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1]; 203 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 204 | } 205 | } 206 | 207 | output->addTestcaseResults(bandwidthValuesRead1, "memcpy SM GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)"); 208 | output->addTestcaseResults(bandwidthValuesRead2, "memcpy SM GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)"); 209 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 210 | } 211 | 212 | // DtoD Bidir Write test - copy from src to dst using src context 213 | void MultinodeDeviceToDeviceBidirWriteSM::run(unsigned long long size, unsigned long long loopCount) { 214 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); 215 | PeerValueMatrix<double> bandwidthValuesWrite1(worldSize, worldSize, key + "_write1"); 216 | PeerValueMatrix<double> bandwidthValuesWrite2(worldSize, worldSize, key + "_write2"); 217 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 218 | 219 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 220 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 221 | if (peerDeviceId == srcDeviceId) { 222 | continue; 223 | } 224 | 225 |
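// Two buffer pairs per GPU pair: copy 1 writes src1 -> peer1 while copy 2
// writes peer2 -> src2, so write traffic flows in both directions at once.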
MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 226 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 227 | 228 | std::vector<const MemcpyBuffer*> srcNodes = {&src1, &peer2}; 229 | std::vector<const MemcpyBuffer*> peerNodes = {&peer1, &src2}; 230 | 231 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 232 | bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0]; 233 | bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1]; 234 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 235 | } 236 | } 237 | 238 | output->addTestcaseResults(bandwidthValuesWrite1, "memcpy SM GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)"); 239 | output->addTestcaseResults(bandwidthValuesWrite2, "memcpy SM GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)"); 240 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 241 | } 242 | 243 | void MultinodeAllToOneWriteSM::run(unsigned long long size, unsigned long long loopCount) { 244 | PeerValueMatrix<double> bandwidthValues(1, worldSize, key); 245 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::SUM_BW); 246 | 247 | for (int dstDeviceId = 0; dstDeviceId < worldSize; dstDeviceId++) { 248 | std::vector<const MemcpyBuffer*> srcNodes; 249 | std::vector<const MemcpyBuffer*> dstNodes; 250 | 251 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 252 | if (dstDeviceId == srcDeviceId) { 253 | continue; 254 | } 255 | 256 | srcNodes.push_back(new MultinodeDeviceBufferLocal(size, srcDeviceId)); 257 | dstNodes.push_back(new MultinodeDeviceBufferUnicast(size, dstDeviceId)); 258 | } 259 | 260 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 261 | 262 | for (auto node : dstNodes) { 263 | delete node; 264 | } 265 | for (auto node : srcNodes) { 266 | delete node; 267 | } 268 | } 269 | 270 | output->addTestcaseResults(bandwidthValues, "memcpy SM All GPUs -> GPU(column) total bandwidth (GB/s)"); 271 | } 272 | 273 | void MultinodeAllFromOneReadSM::run(unsigned long long size, unsigned long long loopCount) { 274 | PeerValueMatrix<double> bandwidthValues(1, worldSize, key); 275 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::SUM_BW); 276 | 277 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 278 | std::vector<const MemcpyBuffer*> srcNodes; 279 | std::vector<const MemcpyBuffer*> dstNodes; 280 | 281 | for (int dstDeviceId = 0; dstDeviceId < worldSize; dstDeviceId++) { 282 | if (dstDeviceId == srcDeviceId) { 283 | continue; 284 | } 285 | 286 | srcNodes.push_back(new MultinodeDeviceBufferUnicast(size, srcDeviceId)); 287 | dstNodes.push_back(new MultinodeDeviceBufferLocal(size, dstDeviceId)); 288 | } 289 | 290 | bandwidthValues.value(0, srcDeviceId) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 291 | 292 | for (auto node : dstNodes) { 293 | delete node; 294 | } 295 | for (auto node : srcNodes) { 296 | delete node; 297 | } 298 | } 299 | 300 | output->addTestcaseResults(bandwidthValues, "memcpy SM All GPUs <- GPU(column) total bandwidth (GB/s)"); 301 | } 302 | 303 | void MultinodeBroadcastOneToAllSM::run(unsigned long long size, unsigned long long loopCount) { 304 | PeerValueMatrix<double> bandwidthValues(1, worldSize, key); 305 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorMulticastWrite(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::SUM_BW); 306 | 307 | for (int dstDeviceId = 0;
dstDeviceId < worldSize; dstDeviceId++) { 308 | std::vector<const MemcpyBuffer*> srcNodes; 309 | std::vector<const MemcpyBuffer*> dstNodes; 310 | 311 | srcNodes.push_back(new MultinodeDeviceBufferLocal(size, dstDeviceId)); 312 | dstNodes.push_back(new MultinodeDeviceBufferMulticast(size, dstDeviceId)); 313 | 314 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 315 | 316 | for (auto node : dstNodes) { 317 | delete node; 318 | } 319 | for (auto node : srcNodes) { 320 | delete node; 321 | } 322 | } 323 | 324 | output->addTestcaseResults(bandwidthValues, "multicast SM GPU(column) -> All GPUs total bandwidth (GB/s)"); 325 | } 326 | 327 | void MultinodeBroadcastAllToAllSM::run(unsigned long long size, unsigned long long loopCount) { 328 | PeerValueMatrix<double> bandwidthValues(1, 1, key); 329 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorMulticastWrite(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::SUM_BW); 330 | std::vector<const MemcpyBuffer*> srcNodes; 331 | std::vector<const MemcpyBuffer*> dstNodes; 332 | 333 | for (int dstDeviceId = 0; dstDeviceId < worldSize; dstDeviceId++) { 334 | srcNodes.push_back(new MultinodeDeviceBufferLocal(size, dstDeviceId)); 335 | dstNodes.push_back(new MultinodeDeviceBufferMulticast(size, dstDeviceId)); 336 | } 337 | 338 | bandwidthValues.value(0, 0) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 339 | 340 | for (auto node : dstNodes) { 341 | delete node; 342 | } 343 | for (auto node : srcNodes) { 344 | delete node; 345 | } 346 | 347 | output->addTestcaseResults(bandwidthValues, "multicast SM All -> All GPUs total bandwidth (GB/s)"); 348 | } 349 | 350 | void MultinodeBisectWriteCE::run(unsigned long long size, unsigned long long loopCount) { 351 | PeerValueMatrix<double> bandwidthValues(worldSize, 1, key); 352 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); 353 | std::vector<std::string> rowLabels; 354 | std::vector<const MemcpyBuffer*> srcNodes, dstNodes; 355 | 356 | for (int i = 0; i < worldSize; i++) { 357 | int peer = (i + worldSize / 2) % worldSize; 358 | srcNodes.push_back(new MultinodeDeviceBufferUnicast(size, i)); 359 | dstNodes.push_back(new MultinodeDeviceBufferUnicast(size, peer)); 360 | 361 | std::stringstream s; 362 | s << getPaddedProcessId(i) << "->" << getPaddedProcessId(peer); 363 | rowLabels.push_back(s.str()); 364 | } 365 | 366 | auto results = memcpyInstance.doMemcpyVector(dstNodes, srcNodes); 367 | 368 | for (int i = 0; i < results.size(); i++) { 369 | bandwidthValues.value(i, 0) = results[i]; 370 | } 371 | bandwidthValues.setRowLabels(rowLabels); 372 | 373 | for (auto node : dstNodes) { 374 | delete node; 375 | } 376 | for (auto node : srcNodes) { 377 | delete node; 378 | } 379 | 380 | output->addTestcaseResults(bandwidthValues, "Bisect benchmarking, simultaneous write CE BW"); 381 | } 382 | 383 | #endif 384 | -------------------------------------------------------------------------------- /nvbandwidth.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #ifdef MULTINODE 25 | #include <mpi.h> 26 | #endif 27 | 28 | #include "json_output.h" 29 | #include "kernels.cuh" 30 | #include "output.h" 31 | #include "testcase.h" 32 | #include "version.h" 33 | #include "inline_common.h" 34 | 35 | namespace opt = boost::program_options; 36 | 37 | int deviceCount; 38 | unsigned int averageLoopCount; 39 | unsigned long long bufferSize; 40 | unsigned long long loopCount; 41 | bool verbose; 42 | bool shouldOutput = true; 43 | bool disableAffinity; 44 | bool skipVerification; 45 | bool useMean; 46 | bool perfFormatter; 47 | 48 | Verbosity VERBOSE(verbose); 49 | Verbosity OUTPUT(shouldOutput); 50 | 51 | #ifdef MULTINODE 52 | // Process rank within one OS 53 | int localRank; 54 | // Device ordinal of the GPU owned by the process 55 | int localDevice; 56 | int worldRank; 57 | int worldSize; 58 | #endif 59 | char localHostname[STRING_LENGTH]; 60 | bool jsonOutput; 61 | Output *output; 62 | 63 | // Define testcases here 64 | std::vector<Testcase*> createTestcases() { 65 | return { 66 | new HostToDeviceCE(), 67 | new DeviceToHostCE(), 68 | new HostToDeviceBidirCE(), 69 | new DeviceToHostBidirCE(), 70 | new DeviceToDeviceReadCE(), 71 | new DeviceToDeviceWriteCE(), 72 | new DeviceToDeviceBidirReadCE(), 73 | new DeviceToDeviceBidirWriteCE(), 74 | new AllToHostCE(), 75 | new AllToHostBidirCE(), 76 | new HostToAllCE(), 77 | new HostToAllBidirCE(), 78 | new AllToOneWriteCE(), 79 | new AllToOneReadCE(), 80 | new OneToAllWriteCE(), 81 | new OneToAllReadCE(), 82 | new HostToDeviceSM(), 83 | new DeviceToHostSM(), 84 | new HostToDeviceBidirSM(), 85 | new DeviceToHostBidirSM(), 86 | new DeviceToDeviceReadSM(), 87 | new DeviceToDeviceWriteSM(), 88 | new DeviceToDeviceBidirReadSM(), 89 | new DeviceToDeviceBidirWriteSM(), 90 | new AllToHostSM(), 91 | new AllToHostBidirSM(), 92 | new HostToAllSM(), 93 | new HostToAllBidirSM(), 94 | new AllToOneWriteSM(), 95 | new AllToOneReadSM(), 96 | new OneToAllWriteSM(), 97 | new OneToAllReadSM(), 98 | new HostDeviceLatencySM(), 99 | new DeviceToDeviceLatencySM(), 100 | new DeviceLocalCopy(), 101 | #ifdef MULTINODE 102 | new MultinodeDeviceToDeviceReadCE(), 103 | new MultinodeDeviceToDeviceWriteCE(), 104 | new MultinodeDeviceToDeviceBidirReadCE(), 105 | new MultinodeDeviceToDeviceBidirWriteCE(), 106 | new MultinodeDeviceToDeviceReadSM(), 107 | new MultinodeDeviceToDeviceWriteSM(), 108 | new MultinodeDeviceToDeviceBidirReadSM(), 109 | new MultinodeDeviceToDeviceBidirWriteSM(), 110 | new MultinodeAllToOneWriteSM(), 111 | new MultinodeAllFromOneReadSM(), 112 | new MultinodeBroadcastOneToAllSM(), 113 | new MultinodeBroadcastAllToAllSM(), 114 | new MultinodeBisectWriteCE(), 115 | #endif 116 | }; 117 | } 118 | 119 | Testcase* findTestcase(std::vector<Testcase*> &testcases, std::string id) { 120 | // Check if testcase ID is index 121 | char* p; 122 | long index = strtol(id.c_str(), &p, 10); 123 | if (*p) { 124 | // Conversion failed so key is ID 125 | auto it = find_if(testcases.begin(), testcases.end(), [&id](Testcase* test) {return
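// compare against the registered test key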
test->testKey() == id;}); 126 | if (it != testcases.end()) { 127 | return testcases.at(std::distance(testcases.begin(), it)); 128 | } else { 129 | throw "Testcase " + id + " not found!"; 130 | } 131 | } else { 132 | // ID is index 133 | if (index < 0 || index >= static_cast<long>(testcases.size())) throw "Testcase index " + id + " out of bounds!"; 134 | return testcases.at(index); 135 | } 136 | } 137 | 138 | std::vector<std::string> expandTestcases(std::vector<Testcase*> &testcases, std::vector<std::string> prefixes) { 139 | std::vector<std::string> testcasesToRun; 140 | for (auto testcase : testcases) { 141 | auto it = find_if(prefixes.begin(), prefixes.end(), [&testcase](std::string prefix) {return testcase->testKey().compare(0, prefix.size(), prefix) == 0;}); 142 | if (it != prefixes.end()) { 143 | testcasesToRun.push_back(testcase->testKey()); 144 | } 145 | } 146 | return testcasesToRun; 147 | } 148 | 149 | void runTestcase(std::vector<Testcase*> &testcases, const std::string &testcaseID) { 150 | Testcase* test{nullptr}; 151 | try { 152 | test = findTestcase(testcases, testcaseID); 153 | } catch (std::string &s) { 154 | output->addTestcase(testcaseID, "ERROR", s); 155 | return; 156 | } 157 | 158 | try { 159 | if (!test->filter()) { 160 | output->addTestcase(test->testKey(), NVB_WAIVED); 161 | return; 162 | } 163 | 164 | output->addTestcase(test->testKey(), NVB_RUNNING); 165 | 166 | // Run the testcase 167 | if (test->testKey() == "host_device_latency_sm" || test->testKey() == "device_to_device_latency_sm") { 168 | // use a fixed-size buffer for latency tests 169 | test->run(2 * _MiB, loopCount); 170 | } else { 171 | test->run(bufferSize * _MiB, loopCount); 172 | } 173 | } catch (std::string &s) { 174 | output->setTestcaseStatusAndAddIfNeeded(test->testKey(), NVB_ERROR_STATUS, s); 175 | } 176 | } 177 | 178 | int main(int argc, char **argv) { 179 | std::vector<Testcase*> testcases = createTestcases(); 180 | std::vector<std::string> testcasesToRun; 181 | std::vector<std::string> testcasePrefixes; 182 | output = new Output(); 183 | 184 | #ifdef _WIN32 185 | // Read the hostname from the environment, falling back to "unknown" 186 | const char* computername = getenv("COMPUTERNAME"); 187 | if (computername && computername[0] != '\0') { 188 | snprintf(localHostname, STRING_LENGTH, "%s", computername); 189 | } else { 190 | snprintf(localHostname, STRING_LENGTH, "%s", "unknown"); 191 | } 192 | #else 193 | ASSERT(0 == gethostname(localHostname, STRING_LENGTH - 1)); 194 | #endif 195 | #ifdef MULTINODE 196 | // Set up MPI 197 | MPI_Init(NULL, NULL); 198 | MPI_Comm_size(MPI_COMM_WORLD, &worldSize); 199 | MPI_Comm_rank(MPI_COMM_WORLD, &worldRank); 200 | 201 | // Avoid excessive output by limiting it to rank 0 202 | shouldOutput = (worldRank == 0); 203 | #endif 204 | 205 | // Args parsing 206 | opt::options_description visible_opts("nvbandwidth CLI"); 207 | visible_opts.add_options() 208 | ("help,h", "Produce help message") 209 | ("bufferSize,b", opt::value<unsigned long long>(&bufferSize)->default_value(defaultBufferSize), "Memcpy buffer size in MiB") 210 | ("list,l", "List available testcases") 211 | ("testcase,t", opt::value<std::vector<std::string>>(&testcasesToRun)->multitoken(), "Testcase(s) to run (by name or index)") 212 | ("testcasePrefixes,p", opt::value<std::vector<std::string>>(&testcasePrefixes)->multitoken(), "Testcase(s) to run (by prefix)") 213 | ("verbose,v", opt::bool_switch(&verbose)->default_value(false), "Verbose output") 214 | ("skipVerification,s", opt::bool_switch(&skipVerification)->default_value(false), "Skips data verification after copy") 215 | ("disableAffinity,d", opt::bool_switch(&disableAffinity)->default_value(false), "Disable automatic CPU
affinity control") 216 | ("testSamples,i", opt::value(&averageLoopCount)->default_value(defaultAverageLoopCount), "Iterations of the benchmark") 217 | ("useMean,m", opt::bool_switch(&useMean)->default_value(false), "Use mean instead of median for results") 218 | ("json,j", opt::bool_switch(&jsonOutput)->default_value(false), "Print output in json format instead of plain text."); 219 | 220 | opt::options_description all_opts(""); 221 | all_opts.add(visible_opts); 222 | all_opts.add_options() 223 | ("loopCount", opt::value(&loopCount)->default_value(defaultLoopCount), "Iterations of memcpy to be performed within a test sample") 224 | ("perfFormatter", opt::bool_switch(&perfFormatter)->default_value(false), "Use perf formatter prefix (&&&& PERF) in output"); 225 | 226 | opt::variables_map vm; 227 | try { 228 | opt::store(opt::parse_command_line(argc, argv, all_opts), vm); 229 | opt::notify(vm); 230 | } catch (...) { 231 | output->addVersionInfo(); 232 | 233 | std::stringstream errmsg; 234 | errmsg << "ERROR: Invalid Arguments " << std::endl; 235 | for (int i = 0; i < argc; i++) { 236 | errmsg << argv[i] << " "; 237 | } 238 | std::vector messageParts; 239 | std::stringstream buf; 240 | buf << visible_opts; 241 | messageParts.emplace_back(errmsg.str()); 242 | messageParts.emplace_back(buf.str()); 243 | output->recordError(messageParts); 244 | return 1; 245 | } 246 | 247 | if (jsonOutput) { 248 | delete output; 249 | output = new JsonOutput(shouldOutput); 250 | } 251 | 252 | output->addVersionInfo(); 253 | 254 | if (vm.count("help")) { 255 | OUTPUT << visible_opts << "\n"; 256 | return 0; 257 | } 258 | 259 | if (vm.count("list")) { 260 | output->listTestcases(testcases); 261 | return 0; 262 | } 263 | 264 | if (testcasePrefixes.size() != 0 && testcasesToRun.size() != 0) { 265 | output->recordError("You cannot specify both testcase and testcasePrefix options at the same time"); 266 | return 1; 267 | } 268 | 269 | 270 | CU_ASSERT(cuInit(0)); 271 | NVML_ASSERT(nvmlInit()); 272 | CU_ASSERT(cuDeviceGetCount(&deviceCount)); 273 | if (bufferSize < defaultBufferSize) { 274 | output->recordWarning("NOTE: You have chosen a buffer size that is smaller than the default buffer size. It is suggested to use the default buffer size (64MB) to achieve maximal peak bandwidth."); 275 | } 276 | 277 | int cudaVersion; 278 | cudaRuntimeGetVersion(&cudaVersion); 279 | 280 | CU_ASSERT(cuDriverGetVersion(&cudaVersion)); 281 | 282 | char driverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; 283 | NVML_ASSERT(nvmlSystemGetDriverVersion(driverVersion, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)); 284 | 285 | output->addCudaAndDriverInfo(cudaVersion, driverVersion); 286 | 287 | output->recordDevices(deviceCount); 288 | 289 | if (testcasePrefixes.size() > 0) { 290 | testcasesToRun = expandTestcases(testcases, testcasePrefixes); 291 | if (testcasesToRun.size() == 0) { 292 | output->recordError("Specified list of testcase prefixes did not match any testcases"); 293 | return 1; 294 | } 295 | } 296 | 297 | // This triggers the loading of all kernels on all devices, even with lazy loading enabled. 298 | // Some tests can create complex dependencies between devices and function loading requires a 299 | // device synchronization, so loading in the middle of a test can deadlock. 
300 | preloadKernels(deviceCount); 301 | 302 | if (testcasesToRun.size() == 0) { 303 | // run all testcases 304 | for (auto testcase : testcases) { 305 | runTestcase(testcases, testcase->testKey()); 306 | } 307 | } else { 308 | for (const auto& testcaseIndex : testcasesToRun) { 309 | runTestcase(testcases, testcaseIndex); 310 | } 311 | } 312 | 313 | output->print(); 314 | 315 | for (auto testcase : testcases) { 316 | delete testcase; 317 | } 318 | 319 | #ifdef MULTINODE 320 | MPI_Finalize(); 321 | #endif 322 | 323 | output->printInfo(); 324 | return 0; 325 | } 326 | -------------------------------------------------------------------------------- /output.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include "inline_common.h" 19 | #include "output.h" 20 | #include "version.h" 21 | 22 | #include 23 | 24 | #ifdef MULTINODE 25 | #include 26 | #include 27 | #include 28 | #endif 29 | 30 | void Output::addVersionInfo() { 31 | OUTPUT << "nvbandwidth Version: " << NVBANDWIDTH_VERSION << std::endl; 32 | OUTPUT << "Built from Git version: " << GIT_VERSION << std::endl << std::endl; 33 | 34 | #ifdef MULTINODE 35 | char MPIVersion[MPI_MAX_LIBRARY_VERSION_STRING]; 36 | int MPIVersionLen; 37 | MPI_Get_library_version(MPIVersion, &MPIVersionLen); 38 | 39 | OUTPUT << "MPI version: " << MPIVersion << std::endl; 40 | #endif 41 | } 42 | 43 | void Output::printInfo() { 44 | OUTPUT << "NOTE: The reported results may not reflect the full capabilities of the platform." << std::endl 45 | << "Performance can vary with software drivers, hardware clocks, and system topology." 
<< std::endl << std::endl; 46 | } 47 | 48 | void Output::addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion) { 49 | OUTPUT << "CUDA Runtime Version: " << cudaVersion << std::endl; 50 | OUTPUT << "CUDA Driver Version: " << cudaVersion << std::endl; 51 | OUTPUT << "Driver Version: " << driverVersion << std::endl << std::endl; 52 | } 53 | 54 | void Output::recordError(const std::string &error) { 55 | std::cerr << error << std::endl; 56 | } 57 | 58 | void Output::recordError(const std::vector<std::string> &errorParts) { 59 | bool first = true; 60 | for (auto &part : errorParts) { 61 | if (first) { 62 | OUTPUT << part << ":\n\n"; 63 | first = false; 64 | } else { 65 | OUTPUT << part << std::endl; 66 | } 67 | } 68 | } 69 | 70 | void Output::listTestcases(const std::vector<Testcase*> &testcases) { 71 | size_t numTestcases = testcases.size(); 72 | OUTPUT << "Index, Name:\n\tDescription\n"; 73 | OUTPUT << "=======================\n"; 74 | for (unsigned int i = 0; i < numTestcases; i++) { 75 | OUTPUT << i << ", " << testcases.at(i)->testKey() << ":\n" << testcases.at(i)->testDesc() << "\n\n"; 76 | } 77 | } 78 | 79 | std::string getDeviceDisplayInfo(int deviceOrdinal) { 80 | std::stringstream sstream; 81 | CUdevice dev; 82 | char name[STRING_LENGTH]; 83 | int busId, deviceId, domainId; 84 | 85 | CU_ASSERT(cuDeviceGet(&dev, deviceOrdinal)); 86 | CU_ASSERT(cuDeviceGetName(name, STRING_LENGTH, dev)); 87 | CU_ASSERT(cuDeviceGetAttribute(&domainId, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev)); 88 | CU_ASSERT(cuDeviceGetAttribute(&busId, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev)); 89 | CU_ASSERT(cuDeviceGetAttribute(&deviceId, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev)); 90 | sstream << name << " (" << 91 | std::hex << std::setw(8) << std::setfill('0') << domainId << ":" << 92 | std::hex << std::setw(2) << std::setfill('0') << busId << ":" << 93 | std::hex << std::setw(2) << std::setfill('0') << deviceId << ")" << 94 | std::dec << std::setfill(' ') << std::setw(0); // reset formatting 95 | 96 | return sstream.str(); 97 | } 98 | 99 | #ifdef MULTINODE 100 | // Exchange and print information about all devices in the MPI world. 101 | // Through this exchange each process learns about the GPUs of the other processes 102 | // and determines its own GPU index. 103 | // Each process is allocated a dedicated GPU, so it is advisable to launch NUM_GPU processes per system, 104 | // with each process autonomously selecting a GPU to utilize. To make this selection, 105 | // processes exchange their hostnames and look for duplicates of their own hostname among processes with a lower worldRank. 106 | // localRank equals the number of processes with the same hostname but a lower worldRank.
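// Worked example (hypothetical hostnames): with ranks 0-3 on "nodeA" and ranks
// 4-7 on "nodeB", rank 6 finds two earlier ranks (4 and 5) sharing its hostname,
// so its localRank is 2 and it binds device (localRank % deviceCount) below.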
107 | static void printGPUsMultinode(int deviceCount) { 108 | // Exchange hostnames 109 | std::vector<char> hostnameExchange(worldSize * STRING_LENGTH); 110 | MPI_Allgather(localHostname, STRING_LENGTH, MPI_BYTE, &hostnameExchange[0], STRING_LENGTH, MPI_BYTE, MPI_COMM_WORLD); 111 | 112 | // Find local rank based on hostnames 113 | localRank = 0; 114 | for (int i = 0; i < worldRank; i++) { 115 | if (strncmp(localHostname, &hostnameExchange[i * STRING_LENGTH], STRING_LENGTH) == 0) { 116 | localRank++; 117 | } 118 | } 119 | 120 | std::vector<int> deviceCountExchange(worldSize); 121 | MPI_Allgather(&deviceCount, 1, MPI_INT, &deviceCountExchange[0], 1, MPI_INT, MPI_COMM_WORLD); 122 | 123 | localDevice = localRank % deviceCount; 124 | 125 | // It's not recommended to run more ranks per node than the GPU count, but we want to make sure we handle it gracefully 126 | std::map<std::string, int> gpuCounts; 127 | for (int i = 0; i < worldSize; i++) { 128 | std::string host(&hostnameExchange[i * STRING_LENGTH]); 129 | gpuCounts[host]++; 130 | if (gpuCounts[host] == deviceCountExchange[i] + 1) { 131 | // Emit the warning exactly once per oversubscribed node 132 | std::stringstream warning; 133 | warning << "Warning: there are more processes than GPUs on " << host << ". Please reduce the number of processes to match the GPU count."; 134 | output->recordWarning(warning.str()); 135 | } 136 | } 137 | 138 | // Exchange device names 139 | std::string localDeviceName = getDeviceDisplayInfo(localDevice); 140 | ASSERT(localDeviceName.size() < STRING_LENGTH); 141 | localDeviceName.resize(STRING_LENGTH); 142 | 143 | std::vector<char> deviceNameExchange(worldSize * STRING_LENGTH, 0); 144 | MPI_Allgather(&localDeviceName[0], STRING_LENGTH, MPI_BYTE, &deviceNameExchange[0], STRING_LENGTH, MPI_BYTE, MPI_COMM_WORLD); 145 | 146 | // Exchange device ids 147 | std::vector<int> localDeviceIdExchange(worldSize, -1); 148 | MPI_Allgather(&localDevice, 1, MPI_INT, &localDeviceIdExchange[0], 1, MPI_INT, MPI_COMM_WORLD); 149 | 150 | // Print gathered info 151 | for (int i = 0; i < worldSize; i++) { 152 | char *deviceName = &deviceNameExchange[i * STRING_LENGTH]; 153 | OUTPUT << "Process " << getPaddedProcessId(i) << " (" << &hostnameExchange[i * STRING_LENGTH] << "): device " << localDeviceIdExchange[i] << ": " << deviceName << std::endl; 154 | } 155 | OUTPUT << std::endl; 156 | } 157 | #endif 158 | 159 | static void printGPUs() { 160 | OUTPUT << localHostname << std::endl; 161 | for (int iDev = 0; iDev < deviceCount; iDev++) { 162 | OUTPUT << "Device " << iDev << ": " << getDeviceDisplayInfo(iDev) << std::endl; 163 | } 164 | OUTPUT << std::endl; 165 | } 166 | 167 | void Output::recordDevices(int deviceCount) { 168 | #ifdef MULTINODE 169 | printGPUsMultinode(deviceCount); 170 | #else 171 | printGPUs(); 172 | #endif 173 | } 174 | 175 | void Output::addTestcase(const std::string &name, const std::string &status, const std::string &msg) { 176 | if (status == NVB_RUNNING) { 177 | OUTPUT << status << " " << name << ".\n"; 178 | } else { 179 | OUTPUT << status << ": " << msg << std::endl; 180 | } 181 | } 182 | 183 | void Output::setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg) { 184 | // For plain text output, the name has always been printed already and therefore isn't needed here 185 | OUTPUT << status << ": " << msg << std::endl; 186 | } 187 | 188 | void Output::addTestcaseResults(const PeerValueMatrix<double> &bandwidthValues, const std::string &description) { 189 | OUTPUT << description << std::endl; 190 | OUTPUT << std::fixed <<
std::setprecision(2) << bandwidthValues << std::endl; 191 | } 192 | 193 | void Output::print() { 194 | // NO-OP 195 | } 196 | 197 | void Output::recordErrorCurrentTest(const std::string &errorLine1, const std::string &errorLine2) { 198 | OUTPUT << errorLine1 << std::endl << errorLine2 << std::endl; 199 | } 200 | 201 | void Output::recordWarning(const std::string &warning) { 202 | OUTPUT << warning << std::endl; 203 | } 204 | 205 | void RecordError(const std::stringstream &errmsg) { 206 | output->recordError(errmsg.str()); 207 | } 208 | -------------------------------------------------------------------------------- /output.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef OUTPUT_H_ 19 | #define OUTPUT_H_ 20 | 21 | #include 22 | #include 23 | 24 | #include "testcase.h" 25 | 26 | extern const std::string NVB_TITLE; 27 | extern const std::string NVB_CUDA_RUNTIME_VERSION; 28 | extern const std::string NVB_DRIVER_VERSION; 29 | extern const std::string NVB_GIT_VERSION; 30 | extern const std::string NVB_ERROR; 31 | extern const std::string NVB_WARNING; 32 | extern const std::string NVB_TESTCASES; 33 | extern const std::string NVB_TESTCASE_NAME; 34 | extern const std::string NVB_STATUS; 35 | extern const std::string NVB_BW_DESCRIPTION; 36 | extern const std::string NVB_BW_MATRIX; 37 | extern const std::string NVB_BW_SUM; 38 | extern const std::string NVB_BUFFER_SIZE; 39 | extern const std::string NVB_TEST_SAMPLES; 40 | extern const std::string NVB_USE_MEAN; 41 | extern const std::string NVB_PASSED; 42 | extern const std::string NVB_RUNNING; 43 | extern const std::string NVB_WAIVED; 44 | extern const std::string NVB_NOT_FOUND; 45 | extern const std::string NVB_ERROR_STATUS; 46 | 47 | class Output { 48 | public: 49 | virtual void addTestcase(const std::string &name, const std::string &status, const std::string &msg = ""); 50 | 51 | /* 52 | * If a test case matching the specified name exists, then update the status. If no testcase with that name exists, 53 | * then add a new one and set the status. 54 | * 55 | * @param name - the name of the test case 56 | * @param status - the status (PASS, FAIL, WAIVED, NOT FOUND) 57 | * @param msg - additional details if specified 58 | */ 59 | virtual void setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg = ""); 60 | 61 | virtual void print(); 62 | 63 | /* 64 | * Records a global error 65 | * 66 | * @param errorParts - each entry in this vector is one line of an error. In JSON output, all lines are combined. 
67 | */ 68 | virtual void recordError(const std::vector<std::string> &errorParts); 69 | 70 | /* 71 | * Records a global error 72 | */ 73 | virtual void recordError(const std::string &error); 74 | 75 | /* 76 | * Records a test error 77 | * 78 | * @param errorPart1 - the first part of the error. For plain text output, this is printed on line 1. 79 | * @param errorPart2 - the second part of the error. For plain text output, this is printed on line 2. 80 | * NOTE: in JSON output, these are combined on a single line 81 | */ 82 | virtual void recordErrorCurrentTest(const std::string &errorPart1, const std::string &errorPart2); 83 | 84 | virtual void recordWarning(const std::string &warning); 85 | 86 | virtual void addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion); 87 | 88 | virtual void addTestcaseResults(const PeerValueMatrix<double> &matrix, const std::string &description); 89 | 90 | virtual void addVersionInfo(); 91 | 92 | virtual void printInfo(); 93 | 94 | virtual void recordDevices(int deviceCount); 95 | 96 | void listTestcases(const std::vector<Testcase*> &testcases); 97 | }; 98 | 99 | extern Output *output; 100 | 101 | std::string getDeviceDisplayInfo(int deviceOrdinal); 102 | 103 | #endif // OUTPUT_H_ 104 | -------------------------------------------------------------------------------- /testcase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #include "common.h" 19 | #include "output.h" 20 | #include "testcase.h" 21 | #include "inline_common.h" 22 | 23 | Testcase::Testcase(std::string key, std::string desc) : 24 | key(std::move(key)), desc(std::move(desc)) 25 | {} 26 | 27 | std::string Testcase::testKey() { return key; } 28 | std::string Testcase::testDesc() { return desc; } 29 | 30 | bool Testcase::filterHasAccessiblePeerPairs() { 31 | int deviceCount = 0; 32 | CU_ASSERT(cuDeviceGetCount(&deviceCount)); 33 | 34 | for (int currentDevice = 0; currentDevice < deviceCount; currentDevice++) { 35 | for (int peer = 0; peer < deviceCount; peer++) { 36 | int canAccessPeer = 0; 37 | 38 | if (peer == currentDevice) { 39 | continue; 40 | } 41 | 42 | CU_ASSERT(cuDeviceCanAccessPeer(&canAccessPeer, currentDevice, peer)); 43 | if (canAccessPeer) { 44 | return true; 45 | } 46 | } 47 | } 48 | 49 | return false; 50 | } 51 | 52 | bool Testcase::filterSupportsMulticast() { 53 | int deviceCount = 0; 54 | CU_ASSERT(cuDeviceGetCount(&deviceCount)); 55 | 56 | for (int currentDevice = 0; currentDevice < deviceCount; currentDevice++) { 57 | CUdevice dev; 58 | CU_ASSERT(cuDeviceGet(&dev, currentDevice)); 59 | int supportsMulticast = 0; 60 | 61 | CU_ASSERT(cuDeviceGetAttribute(&supportsMulticast, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); 62 | if (!supportsMulticast) { 63 | return false; 64 | } 65 | } 66 | 67 | return true; 68 | } 69 | 70 | #ifdef MULTINODE 71 | // Each MPI rank handles one GPU, so we simply have to check if we have more than 1 process 72 | bool Testcase::filterHasMultipleGPUsMultinode() { 73 | return worldSize > 1; 74 | } 75 | #endif 76 | 77 | void Testcase::latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency) { 78 | uint64_t n_ptrs = dataBuffer.getBufferSize() / sizeof(struct LatencyNode); 79 | 80 | if (measureDeviceToDeviceLatency) { 81 | // For device-to-device latency, create and initialize pattern on device 82 | for (uint64_t i = 0; i < n_ptrs; i++) { 83 | struct LatencyNode node; 84 | size_t nextOffset = ((i + strideLen) % n_ptrs) * sizeof(struct LatencyNode); 85 | // Set up pattern with device addresses 86 | node.next = (struct LatencyNode*)(dataBuffer.getBuffer() + nextOffset); 87 | CU_ASSERT(cuMemcpyHtoD(dataBuffer.getBuffer() + i*sizeof(struct LatencyNode), 88 | &node, sizeof(struct LatencyNode))); 89 | } 90 | } else { 91 | // For host-device latency, initialize pattern with host addresses 92 | struct LatencyNode* hostMem = (struct LatencyNode*)dataBuffer.getBuffer(); 93 | for (uint64_t i = 0; i < n_ptrs; i++) { 94 | hostMem[i].next = &hostMem[(i + strideLen) % n_ptrs]; 95 | } 96 | } 97 | } 98 | 99 | void Testcase::allToOneHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix &bandwidthValues, bool isRead) { 100 | std::vector allSrcBuffers; 101 | 102 | // allocate all src nodes up front, re-use to avoid reallocation 103 | for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 104 | allSrcBuffers.push_back(new DeviceBuffer(size, deviceId)); 105 | } 106 | 107 | for (int dstDeviceId = 0; dstDeviceId < deviceCount; dstDeviceId++) { 108 | std::vector dstBuffers; 109 | std::vector srcBuffers; 110 | 111 | for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) { 112 | if (srcDeviceId == dstDeviceId) { 113 | continue; 114 | } 115 | 116 | DeviceBuffer* dstBuffer = new DeviceBuffer(size, dstDeviceId); 117 | 118 | if (!dstBuffer->enablePeerAcess(*allSrcBuffers[srcDeviceId])) { 119 | delete dstBuffer; 120 | continue; 121 | } 122 | 123 | 
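// Peer access is available for this pair, so reuse the preallocated source
// buffer; the per-pair dstBuffer allocated above is freed after the measurement.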
srcBuffers.push_back(allSrcBuffers[srcDeviceId]); 124 | dstBuffers.push_back(dstBuffer); 125 | } 126 | // If no peer GPUs, skip measurements. 127 | if (!srcBuffers.empty()) { 128 | if (isRead) { 129 | // swap dst and src for read tests 130 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(dstBuffers, srcBuffers); 131 | } else { 132 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers); 133 | } 134 | } 135 | 136 | for (auto node : dstBuffers) { 137 | delete node; 138 | } 139 | } 140 | 141 | for (auto node : allSrcBuffers) { 142 | delete node; 143 | } 144 | } 145 | 146 | void Testcase::oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead) { 147 | std::vector<const MemcpyBuffer*> allDstBuffers; 148 | 149 | // allocate all dst nodes up front, re-use to avoid reallocation 150 | for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 151 | allDstBuffers.push_back(new DeviceBuffer(size, deviceId)); 152 | } 153 | 154 | for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) { 155 | std::vector<const MemcpyBuffer*> dstBuffers; 156 | std::vector<const MemcpyBuffer*> srcBuffers; 157 | 158 | for (int dstDeviceId = 0; dstDeviceId < deviceCount; dstDeviceId++) { 159 | if (srcDeviceId == dstDeviceId) { 160 | continue; 161 | } 162 | 163 | DeviceBuffer* srcBuffer = new DeviceBuffer(size, srcDeviceId); 164 | 165 | if (!srcBuffer->enablePeerAcess(*allDstBuffers[dstDeviceId])) { 166 | delete srcBuffer; 167 | continue; 168 | } 169 | 170 | srcBuffers.push_back(srcBuffer); 171 | dstBuffers.push_back(allDstBuffers[dstDeviceId]); 172 | } 173 | // If no peer GPUs, skip measurements. 174 | if (!srcBuffers.empty()) { 175 | if (isRead) { 176 | // swap dst and src for read tests 177 | bandwidthValues.value(0, srcDeviceId) = memcpyInstance.doMemcpy(dstBuffers, srcBuffers); 178 | } else { 179 | bandwidthValues.value(0, srcDeviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers); 180 | } 181 | } 182 | 183 | for (auto node : srcBuffers) { 184 | delete node; 185 | } 186 | } 187 | 188 | for (auto node : allDstBuffers) { 189 | delete node; 190 | } 191 | } 192 | 193 | void Testcase::allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) { 194 | for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 195 | std::vector<const MemcpyBuffer*> deviceBuffers; 196 | std::vector<const MemcpyBuffer*> hostBuffers; 197 | 198 | deviceBuffers.push_back(new DeviceBuffer(size, deviceId)); 199 | hostBuffers.push_back(new HostBuffer(size, deviceId)); 200 | 201 | for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) { 202 | if (interferenceDeviceId == deviceId) { 203 | continue; 204 | } 205 | 206 | // Double the size of the interference copy to ensure it interferes correctly 207 | deviceBuffers.push_back(new DeviceBuffer(size * 2, interferenceDeviceId)); 208 | hostBuffers.push_back(new HostBuffer(size * 2, interferenceDeviceId)); 209 | } 210 | 211 | if (sourceIsHost) { 212 | bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffers, deviceBuffers); 213 | } else { 214 | bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffers, hostBuffers); 215 | } 216 | 217 | for (auto node : deviceBuffers) { 218 | delete node; 219 | } 220 | 221 | for (auto node : hostBuffers) { 222 | delete node; 223 | } 224 | } 225 | } 226 | 227 | void Testcase::allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) { 228
| for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 229 | std::vector srcBuffers; 230 | std::vector dstBuffers; 231 | 232 | if (sourceIsHost) { 233 | srcBuffers.push_back(new HostBuffer(size, deviceId)); 234 | dstBuffers.push_back(new DeviceBuffer(size, deviceId)); 235 | 236 | // Double the size of the interference copy to ensure it interferes correctly 237 | srcBuffers.push_back(new DeviceBuffer(size * 2, deviceId)); 238 | dstBuffers.push_back(new HostBuffer(size * 2, deviceId)); 239 | } else { 240 | srcBuffers.push_back(new DeviceBuffer(size, deviceId)); 241 | dstBuffers.push_back(new HostBuffer(size, deviceId)); 242 | 243 | // Double the size of the interference copy to ensure it interferes correctly 244 | srcBuffers.push_back(new HostBuffer(size * 2, deviceId)); 245 | dstBuffers.push_back(new DeviceBuffer(size * 2, deviceId)); 246 | } 247 | 248 | for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) { 249 | if (interferenceDeviceId == deviceId) { 250 | continue; 251 | } 252 | 253 | // Double the size of the interference copy to ensure it interferes correctly 254 | srcBuffers.push_back(new DeviceBuffer(size * 2, interferenceDeviceId)); 255 | dstBuffers.push_back(new HostBuffer(size * 2, interferenceDeviceId)); 256 | 257 | srcBuffers.push_back(new HostBuffer(size * 2, interferenceDeviceId)); 258 | dstBuffers.push_back(new DeviceBuffer(size * 2, interferenceDeviceId)); 259 | } 260 | 261 | bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers); 262 | 263 | for (auto node : srcBuffers) { 264 | delete node; 265 | } 266 | 267 | for (auto node : dstBuffers) { 268 | delete node; 269 | } 270 | } 271 | } 272 | 273 | -------------------------------------------------------------------------------- /testcases_ce.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/testcases_ce.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #include <vector>
19 | 
20 | #include "common.h"
21 | #include "output.h"
22 | #include "testcase.h"
23 | #include "memcpy.h"
24 | 
25 | void HostToDeviceCE::run(unsigned long long size, unsigned long long loopCount) {
26 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
27 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
28 | 
29 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
30 |         HostBuffer hostBuffer(size, deviceId);
31 |         DeviceBuffer deviceBuffer(size, deviceId);
32 | 
33 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
34 |     }
35 | 
36 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)");
37 | }
38 | 
39 | void DeviceToHostCE::run(unsigned long long size, unsigned long long loopCount) {
40 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
41 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
42 | 
43 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
44 |         HostBuffer hostBuffer(size, deviceId);
45 |         DeviceBuffer deviceBuffer(size, deviceId);
46 | 
47 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffer, hostBuffer);
48 |     }
49 | 
50 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)");
51 | }
52 | 
53 | void HostToDeviceBidirCE::run(unsigned long long size, unsigned long long loopCount) {
54 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
55 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
56 | 
57 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
58 |         // Double the size of the interference copy to ensure it interferes correctly
59 |         HostBuffer host1(size, deviceId), host2(size * 2, deviceId);
60 |         DeviceBuffer dev1(size, deviceId), dev2(size * 2, deviceId);
61 | 
62 |         std::vector<const MemcpyBuffer*> srcBuffers = {&host1, &dev2};
63 |         std::vector<const MemcpyBuffer*> dstBuffers = {&dev1, &host2};
64 | 
65 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
66 |     }
67 | 
68 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
69 | }
70 | 
71 | void DeviceToHostBidirCE::run(unsigned long long size, unsigned long long loopCount) {
72 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
73 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
74 | 
75 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
76 |         // Double the size of the interference copy to ensure it interferes correctly
77 |         HostBuffer host1(size, deviceId), host2(size * 2, deviceId);
78 |         DeviceBuffer dev1(size, deviceId), dev2(size * 2, deviceId);
79 | 
80 |         std::vector<const MemcpyBuffer*> srcBuffers = {&dev1, &host2};
81 |         std::vector<const MemcpyBuffer*> dstBuffers = {&host1, &dev2};
82 | 
83 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
84 |     }
85 | 
86 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
87 | }
88 | 
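
In the bidirectional tests above, the reverse-direction copy exists only to keep the link busy while the forward copy is timed; it is sized at 2x so it cannot drain early and leave the tail of the timed copy running uncontended. A standalone sketch of that pattern for a single GPU, using the CUDA runtime API with a single untimed-warmup-free iteration and no error checking (all simplifications relative to nvbandwidth itself):

    #include <cstdio>

    #include <cuda_runtime.h>

    int main() {
        const size_t size = 64ull << 20;   // timed H->D copy
        void *h1, *h2, *d1, *d2;
        cudaMallocHost(&h1, size);
        cudaMallocHost(&h2, size * 2);     // reverse-direction buffers, doubled
        cudaMalloc(&d1, size);
        cudaMalloc(&d2, size * 2);

        cudaStream_t fwd, rev;
        cudaStreamCreate(&fwd);
        cudaStreamCreate(&rev);
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Launch the D->H interference copy first so the opposite direction
        // is already busy for the whole timed transfer.
        cudaMemcpyAsync(h2, d2, size * 2, cudaMemcpyDeviceToHost, rev);

        cudaEventRecord(start, fwd);
        cudaMemcpyAsync(d1, h1, size, cudaMemcpyHostToDevice, fwd);
        cudaEventRecord(stop, fwd);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        std::printf("H->D with D->H traffic: %.2f GB/s\n", (size / 1e9) / (ms / 1e3));

        cudaDeviceSynchronize();
        return 0;
    }
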
89 | // DtoD Read test - copy from dst to src (backwards) using src context
90 | void DeviceToDeviceReadCE::run(unsigned long long size, unsigned long long loopCount) {
91 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
92 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT);
93 | 
94 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
95 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
96 |             if (peerDeviceId == srcDeviceId) {
97 |                 continue;
98 |             }
99 | 
100 |             DeviceBuffer srcBuffer(size, srcDeviceId);
101 |             DeviceBuffer peerBuffer(size, peerDeviceId);
102 | 
103 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
104 |                 continue;
105 |             }
106 | 
107 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
108 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerBuffer, srcBuffer);
109 |         }
110 |     }
111 | 
112 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)");
113 | }
114 | 
115 | // DtoD Write test - copy from src to dst using src context
116 | void DeviceToDeviceWriteCE::run(unsigned long long size, unsigned long long loopCount) {
117 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
118 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
119 | 
120 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
121 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
122 |             if (peerDeviceId == srcDeviceId) {
123 |                 continue;
124 |             }
125 | 
126 |             DeviceBuffer srcBuffer(size, srcDeviceId);
127 |             DeviceBuffer peerBuffer(size, peerDeviceId);
128 | 
129 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
130 |                 continue;
131 |             }
132 | 
133 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcBuffer, peerBuffer);
134 |         }
135 |     }
136 | 
137 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)");
138 | }
139 | 
140 | // DtoD Bidir Read test - copy from dst to src (backwards) using src context
141 | void DeviceToDeviceBidirReadCE::run(unsigned long long size, unsigned long long loopCount) {
142 |     PeerValueMatrix<double> bandwidthValuesRead1(deviceCount, deviceCount, key + "_read1");
143 |     PeerValueMatrix<double> bandwidthValuesRead2(deviceCount, deviceCount, key + "_read2");
144 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
145 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW);
146 | 
147 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
148 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
149 |             if (peerDeviceId == srcDeviceId) {
150 |                 continue;
151 |             }
152 | 
153 |             // Allocate a second buffer on each device for the reverse-direction copy
154 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
155 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
156 | 
157 |             if (!src1.enablePeerAcess(peer1)) {
158 |                 continue;
159 |             }
160 | 
161 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
162 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
163 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
164 | 
165 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
166 |             bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0];
167 |             bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1];
168 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
169 |         }
170 |     }
171 | 
172 |     output->addTestcaseResults(bandwidthValuesRead1, "memcpy CE GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)");
173 |     output->addTestcaseResults(bandwidthValuesRead2, "memcpy CE GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)");
174 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
175 | }
176 | 
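
The read/write distinction in these tests is about which device issues the copy, not which way the bytes flow: a write is pushed by the data's source device, while a read swaps the buffers and is pulled by the data's destination (PREFER_DST_CONTEXT above). A sketch of the same idea with the CUDA runtime API, where d2dCopy is an illustrative name rather than anything in this project:

    #include <cuda_runtime.h>

    // pull == true mirrors the read tests: the copy's destination device
    // issues the transfer. pull == false mirrors the write tests: the
    // data's source device pushes it. The bytes move src -> dst either
    // way; only the issuing context changes.
    void d2dCopy(void* dst, int dstDev, const void* src, int srcDev,
                 size_t bytes, bool pull) {
        cudaSetDevice(pull ? dstDev : srcDev);   // pick the issuing device
        cudaMemcpyPeer(dst, dstDev, src, srcDev, bytes);
    }

On asymmetric links the two variants can report noticeably different bandwidths, which is why both matrices are produced.
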
177 | // DtoD Bidir Write test - copy from src to dst using src context
178 | void DeviceToDeviceBidirWriteCE::run(unsigned long long size, unsigned long long loopCount) {
179 |     PeerValueMatrix<double> bandwidthValuesWrite1(deviceCount, deviceCount, key + "_write1");
180 |     PeerValueMatrix<double> bandwidthValuesWrite2(deviceCount, deviceCount, key + "_write2");
181 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
182 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
183 | 
184 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
185 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
186 |             if (peerDeviceId == srcDeviceId) {
187 |                 continue;
188 |             }
189 | 
190 |             // Allocate a second buffer on each device for the reverse-direction copy
191 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
192 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
193 | 
194 |             if (!src1.enablePeerAcess(peer1)) {
195 |                 continue;
196 |             }
197 | 
198 |             // swap src and peer nodes; with PREFER_SRC_CONTEXT each copy is issued from its source buffer's context
199 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
200 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
201 | 
202 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
203 |             bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0];
204 |             bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1];
205 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
206 |         }
207 |     }
208 | 
209 |     output->addTestcaseResults(bandwidthValuesWrite1, "memcpy CE GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)");
210 |     output->addTestcaseResults(bandwidthValuesWrite2, "memcpy CE GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)");
211 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
212 | }
213 | 
214 | void DeviceLocalCopy::run(unsigned long long size, unsigned long long loopCount) {
215 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
216 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
217 | 
218 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
219 |         DeviceBuffer deviceBuffer1(size, deviceId);
220 |         DeviceBuffer deviceBuffer2(size, deviceId);
221 | 
222 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffer2, deviceBuffer1);
223 |     }
224 | 
225 |     output->addTestcaseResults(bandwidthValues, "memcpy local GPU(column) bandwidth (GB/s)");
226 | }
227 | 
228 | void AllToHostCE::run(unsigned long long size, unsigned long long loopCount) {
229 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
230 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
231 | 
232 |     allHostHelper(size, memcpyInstance, bandwidthValues, false);
233 | 
234 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)");
235 | }
236 | 
237 | void AllToHostBidirCE::run(unsigned long long size, unsigned long long loopCount) {
238 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
239 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
240 | 
241 |     allHostBidirHelper(size, memcpyInstance, bandwidthValues, false);
242 | 
243 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
244 | }
245 | 
246 | void HostToAllCE::run(unsigned long long size, unsigned long long loopCount) {
247 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
248 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
249 | 
250 |     allHostHelper(size, memcpyInstance, bandwidthValues, true);
251 | 
252 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)");
253 | }
254 | 
255 | void HostToAllBidirCE::run(unsigned long long size, unsigned long long loopCount) {
256 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
257 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
258 | 
259 |     allHostBidirHelper(size, memcpyInstance, bandwidthValues, true);
260 | 
261 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
262 | }
263 | 
264 | // Write test - copy from src to dst using src context
265 | void AllToOneWriteCE::run(unsigned long long size, unsigned long long loopCount) {
266 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
267 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
268 |     allToOneHelper(size, memcpyInstance, bandwidthValues, false);
269 | 
270 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)");
271 | }
272 | 
273 | // Read test - copy from dst to src (backwards) using src context
274 | void AllToOneReadCE::run(unsigned long long size, unsigned long long loopCount) {
275 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
276 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT, MemcpyOperation::TOTAL_BW);
277 |     allToOneHelper(size, memcpyInstance, bandwidthValues, true);
278 | 
279 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)");
280 | }
281 | 
282 | // Write test - copy from src to dst using src context
283 | void OneToAllWriteCE::run(unsigned long long size, unsigned long long loopCount) {
284 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
285 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
286 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, false);
287 | 
288 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)");
289 | }
290 | 
291 | // Read test - copy from dst to src (backwards) using src context
292 | void OneToAllReadCE::run(unsigned long long size, unsigned long long loopCount) {
293 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
294 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT, MemcpyOperation::TOTAL_BW);
295 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, true);
296 | 
297 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)");
298 | }
299 | 
--------------------------------------------------------------------------------
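
testcases_ce.cpp above drives the GPU's dedicated copy engines (DMA hardware); testcases_sm.cpp below moves the same data with copy kernels, so its numbers reflect what the streaming multiprocessors themselves can push over a link. The project's real kernels live in kernels.cu; the sketch below is a minimal grid-stride copy kernel in the same spirit, with illustrative grid dimensions.

    #include <cuda_runtime.h>

    // Grid-stride copy over 16-byte elements for fully coalesced traffic;
    // assumes 'bytes' is a multiple of sizeof(int4).
    __global__ void copyKernel(int4* __restrict__ dst, const int4* __restrict__ src, size_t n) {
        for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < n;
             i += (size_t)gridDim.x * blockDim.x) {
            dst[i] = src[i];
        }
    }

    // Issue an SM-driven copy on 'stream'.
    void smCopy(int4* dst, const int4* src, size_t bytes, cudaStream_t stream) {
        copyKernel<<<256, 512, 0, stream>>>(dst, src, bytes / sizeof(int4));
    }
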
/testcases_sm.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #include "testcase.h"
19 | #include "kernels.cuh"
20 | #include "memcpy.h"
21 | #include "common.h"
22 | #include "output.h"
23 | 
24 | void HostDeviceLatencySM::run(unsigned long long size, unsigned long long loopCount) {
25 |     PeerValueMatrix<double> latencyValues(1, deviceCount, key, perfFormatter, LATENCY);
26 |     MemPtrChaseOperation ptrChaseOp(latencyMemAccessCnt);
27 | 
28 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
29 |         HostBuffer dataBuffer(size, deviceId);
30 |         latencyHelper(dataBuffer, false);
31 |         latencyValues.value(0, deviceId) = ptrChaseOp.doPtrChase(deviceId, dataBuffer);
32 |     }
33 | 
34 |     output->addTestcaseResults(latencyValues, "memory latency SM CPU(row) <-> GPU(column) (ns)");
35 | }
36 | 
37 | void HostToDeviceSM::run(unsigned long long size, unsigned long long loopCount) {
38 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
39 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM());
40 | 
41 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
42 |         HostBuffer hostBuffer(size, deviceId);
43 |         DeviceBuffer deviceBuffer(size, deviceId);
44 | 
45 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
46 |     }
47 | 
48 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)");
49 | }
50 | 
51 | void DeviceToHostSM::run(unsigned long long size, unsigned long long loopCount) {
52 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
53 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM());
54 | 
55 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
56 |         HostBuffer hostBuffer(size, deviceId);
57 |         DeviceBuffer deviceBuffer(size, deviceId);
58 | 
59 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffer, hostBuffer);
60 |     }
61 | 
62 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)");
63 | }
64 | 
65 | void HostToDeviceBidirSM::run(unsigned long long size, unsigned long long loopCount) {
66 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
67 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp());
68 | 
69 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
70 |         HostBuffer hostBuffer(size, deviceId);
71 |         DeviceBuffer deviceBuffer(size, deviceId);
72 | 
73 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
74 |     }
75 | 
76 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
77 | }
78 | 
79 | void DeviceToHostBidirSM::run(unsigned long long size, unsigned long long loopCount) {
80 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
81 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp());
82 | 
83 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
84 |         HostBuffer hostBuffer(size, deviceId);
85 |         DeviceBuffer deviceBuffer(size, deviceId);
86 | 
87 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
88 |     }
89 | 
90 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
91 | }
92 | 
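
HostDeviceLatencySM above (and DeviceToDeviceLatencySM below) measure latency by pointer chasing: a single thread walks a chain in which each load's address depends on the result of the previous load, so no memory-level parallelism can hide the round trip, and elapsed time divided by the access count approximates per-access latency. The project's actual implementation lives in kernels.cu; a minimal sketch of such a kernel:

    #include <cuda_runtime.h>

    // 'next' is a permutation chain: next[i] gives the index to load after i.
    // A single thread issues 'steps' fully serialized, dependent loads.
    __global__ void ptrChase(const unsigned int* next, unsigned int steps, unsigned int* sink) {
        unsigned int idx = 0;
        for (unsigned int i = 0; i < steps; i++) {
            idx = next[idx];   // next address depends on this load's result
        }
        *sink = idx;           // keep the compiler from removing the chain
    }

Launched as ptrChase<<<1, 1>>>(chain, steps, sink), the kernel's elapsed time divided by steps approximates the per-access latency reported in the "(ns)" tables.
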
93 | // DtoD Read test - copy from dst to src (backwards) using src context
94 | void DeviceToDeviceReadSM::run(unsigned long long size, unsigned long long loopCount) {
95 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
96 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_DST_CONTEXT);
97 | 
98 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
99 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
100 |             if (peerDeviceId == srcDeviceId) {
101 |                 continue;
102 |             }
103 | 
104 |             DeviceBuffer srcBuffer(size, srcDeviceId);
105 |             DeviceBuffer peerBuffer(size, peerDeviceId);
106 | 
107 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
108 |                 continue;
109 |             }
110 | 
111 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
112 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerBuffer, srcBuffer);
113 |         }
114 |     }
115 | 
116 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)");
117 | }
118 | 
119 | void DeviceToDeviceLatencySM::run(unsigned long long size, unsigned long long loopCount) {
120 |     PeerValueMatrix<double> latencyValues(deviceCount, deviceCount, key, perfFormatter, LATENCY);
121 |     MemPtrChaseOperation ptrChaseOp(latencyMemAccessCnt);
122 | 
123 |     for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
124 |         DeviceBuffer peerBuffer(size, peerDeviceId);
125 |         latencyHelper(peerBuffer, true);
126 | 
127 |         for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
128 |             if (peerDeviceId == srcDeviceId) {
129 |                 continue;
130 |             }
131 | 
132 |             // Note: srcBuffer is not used in the pointer chase operation.
133 |             // It is simply used here to enable peer access.
134 |             DeviceBuffer srcBuffer(size, srcDeviceId);
135 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
136 |                 continue;
137 |             }
138 |             latencyValues.value(srcDeviceId, peerDeviceId) = ptrChaseOp.doPtrChase(srcDeviceId, peerBuffer);
139 |         }
140 |     }
141 |     output->addTestcaseResults(latencyValues, "Device to Device Latency SM GPU(row) <-> GPU(column) (ns)");
142 | }
143 | 
144 | // DtoD Write test - copy from src to dst using src context
145 | void DeviceToDeviceWriteSM::run(unsigned long long size, unsigned long long loopCount) {
146 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
147 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM());
148 | 
149 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
150 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
151 |             if (peerDeviceId == srcDeviceId) {
152 |                 continue;
153 |             }
154 | 
155 |             DeviceBuffer srcBuffer(size, srcDeviceId);
156 |             DeviceBuffer peerBuffer(size, peerDeviceId);
157 | 
158 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
159 |                 continue;
160 |             }
161 | 
162 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcBuffer, peerBuffer);
163 |         }
164 |     }
165 | 
166 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)");
167 | }
168 | 
169 | // DtoD Bidir Read test - copy to dst from src (backwards) using dst context
170 | void DeviceToDeviceBidirReadSM::run(unsigned long long size, unsigned long long loopCount) {
171 |     PeerValueMatrix<double> bandwidthValuesRead1(deviceCount, deviceCount, key + "_read1");
172 |     PeerValueMatrix<double> bandwidthValuesRead2(deviceCount, deviceCount, key + "_read2");
173 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
174 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW);
175 | 
176 | 
177 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
178 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
179 |             if (peerDeviceId == srcDeviceId) {
180 |                 continue;
181 |             }
182 | 
183 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
184 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
185 | 
186 |             if (!src1.enablePeerAcess(peer1)) {
187 |                 continue;
188 |             }
189 | 
190 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
191 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
192 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
193 | 
194 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
195 |             bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0];
196 |             bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1];
197 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
198 |         }
199 |     }
200 | 
201 |     output->addTestcaseResults(bandwidthValuesRead1, "memcpy SM GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)");
202 |     output->addTestcaseResults(bandwidthValuesRead2, "memcpy SM GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)");
203 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
204 | }
205 | 
206 | // DtoD Bidir Write test - copy from src to dst using src context
207 | void DeviceToDeviceBidirWriteSM::run(unsigned long long size, unsigned long long loopCount) {
208 |     PeerValueMatrix<double> bandwidthValuesWrite1(deviceCount, deviceCount, key + "_write1");
209 |     PeerValueMatrix<double> bandwidthValuesWrite2(deviceCount, deviceCount, key + "_write2");
210 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
211 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
212 | 
213 | 
214 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
215 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
216 |             if (peerDeviceId == srcDeviceId) {
217 |                 continue;
218 |             }
219 | 
220 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
221 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
222 | 
223 |             if (!src1.enablePeerAcess(peer1)) {
224 |                 continue;
225 |             }
226 | 
227 |             // swap src and peer nodes; with PREFER_SRC_CONTEXT each copy is issued from its source buffer's context
228 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
229 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
230 | 
231 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
232 |             bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0];
233 |             bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1];
234 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
235 |         }
236 |     }
237 | 
238 |     output->addTestcaseResults(bandwidthValuesWrite1, "memcpy SM GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)");
239 |     output->addTestcaseResults(bandwidthValuesWrite2, "memcpy SM GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)");
240 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
241 | }
242 | 
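
The bidirectional tests above run with VECTOR_BW and report each direction plus the sum; the all-to-host and host-to-all tests that follow use USE_FIRST_BW, so only the measured device's copy counts and the rest is background load; the all-to-one and one-to-all tests use TOTAL_BW. A sketch of the three reductions over per-copy results, where the free functions are illustrative rather than this project's API:

    #include <numeric>
    #include <vector>

    // bw[i] is the bandwidth measured for copy i, in GB/s.
    double totalBW(const std::vector<double>& bw) {      // TOTAL_BW: sum of all copies
        return std::accumulate(bw.begin(), bw.end(), 0.0);
    }

    double firstBW(const std::vector<double>& bw) {      // USE_FIRST_BW: only the
        return bw.front();                               // measured device's copy
    }

    // VECTOR_BW performs no reduction: each copy's bandwidth is reported
    // separately, filling the Read1/Read2 and Write1/Write2 matrices above.
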
243 | void AllToHostSM::run(unsigned long long size, unsigned long long loopCount) {
244 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
245 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
246 | 
247 |     allHostHelper(size, memcpyInstance, bandwidthValues, false);
248 | 
249 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)");
250 | }
251 | 
252 | void AllToHostBidirSM::run(unsigned long long size, unsigned long long loopCount) {
253 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
254 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
255 | 
256 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
257 |         std::vector<const MemcpyBuffer*> srcBuffers;
258 |         std::vector<const MemcpyBuffer*> dstBuffers;
259 | 
260 |         srcBuffers.push_back(new DeviceBuffer(size, deviceId));
261 |         dstBuffers.push_back(new HostBuffer(size, deviceId));
262 | 
263 |         for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) {
264 |             if (interferenceDeviceId == deviceId) {
265 |                 continue;
266 |             }
267 | 
268 |             srcBuffers.push_back(new DeviceBuffer(size, interferenceDeviceId));
269 |             dstBuffers.push_back(new HostBuffer(size, interferenceDeviceId));
270 |         }
271 | 
272 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
273 | 
274 |         for (auto node : srcBuffers) {
275 |             delete node;
276 |         }
277 | 
278 |         for (auto node : dstBuffers) {
279 |             delete node;
280 |         }
281 |     }
282 | 
283 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
284 | }
285 | 
286 | void HostToAllSM::run(unsigned long long size, unsigned long long loopCount) {
287 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
288 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
289 | 
290 |     allHostHelper(size, memcpyInstance, bandwidthValues, true);
291 | 
292 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)");
293 | }
294 | 
295 | void HostToAllBidirSM::run(unsigned long long size, unsigned long long loopCount) {
296 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
297 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
298 | 
299 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
300 |         std::vector<const MemcpyBuffer*> srcBuffers;
301 |         std::vector<const MemcpyBuffer*> dstBuffers;
302 | 
303 |         srcBuffers.push_back(new HostBuffer(size, deviceId));
304 |         dstBuffers.push_back(new DeviceBuffer(size, deviceId));
305 | 
306 |         for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) {
307 |             if (interferenceDeviceId == deviceId) {
308 |                 continue;
309 |             }
310 | 
311 |             srcBuffers.push_back(new DeviceBuffer(size, interferenceDeviceId));
312 |             dstBuffers.push_back(new HostBuffer(size, interferenceDeviceId));
313 |         }
314 | 
315 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
316 | 
317 |         for (auto node : srcBuffers) {
318 |             delete node;
319 |         }
320 | 
321 |         for (auto node : dstBuffers) {
322 |             delete node;
323 |         }
324 |     }
325 | 
326 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
327 | }
328 | 
329 | // Write test - copy from src to dst using src context
330 | void AllToOneWriteSM::run(unsigned long long size, unsigned long long loopCount) {
331 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
332 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
333 | 
334 |     allToOneHelper(size, memcpyInstance, bandwidthValues, false);
335 | 
336 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)");
337 | }
338 | 
339 | // Read test - copy from dst to src (backwards) using src context
340 | void AllToOneReadSM::run(unsigned long long size, unsigned long long loopCount) {
341 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
342 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
343 |     allToOneHelper(size, memcpyInstance, bandwidthValues, true);
344 | 
345 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)");
346 | }
347 | 
348 | // Write test - copy from src to dst using src context
349 | void OneToAllWriteSM::run(unsigned long long size, unsigned long long loopCount) {
350 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
351 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
352 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, false);
353 | 
354 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)");
355 | }
356 | 
357 | // Read test - copy from dst to src (backwards) using src context
358 | void OneToAllReadSM::run(unsigned long long size, unsigned long long loopCount) {
359 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
360 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
361 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, true);
362 | 
363 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)");
364 | }
365 | 
--------------------------------------------------------------------------------
/version.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #ifndef VERSION_H_
19 | #define VERSION_H_
20 | 
21 | #define NVBANDWIDTH_VERSION "v0.8"
22 | #ifndef GIT_VERSION
23 | #define GIT_VERSION "unknown"
24 | #endif
25 | 
26 | #endif // VERSION_H_
27 | 
--------------------------------------------------------------------------------
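
Every "(GB/s)" figure in the tables produced by these testcases is bytes moved divided by measured time, using decimal gigabytes. A sketch of the arithmetic, assuming a copy of size bytes repeated loopCount times completed in ms milliseconds:

    // GB/s with decimal gigabytes (1e9 bytes), matching the "(GB/s)" labels.
    double bandwidthGBps(unsigned long long size, unsigned long long loopCount, double ms) {
        return static_cast<double>(size) * loopCount / (ms / 1e3) / 1e9;
    }
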