├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE ├── Licenses.txt ├── README.md ├── common.h ├── debian_install.sh ├── detect_cuda_arch.cmake ├── diagrams ├── DtoDBidir.png ├── DtoHBidir.png ├── HtoDBidir.png └── measurement.png ├── error_handling.h ├── inline_common.h ├── json ├── json-forwards.h ├── json.h └── jsoncpp.cpp ├── json_output.cpp ├── json_output.h ├── kernels.cu ├── kernels.cuh ├── memcpy.cpp ├── memcpy.h ├── multinode_memcpy.cpp ├── multinode_memcpy.h ├── multinode_testcases.cpp ├── nvbandwidth.cpp ├── output.cpp ├── output.h ├── testcase.cpp ├── testcase.h ├── testcases_ce.cpp ├── testcases_sm.cpp └── version.h /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Changelog 2 | 3 | ### nvbandwidth 0.8 4 | Bug Fixes: 5 | * Device Latency Test Accuracy: 6 | * Fixed an issue where the device_to_device_latency test was incorrectly 7 | reporting host-device latency instead of device-to-device latency. 8 | * Host-device latency reports now correctly reflect C2C or PCIe latency 9 | depending on the system, while device-to-device latency reports focus on 10 | NVLINK or equivalent inter-device connections. 11 | * Adjusted the buffer size threshold used to select the copy kernel, for 12 | more accurate measurements. 13 | * Added the host name to the JSON output. 14 | * Updated the README. 15 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(nvbandwidth 4 | LANGUAGES CUDA CXX) 5 | 6 | set(CMAKE_CXX_STANDARD 17) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | set(CMAKE_CUDA_STANDARD 17) 9 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 10 | 11 | if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") 12 | # The 5.2 architecture is not supported since CUDA 13.0 13 | set(supported_archs "70" "75" "80" "86" "89" "90" "100") 14 | else () 15 | set(supported_archs "52" "70" "75" "80" "86" "89" "90" "100") 16 | endif() 17 | 18 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) 19 | message(STATUS "Detecting underlying CUDA Arch to set CMAKE_CUDA_ARCHITECTURES") 20 | include(detect_cuda_arch.cmake) 21 | # Set CMAKE_CUDA_ARCHITECTURES based on the underlying device 22 | cuda_detect_architectures(supported_archs CMAKE_CUDA_ARCHITECTURES) 23 | endif() 24 | 25 | if(NOT CMAKE_BUILD_TYPE) 26 | set(CMAKE_BUILD_TYPE "Release") 27 | endif() 28 | 29 | if(CMAKE_SYSTEM_NAME STREQUAL "Linux") 30 | file(READ "/etc/os-release" OS_RELEASE_CONTENT) 31 | # Skip static libs on Fedora - https://github.com/NVIDIA/nvbandwidth/issues/4 32 | if(NOT OS_RELEASE_CONTENT MATCHES "ID=.*fedora") 33 | set(Boost_USE_STATIC_LIBS ON) 34 | endif() 35 | else() 36 | set(Boost_USE_STATIC_LIBS ON) 37 | endif() 38 | find_package(Boost COMPONENTS program_options REQUIRED) 39 | 40 | set(src 41 | testcase.cpp 42 | testcases_ce.cpp 43 | testcases_sm.cpp 44 | kernels.cu 45 | memcpy.cpp 46 | nvbandwidth.cpp 47 | multinode_memcpy.cpp 48 | multinode_testcases.cpp 49 | output.cpp 50 | json_output.cpp 51 | json/jsoncpp.cpp 52 | ) 53 | 54 | execute_process( 55 | COMMAND git describe --always --tags 56 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} 57 | OUTPUT_VARIABLE GIT_VERSION 58 | OUTPUT_STRIP_TRAILING_WHITESPACE 59 | ) 60 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGIT_VERSION=\\\"\"${GIT_VERSION}\"\\\"") 61 | 62 | if(WIN32) 63 | set(NVML_LIB_NAME "nvml") 64 | else() 65 | set(NVML_LIB_NAME "nvidia-ml") 66 | endif() 67 | 68 |
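# Example: if the target GPUs are known in advance, the architecture
# auto-detection above can be bypassed by defining the list explicitly
# at configure time, e.g.:
#   cmake -DCMAKE_CUDA_ARCHITECTURES="80;90" .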
add_executable(nvbandwidth ${src}) 69 | target_include_directories(nvbandwidth PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} .) 70 | target_link_libraries(nvbandwidth Boost::program_options ${NVML_LIB_NAME} cuda) 71 | 72 | if (MULTINODE) 73 | find_package(MPI REQUIRED) 74 | include_directories(SYSTEM ${MPI_INCLUDE_PATH}) 75 | target_link_libraries(nvbandwidth MPI::MPI_CXX) 76 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMULTINODE") 77 | endif() 78 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /Licenses.txt: -------------------------------------------------------------------------------- 1 | JsonCpp: 2 | Copyright Baptiste Lepilleur - Public domain and MIT licenses 3 | Attribution statements: Nvidia actively chooses to accept jsoncpp as public domain where acceptable and MIT licensed where public domain is not accepted. 4 | License text ( https://github.com/open-source-parsers/jsoncpp/blob/master/LICENSE ) 5 | 6 | /*! 
7 | * The JsonCpp library's source code, including accompanying documentation, 8 | * tests and demonstration applications, are licensed under the following 9 | * conditions... 10 | * 11 | * Baptiste Lepilleur and The JsonCpp Authors explicitly disclaim copyright in all 12 | * jurisdictions which recognize such a disclaimer. In such jurisdictions, 13 | * this software is released into the Public Domain. 14 | * 15 | * In jurisdictions which do not recognize Public Domain property (e.g. Germany as of 16 | * 2010), this software is Copyright (c) 2007-2010 by Baptiste Lepilleur and 17 | * The JsonCpp Authors, and is released under the terms of the MIT License (see below). 18 | * 19 | * In jurisdictions which recognize Public Domain property, the user of this 20 | * software may choose to accept it either as 1) Public Domain, 2) under the 21 | * conditions of the MIT License (see below), or 3) under the terms of dual 22 | * Public Domain/MIT License conditions described here, as they choose. 23 | * 24 | * The MIT License is about as close to Public Domain as a license can get, and is 25 | * described in clear, concise terms at: 26 | * 27 | * http://en.wikipedia.org/wiki/MIT_License 28 | * 29 | * The full text of the MIT License follows: 30 | * 31 | * ======================================================================== 32 | * Copyright (c) 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 33 | * 34 | * Permission is hereby granted, free of charge, to any person 35 | * obtaining a copy of this software and associated documentation 36 | * files (the "Software"), to deal in the Software without 37 | * restriction, including without limitation the rights to use, copy, 38 | * modify, merge, publish, distribute, sublicense, and/or sell copies 39 | * of the Software, and to permit persons to whom the Software is 40 | * furnished to do so, subject to the following conditions: 41 | * 42 | * The above copyright notice and this permission notice shall be 43 | * included in all copies or substantial portions of the Software. 44 | * 45 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 46 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 47 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 48 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 49 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 50 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 51 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | * SOFTWARE. 53 | * ======================================================================== 54 | * (END LICENSE TEXT) 55 | * 56 | * The MIT license is compatible with both the GPL and commercial 57 | * software, affording one all of the rights of Public Domain with the 58 | * minor nuisance of being required to keep the above copyright notice 59 | * and license text in the source code. Note also that by accepting the 60 | * Public Domain "license" you can re-license your copy using whatever 61 | * license you like. 62 | */ 63 | 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nvbandwidth 2 | A tool for bandwidth measurements on NVIDIA GPUs. 3 | 4 | Measures bandwidth for various memcpy patterns across different links using copy engine or kernel copy methods. 5 | nvbandwidth reports current measured bandwidth on your system. 
Additional system-specific tuning may be required to achieve maximal peak bandwidth. 6 | 7 | ## Requirements 8 | nvbandwidth requires the installation of a CUDA toolkit and some additional Linux software components to be built and run. This section provides the relevant details. 9 | Install a CUDA toolkit (version 11.x or above). The multinode version requires CUDA toolkit 12.3 and driver 550 or above. 10 | 11 | Install a compiler package which supports C++17; GCC 7.x or above is one option. 12 | 13 | Install CMake (version 3.20 or above). 14 | CMake version 3.24 or newer is encouraged. 15 | 16 | Install the Boost program_options library (more details in the next section). 17 | 18 | Ensure that the path to the nvcc binary (installed via the toolkit) is available in the $PATH variable on Linux systems. 19 | To run nvbandwidth, the system must have a CUDA-enabled GPU and an NVIDIA display driver that is compatible with the CUDA toolkit used to build nvbandwidth. 20 | For more information, refer to https://docs.nvidia.com/deploy/cuda-compatibility/ 21 | 22 | ## Dependencies 23 | To build and run nvbandwidth, please install the Boost program_options library (https://www.boost.org/doc/libs/1_66_0/doc/html/program_options.html). 24 | 25 | Ubuntu/Debian users can run the following to install: 26 | ``` 27 | apt install libboost-program-options-dev 28 | ``` 29 | On Ubuntu/Debian, we have provided a utility script (debian_install.sh) which installs some generic software components needed for the build. 30 | The script also builds the nvbandwidth project. 31 | ``` 32 | sudo ./debian_install.sh 33 | ``` 34 | 35 | Fedora users can run the following to install: 36 | ``` 37 | sudo dnf -y install boost-devel 38 | ``` 39 | 40 | ## Build 41 | To build the `nvbandwidth` executable for single-node use: 42 | ``` 43 | cmake . 44 | make 45 | ``` 46 | You may need to set the BOOST_ROOT environment variable on Windows to tell CMake where to find your Boost installation. 47 | 48 | ## Usage 49 | ``` 50 | ./nvbandwidth -h 51 | 52 | nvbandwidth CLI: 53 | -h [ --help ] Produce help message 54 | -b [ --bufferSize ] arg (=512) Memcpy buffer size in MiB 55 | -l [ --list ] List available testcases 56 | -t [ --testcase ] arg Testcase(s) to run (by name or index) 57 | -p [ --testcasePrefixes ] arg Testcase(s) to run (by prefix) 58 | -v [ --verbose ] Verbose output 59 | -s [ --skipVerification ] Skips data verification after copy 60 | -d [ --disableAffinity ] Disable automatic CPU affinity control 61 | -i [ --testSamples ] arg (=3) Iterations of the benchmark 62 | -m [ --useMean ] Use mean instead of median for results 63 | -j [ --json ] Print output in json format instead of plain 64 | text. 65 | ``` 66 | To run all testcases: 67 | ``` 68 | ./nvbandwidth 69 | ``` 70 | 71 | To run a specific testcase: 72 | ``` 73 | ./nvbandwidth -t device_to_device_memcpy_read_ce 74 | ``` 75 | Example output: 76 | ``` 77 | Running device_to_device_memcpy_write_ce.
78 | memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s) 79 | 0 1 2 3 4 5 6 7 80 | 0 0.00 276.07 276.36 276.14 276.29 276.48 276.55 276.33 81 | 1 276.19 0.00 276.29 276.29 276.57 276.48 276.38 276.24 82 | 2 276.33 276.29 0.00 276.38 276.50 276.50 276.29 276.31 83 | 3 276.19 276.62 276.24 0.00 276.29 276.60 276.29 276.55 84 | 4 276.03 276.55 276.45 276.76 0.00 276.45 276.36 276.62 85 | 5 276.17 276.57 276.19 276.50 276.31 0.00 276.31 276.15 86 | 6 274.89 276.41 276.38 276.67 276.41 276.26 0.00 276.33 87 | 7 276.12 276.45 276.12 276.36 276.00 276.57 276.45 0.00 88 | ``` 89 | 90 | Set the number of iterations and the buffer size for copies with --testSamples and --bufferSize. 91 | 92 | ## Multinode benchmarks 93 | 94 | To build the multinode version of nvbandwidth, execute: 95 | 96 | ``` 97 | cmake -DMULTINODE=1 . 98 | make 99 | ``` 100 | 101 | Multinode nvbandwidth requires MPI; CMake will find a local installation of MPI to build and link against. Multinode operation also requires installing and setting up the IMEX service and creating the IMEX channels. IMEX is the NVIDIA Internode Memory Exchange service; it runs on each compute tray to support GPU memory export and import operations across OS domains in an NVLink multi-node deployment. To start the IMEX service, run the following command: 102 | 103 | `sudo systemctl start nvidia-imex.service` 104 | Specify the IP addresses of the cluster nodes in the /etc/nvidia-imex/nodes_config.cfg file. 105 | 106 | For example, to run multinode bandwidth tests on a system with 2 nodes and 4 GPUs per node, run: 107 | `mpirun --allow-run-as-root --map-by ppr:4:node --bind-to core -np 8 --report-bindings -q -mca btl_tcp_if_include enP5p9s0 --hostfile /etc/nvidia-imex/nodes_config.cfg ./nvbandwidth -p multinode` 108 | 109 | ### Local testing 110 | 111 | You can test it on a single-node machine (Ampere+ GPU required): 112 | 113 | ``` 114 | mpirun -n 4 ./nvbandwidth -p multinode 115 | ``` 116 | This command spawns 4 processes and runs all testcases with the "multinode" prefix. 117 | 118 | ### Running it on a cluster 119 | 120 | To run it on a cluster, submit a job to a workload scheduler that has MPI integration. Run one process per GPU. 121 | 122 | Running fewer processes than the GPU count is valid; processes will take consecutive GPUs, starting from GPU 0. 123 | 124 | Running more processes than the GPU count is not valid. 125 | 126 | All ranks in the MPI batch must be part of one multinode clique. Run one instance of nvbandwidth per node/GPU. 127 | 128 | When running under MPI, only MPI rank 0 will write its stdout to the console. Stderr, if needed, will be output by all processes. 129 | 130 | It is recommended to run only "multinode*" testcases under MPI. While any testcase will succeed, results for non-multinode testcases will only come from MPI rank 0. 131 | 132 | ## Test Details 133 | There are two types of copies implemented: Copy Engine (CE) and Streaming Multiprocessor (SM). 134 | 135 | CE copies use memcpy APIs. SM copies use kernels. 136 | 137 | SM copies will truncate the copy size to fit uniformly on the target device so that bandwidth is reported correctly. The actual byte size for the copy is: 138 | ``` 139 | (threadsPerBlock * deviceSMCount) * floor(copySize / (threadsPerBlock * deviceSMCount)) 140 | ``` 141 | 142 | threadsPerBlock is set to 512; a short sketch of this calculation follows.
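The sketch below illustrates the truncation formula using the CUDA runtime API. It is a minimal illustration, not nvbandwidth's actual source: `truncatedCopySize` is a hypothetical helper, and error checking is omitted.
```
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Largest size <= copySize that divides evenly across
// (threadsPerBlock * smCount) threads, mirroring the formula above.
static size_t truncatedCopySize(size_t copySize, int device) {
    const size_t threadsPerBlock = 512;  // value quoted above
    int smCount = 0;
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);
    const size_t unit = threadsPerBlock * (size_t)smCount;
    return (copySize / unit) * unit;  // integer division implements floor()
}

int main() {
    const size_t request = 512ull * 1024 * 1024;  // default 512 MiB buffer
    printf("actual SM copy size: %zu bytes\n", truncatedCopySize(request, 0));
    return 0;
}
```
On a GPU with 108 SMs, for instance, `unit` is 512 × 108 = 55296, so the requested copy size is rounded down to the nearest multiple of 55296 bytes.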
143 | 144 | ### Measurement Details 145 | ![](diagrams/measurement.png) 146 | 147 | A blocking kernel and CUDA events are used to measure the time to perform copies via SM or CE, and bandwidth is calculated from a series of copies. 148 | 149 | First, we enqueue a spin kernel that spins on a flag in host memory. The spin kernel spins on the device until all events for measurement have been fully enqueued into the measurement streams. This ensures that the overhead of enqueuing operations is excluded from the measurement of the actual transfer over the interconnect. Next, we enqueue a start event, a fixed count of memcpy iterations, and a stop event. Finally, we release the flag to start the measurement. 150 | 151 | This process is repeated 3 times, and the median bandwidth across the trials is reported. 152 | 153 | The number of repetitions can be overridden using the --testSamples option; to use the arithmetic mean instead of the median, specify the --useMean option. 154 | 155 | ### Unidirectional Bandwidth Tests 156 | ``` 157 | Running host_to_device_memcpy_ce. 158 | memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s) 159 | 0 1 2 3 4 5 6 7 160 | 0 26.03 25.94 25.97 26.00 26.19 25.95 26.00 25.97 161 | ``` 162 | 163 | Unidirectional tests measure the bandwidth between each pair in the output matrix individually. Traffic is not sent simultaneously. 164 | 165 | ### Bidirectional Host <-> Device Bandwidth Tests 166 | ``` 167 | Running host_to_device_bidirectional_memcpy_ce. 168 | memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s) 169 | 0 1 2 3 4 5 6 7 170 | 0 18.56 18.37 19.37 19.59 18.71 18.79 18.46 18.61 171 | ``` 172 | 173 | The setup for bidirectional host-to-device bandwidth transfers is shown below: 174 | ![](diagrams/HtoDBidir.png) 175 | 176 | **CE copies** 177 | Stream 0 (the measured stream) performs writes to the device, while the interfering stream produces reads in the opposite direction. This pattern is reversed for measuring bidirectional device-to-host bandwidth, as shown below. 178 | 179 | 180 | ![](diagrams/DtoHBidir.png) 181 | 182 | **SM copies** 183 | The test launches a kernel copy where alternating thread warps copy data in alternating directions. 184 | 185 | ### Bidirectional Device <-> Device Bandwidth Tests 186 | The setup for bidirectional device-to-device transfers is shown below: 187 | 188 | ![](diagrams/DtoDBidir.png) 189 | 190 | **CE copies** 191 | Stream 0 (the measured stream) performs writes to the peer device, while the interfering stream produces reads in the opposite direction. 192 | 193 | **SM Copies** 194 | Similar to the HtoDBidir test above, the test launches a kernel where alternating thread warps copy data in alternating directions. 195 | 196 | **Bandwidth calculation** 197 | CE bidirectional bandwidth tests calculate bandwidth on the measured stream: 198 | ``` 199 | CE bidir. bandwidth = (size of data on measured stream) / (time on measured stream) 200 | ``` 201 | The SM bidirectional test, however, launches a kernel where odd and even warps copy data in different directions, so bandwidth is calculated over the whole kernel: 202 | ``` 203 | SM bidir. bandwidth = size / (kernel time) 204 | ``` 205 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef COMMON_H_ 19 | #define COMMON_H_ 20 | 21 | #include <cuda.h> 22 | #include <nvml.h> 23 | #include <algorithm> 24 | #include <cmath> 25 | #include <cstdlib> 26 | #include <cstring> 27 | #include <iomanip> 28 | #include <iostream> 29 | #include <limits> 30 | #include <map> 31 | #include <optional> 32 | #include <sstream> 33 | #include <string> 34 | #include <thread> 35 | #include <unordered_set> 36 | #include <vector> 37 | 38 | #define STRING_LENGTH 256 39 | 40 | // Default constants 41 | const unsigned long long defaultLoopCount = 16; 42 | const unsigned long long smallBufferThreshold = 64; 43 | const unsigned long long defaultBufferSize = 512; // 512 MiB 44 | const unsigned int defaultAverageLoopCount = 3; 45 | const unsigned int _MiB = 1024 * 1024; 46 | const unsigned int _2MiB = 2 * _MiB; 47 | const unsigned int numThreadPerBlock = 512; 48 | const unsigned int strideLen = 16; /* cacheLine size 128 Bytes, 16 words */ 49 | const unsigned long latencyMemAccessCnt = 1000000; /* 1M total read accesses to gauge latency */ 50 | extern int deviceCount; 51 | extern unsigned int averageLoopCount; 52 | extern bool disableAffinity; 53 | extern bool skipVerification; 54 | extern bool useMean; 55 | extern bool jsonOutput; 56 | // Verbosity 57 | extern bool verbose; 58 | extern bool perfFormatter; 59 | 60 | #ifdef MULTINODE 61 | extern int localDevice; 62 | extern int localRank; 63 | extern int worldRank; 64 | extern int worldSize; 65 | #endif 66 | extern char localHostname[STRING_LENGTH]; 67 | 68 | class Verbosity { 69 | public: 70 | bool &controlVariable; 71 | 72 | Verbosity(bool &controlVariable): controlVariable(controlVariable) {} 73 | 74 | template <typename T> 75 | Verbosity& operator<<(T input) { 76 | if (!jsonOutput && controlVariable) std::cout << input; 77 | return *this; 78 | } 79 | 80 | using StreamType = decltype(std::cout); 81 | Verbosity &operator<<(StreamType &(*func)(StreamType &)) { 82 | if (!jsonOutput && controlVariable) { 83 | func(std::cout); 84 | } 85 | return *this; 86 | } 87 | }; 88 | extern Verbosity VERBOSE; 89 | extern Verbosity OUTPUT; 90 | 91 | #ifdef _MSC_VER 92 | #define __PRETTY_FUNCTION__ __FUNCTION__ 93 | #endif 94 | 95 | // Rounds n up to the nearest multiple of "multiple". 96 | // if n is already a multiple of "multiple", n is returned unchanged. 97 | // works for arbitrary value of "multiple". 98 | #define ROUND_UP(n, multiple) \ 99 | (((n) + ((multiple)-1)) - (((n) + ((multiple)-1)) % (multiple))) 100 | 101 | #define PROC_MASK_WORD_BITS (8 * sizeof(size_t)) 102 | 103 | #define PROC_MASK_SIZE \ 104 | ROUND_UP(std::thread::hardware_concurrency(), PROC_MASK_WORD_BITS) / 8 105 | 106 | #define PROC_MASK_QUERY_BIT(mask, proc) \ 107 | (mask[proc / PROC_MASK_WORD_BITS] & \ 108 | ((size_t)1 << (proc % PROC_MASK_WORD_BITS))) \ 109 | ?
1 \ 110 | : 0 111 | 112 | /* Set a bit in an affinity mask */ 113 | #define PROC_MASK_SET(mask, proc) \ 114 | do { \ 115 | size_t _proc = (proc); \ 116 | (mask)[_proc / PROC_MASK_WORD_BITS] |= (size_t)1 \ 117 | << (_proc % PROC_MASK_WORD_BITS); \ 118 | } while (0) 119 | 120 | /* Clear a bit in an affinity mask */ 121 | #define PROC_MASK_CLEAR(mask, proc) \ 122 | do { \ 123 | size_t _proc = (proc); \ 124 | (mask)[_proc / PROC_MASK_WORD_BITS] &= \ 125 | ~((size_t)1 << (_proc % PROC_MASK_WORD_BITS)); \ 126 | } while (0) 127 | 128 | inline size_t getFirstEnabledCPU() { 129 | size_t firstEnabledCPU = 0; 130 | size_t *procMask = (size_t *)calloc(1, PROC_MASK_SIZE); 131 | for (size_t i = 0; i < PROC_MASK_SIZE * 8; ++i) { 132 | if (PROC_MASK_QUERY_BIT(procMask, i)) { 133 | firstEnabledCPU = i; 134 | break; 135 | } 136 | } 137 | free(procMask); 138 | return firstEnabledCPU; 139 | } 140 | 141 | // Calculation and display of performance statistics 142 | // Basic online running statistics calculator, modeled after a less templated 143 | // version of boost::accumulators. 144 | class PerformanceStatistic { 145 | std::vector<double> values; 146 | 147 | public: 148 | void operator()(const double &sample) { recordSample(sample); } 149 | 150 | void recordSample(const double &sample) { 151 | auto it = std::lower_bound(values.begin(), values.end(), sample); 152 | values.insert(it, sample); 153 | } 154 | 155 | void reset(void) { values.clear(); } 156 | 157 | double sum(void) const { 158 | double total = 0.0; 159 | for (double val : values) { 160 | total += val; 161 | } 162 | return total; 163 | } 164 | 165 | size_t count(void) const { return values.size(); } 166 | 167 | double mean(void) const { 168 | return sum() / count(); 169 | } 170 | 171 | double variance(void) const { 172 | double calculated_mean = mean(); 173 | double sum_diff_squared = 0.0; 174 | for (double val : values) { 175 | double diff = val - calculated_mean; 176 | sum_diff_squared += diff * diff; 177 | } 178 | return (values.size() > 1 ? sum_diff_squared / (values.size() - 1) : 0.0); 179 | } 180 | 181 | double stddev(void) const { 182 | return (variance() > 0.0 ? std::sqrt(variance()) : 0.0); 183 | } 184 | 185 | double largest(void) const { return values.size() > 0 ? values[values.size() - 1] : 0.0; } 186 | 187 | double smallest(void) const { return values.size() > 0 ?
values[0] : 0.0; } 188 | 189 | double median(void) const { 190 | if (values.size() == 0) { 191 | return 0.0; 192 | } else if (values.size() % 2 == 0) { 193 | int idx = values.size() / 2; 194 | return (values[idx] + values[idx - 1]) / 2.0; 195 | } else { 196 | return values[values.size() / 2]; 197 | } 198 | } 199 | 200 | double returnAppropriateMetric(void) const { 201 | if (useMean) { 202 | return mean(); 203 | } else { 204 | return median(); 205 | } 206 | } 207 | }; 208 | 209 | #ifdef MULTINODE 210 | inline std::string getPaddedProcessId(int id) { 211 | // max printed number will be worldSize - 1 212 | int paddingSize = (int) log10(worldSize - 1) + 1; 213 | std::stringstream s; 214 | s << std::setfill(' ') << std::setw(paddingSize) << id; 215 | return s.str(); 216 | } 217 | #endif 218 | 219 | struct LatencyNode { 220 | struct LatencyNode *next; 221 | }; 222 | 223 | enum UnitType { 224 | BANDWIDTH, 225 | LATENCY 226 | }; 227 | 228 | inline std::string getUnitString(UnitType unitType) { 229 | switch (unitType) { 230 | case BANDWIDTH: 231 | return " +GB/s"; 232 | case LATENCY: 233 | return " -ns"; 234 | default: 235 | return ""; 236 | } 237 | } 238 | 239 | // Describe attributes of a single memcpy operation 240 | class MemcpyDescriptor { 241 | public: 242 | CUdeviceptr dst; 243 | CUdeviceptr src; 244 | CUstream stream; 245 | size_t copySize; 246 | unsigned long long loopCount; 247 | 248 | MemcpyDescriptor(CUdeviceptr dst, CUdeviceptr src, CUstream stream, size_t copySize, unsigned long long loopCount); 249 | }; 250 | 251 | 252 | #endif // COMMON_H_ 253 | -------------------------------------------------------------------------------- /debian_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Utility script that attempts to install 3 | # necessary software components needed to 4 | # build nvbandwidth 5 | 6 | apt install -y build-essential 7 | apt install -y libboost-program-options-dev 8 | apt install -y cmake 9 | output=$(cmake --version | sed -n 1p | sed 's/[^0-9]*//g') 10 | if [ $output -lt 3200 ]; then 11 | echo "Upgrade cmake version to 3.20 or above to build nvbandwidth" 12 | exit 1 13 | fi 14 | cmake . 
15 | make 16 | -------------------------------------------------------------------------------- /detect_cuda_arch.cmake: -------------------------------------------------------------------------------- 1 | include_guard(GLOBAL) 2 | 3 | # Function uses the CUDA runtime API to query the compute capability of the device, so if a user 4 | # doesn't pass any architecture options to CMake we only build the current architecture 5 | 6 | # Adapted from https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/detail/detect_architectures.cmake 7 | 8 | function(cuda_detect_architectures possible_archs_var gpu_archs) 9 | 10 | set(__gpu_archs ${${possible_archs_var}}) 11 | 12 | set(eval_file eval_gpu_archs.cu) 13 | set(eval_exe eval_gpu_archs) 14 | set(error_file eval_gpu_archs.stderr.log) 15 | 16 | if(NOT DEFINED CMAKE_CUDA_COMPILER) 17 | message(FATAL_ERROR "No CUDA compiler specified, unable to determine machine's GPUs.") 18 | endif() 19 | 20 | if(NOT EXISTS "${eval_exe}") 21 | file(WRITE ${eval_file} 22 | " 23 | #include <cstdio> 24 | #include <set> 25 | #include <string> 26 | using namespace std; 27 | int main(int argc, char** argv) { 28 | set<string> archs; 29 | int nDevices; 30 | if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) { 31 | for(int dev=0;dev #include <mpi.h> 31 | #define MPI_ABORT MPI_Abort(MPI_COMM_WORLD, 1) 32 | #else 33 | #define MPI_ABORT 34 | #endif 35 | 36 | // CUDA Error handling 37 | #define CUDA_ASSERT(x) do { \ 38 | cudaError_t cudaErr = (x); \ 39 | if ((cudaErr) != cudaSuccess) { \ 40 | std::stringstream errmsg; \ 41 | errmsg << "[" << cudaGetErrorName(cudaErr) << "] " << cudaGetErrorString(cudaErr) << " in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 42 | RecordError(errmsg); \ 43 | MPI_ABORT; \ 44 | std::exit(1); \ 45 | } \ 46 | } while ( 0 ) 47 | 48 | #define CU_ASSERT(x) do { \ 49 | CUresult cuResult = (x); \ 50 | if ((cuResult) != CUDA_SUCCESS) { \ 51 | const char *errDescStr, *errNameStr; \ 52 | cuGetErrorString(cuResult, &errDescStr); \ 53 | cuGetErrorName(cuResult, &errNameStr); \ 54 | std::stringstream errmsg; \ 55 | errmsg << "[" << errNameStr << "] " << errDescStr << " in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 56 | RecordError(errmsg); \ 57 | MPI_ABORT; \ 58 | std::exit(1); \ 59 | } \ 60 | } while ( 0 ) 61 | 62 | // NVML Error handling 63 | #define NVML_ASSERT(x) do { \ 64 | nvmlReturn_t nvmlResult = (x); \ 65 | if ((nvmlResult) != NVML_SUCCESS) { \ 66 | std::stringstream errmsg; \ 67 | errmsg << "NVML_ERROR: [" << nvmlErrorString(nvmlResult) << "] in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 68 | RecordError(errmsg); \ 69 | MPI_ABORT; \ 70 | std::exit(1); \ 71 | } \ 72 | } while ( 0 ) 73 | 74 | // Generic Error handling 75 | #define ASSERT(x) do { \ 76 | if (!(x)) { \ 77 | std::stringstream errmsg; \ 78 | errmsg << "ASSERT in expression " << #x << HOST_INFO << " in " << __PRETTY_FUNCTION__ << "() : " << __FILE__ << ":" << __LINE__ << std::endl; \ 79 | RecordError(errmsg); \ 80 | MPI_ABORT; \ 81 | std::exit(1); \ 82 | } \ 83 | } while ( 0 ) 84 | 85 | #endif // ERROR_HANDLING_H_ 86 | -------------------------------------------------------------------------------- /inline_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION &
AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef INLINE_COMMON_H_ 19 | #define INLINE_COMMON_H_ 20 | 21 | #include "common.h" 22 | #include "error_handling.h" 23 | 24 | template <typename T> struct PeerValueMatrix { 25 | std::vector<std::optional<T>> m_matrix; 26 | int m_rows, m_columns; 27 | std::string key; 28 | std::vector<std::string> column_labels; 29 | std::vector<std::string> row_labels; 30 | bool pFormatter; 31 | UnitType uType; 32 | 33 | PeerValueMatrix(int rows, int columns, std::string key = "", bool pFormatter = perfFormatter, UnitType uType = BANDWIDTH): m_matrix(rows * columns), m_rows(rows), m_columns(columns), key(key), pFormatter(perfFormatter), uType(uType) {} 34 | 35 | std::optional<T> &value(int src, int dst) { 36 | ASSERT(src >= 0 && src < m_rows); 37 | ASSERT(dst >= 0 && dst < m_columns); 38 | return m_matrix[src * m_columns + dst]; 39 | } 40 | const std::optional<T> &value(int src, int dst) const { 41 | ASSERT(src >= 0 && src < m_rows); 42 | ASSERT(dst >= 0 && dst < m_columns); 43 | return m_matrix[src * m_columns + dst]; 44 | } 45 | 46 | void setRowLabels(std::vector<std::string> _row_labels) { 47 | row_labels = _row_labels; 48 | } 49 | 50 | void setColumnLabels(std::vector<std::string> _column_labels) { 51 | column_labels = _column_labels; 52 | } 53 | }; 54 | 55 | template <typename T> 56 | std::ostream &operator<<(std::ostream &o, const PeerValueMatrix<T> &matrix) { 57 | // This assumes T is numeric 58 | T maxVal = std::numeric_limits<T>::min(); 59 | T minVal = std::numeric_limits<T>::max(); 60 | T sum = 0; 61 | int count = 0; 62 | 63 | // First square of the table should be blank, calculate and print appropriately many spaces 64 | int columnIdWidth = 2; 65 | for (auto s : matrix.row_labels) { 66 | columnIdWidth = std::max(columnIdWidth, (int) s.size()); 67 | } 68 | 69 | for (int i = 0; i < columnIdWidth; i++) { 70 | o << " "; 71 | } 72 | 73 | for (int currentDevice = 0; currentDevice < matrix.m_columns; currentDevice++) { 74 | if (matrix.column_labels.size() > 0) { 75 | o << std::setw(10) << matrix.column_labels[currentDevice]; 76 | } else { 77 | o << std::setw(10) << currentDevice; 78 | } 79 | } 80 | o << std::endl; 81 | 82 | for (int currentDevice = 0; currentDevice < matrix.m_rows; currentDevice++) { 83 | if (matrix.row_labels.size() > 0) { 84 | o << std::setw(columnIdWidth) << matrix.row_labels[currentDevice]; 85 | } else { 86 | o << std::setw(2) << currentDevice; 87 | } 88 | 89 | for (int peer = 0; peer < matrix.m_columns; peer++) { 90 | std::optional<T> val = matrix.value(currentDevice, peer); 91 | if (val) { 92 | o << std::setw(10) << val.value(); 93 | } else { 94 | o << std::setw(10) << "N/A"; 95 | } 96 | sum += val.value_or(0.0); 97 | maxVal = std::max(maxVal, val.value_or(0.0)); 98 | minVal = std::min(minVal, val.value_or(0.0)); 99 | if (val.value_or(0.0) > 0) count++; 100 | } 101 | o << std::endl; 102 | } 103 | o << std::endl; 104 | if (matrix.pFormatter) { 105 | o << "&&&& PERF " << matrix.key << " " << sum <<
getUnitString(matrix.uType) << std::endl; 106 | } else { 107 | o << "SUM " << matrix.key << " " << sum << std::endl; 108 | } 109 | 110 | VERBOSE << "MIN " << matrix.key << " " << minVal << '\n'; 111 | VERBOSE << "MAX " << matrix.key << " " << maxVal << '\n'; 112 | VERBOSE << "AVG " << matrix.key << " " << sum / count << '\n'; 113 | return o; 114 | } 115 | 116 | // NUMA optimal affinity 117 | inline void setOptimalCpuAffinity(int cudaDeviceID) { 118 | #ifdef _WIN32 119 | // NVML doesn't support setting affinity on Windows 120 | return; 121 | #endif 122 | if (disableAffinity) { 123 | return; 124 | } 125 | 126 | nvmlDevice_t device; 127 | CUuuid dev_uuid; 128 | 129 | std::stringstream s; 130 | std::unordered_set<int> dashPos {0, 4, 6, 8, 10}; 131 | 132 | CU_ASSERT(cuDeviceGetUuid(&dev_uuid, cudaDeviceID)); 133 | 134 | s << "GPU"; 135 | for (int i = 0; i < 16; i++) { 136 | if (dashPos.count(i)) { 137 | s << '-'; 138 | } 139 | s << std::hex << std::setfill('0') << std::setw(2) << (0xFF & (int)dev_uuid.bytes[i]); 140 | } 141 | 142 | NVML_ASSERT(nvmlDeviceGetHandleByUUID(s.str().c_str(), &device)); 143 | nvmlReturn_t result = nvmlDeviceSetCpuAffinity(device); 144 | if (result != NVML_ERROR_NOT_SUPPORTED) { 145 | NVML_ASSERT(result); 146 | } 147 | } 148 | 149 | inline bool isMemoryOwnedByCUDA(void *memory) { 150 | CUmemorytype memorytype; 151 | CUresult status = cuPointerGetAttribute(&memorytype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)memory); 152 | if (status == CUDA_ERROR_INVALID_VALUE) { 153 | return false; 154 | } else { 155 | CU_ASSERT(status); 156 | return true; 157 | } 158 | } 159 | 160 | #endif // INLINE_COMMON_H_ 161 | -------------------------------------------------------------------------------- /json/json-forwards.h: -------------------------------------------------------------------------------- 1 | /// Json-cpp amalgamated forward header (http://jsoncpp.sourceforge.net/). 2 | /// It is intended to be used with #include "json/json-forwards.h" 3 | /// This header provides forward declaration for all JsonCpp types. 4 | 5 | // ////////////////////////////////////////////////////////////////////// 6 | // Beginning of content of file: LICENSE 7 | // ////////////////////////////////////////////////////////////////////// 8 | 9 | /* 10 | The JsonCpp library's source code, including accompanying documentation, 11 | tests and demonstration applications, are licensed under the following 12 | conditions... 13 | 14 | Baptiste Lepilleur and The JsonCpp Authors explicitly disclaim copyright in all 15 | jurisdictions which recognize such a disclaimer. In such jurisdictions, 16 | this software is released into the Public Domain. 17 | 18 | In jurisdictions which do not recognize Public Domain property (e.g. Germany as of 19 | 2010), this software is Copyright (c) 2007-2010 by Baptiste Lepilleur and 20 | The JsonCpp Authors, and is released under the terms of the MIT License (see below). 21 | 22 | In jurisdictions which recognize Public Domain property, the user of this 23 | software may choose to accept it either as 1) Public Domain, 2) under the 24 | conditions of the MIT License (see below), or 3) under the terms of dual 25 | Public Domain/MIT License conditions described here, as they choose.
26 | 27 | The MIT License is about as close to Public Domain as a license can get, and is 28 | described in clear, concise terms at: 29 | 30 | http://en.wikipedia.org/wiki/MIT_License 31 | 32 | The full text of the MIT License follows: 33 | 34 | ======================================================================== 35 | Copyright (c) 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 36 | 37 | Permission is hereby granted, free of charge, to any person 38 | obtaining a copy of this software and associated documentation 39 | files (the "Software"), to deal in the Software without 40 | restriction, including without limitation the rights to use, copy, 41 | modify, merge, publish, distribute, sublicense, and/or sell copies 42 | of the Software, and to permit persons to whom the Software is 43 | furnished to do so, subject to the following conditions: 44 | 45 | The above copyright notice and this permission notice shall be 46 | included in all copies or substantial portions of the Software. 47 | 48 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 49 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 50 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 51 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 52 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 53 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 54 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 55 | SOFTWARE. 56 | ======================================================================== 57 | (END LICENSE TEXT) 58 | 59 | The MIT license is compatible with both the GPL and commercial 60 | software, affording one all of the rights of Public Domain with the 61 | minor nuisance of being required to keep the above copyright notice 62 | and license text in the source code. Note also that by accepting the 63 | Public Domain "license" you can re-license your copy using whatever 64 | license you like. 65 | 66 | */ 67 | 68 | // ////////////////////////////////////////////////////////////////////// 69 | // End of content of file: LICENSE 70 | // ////////////////////////////////////////////////////////////////////// 71 | 72 | 73 | 74 | 75 | 76 | #ifndef JSON_FORWARD_AMALGAMATED_H_INCLUDED 77 | # define JSON_FORWARD_AMALGAMATED_H_INCLUDED 78 | /// If defined, indicates that the source file is amalgamated 79 | /// to prevent private header inclusion. 80 | #define JSON_IS_AMALGAMATION 81 | 82 | // ////////////////////////////////////////////////////////////////////// 83 | // Beginning of content of file: include/json/version.h 84 | // ////////////////////////////////////////////////////////////////////// 85 | 86 | #ifndef JSON_VERSION_H_INCLUDED 87 | #define JSON_VERSION_H_INCLUDED 88 | 89 | // Note: version must be updated in three places when doing a release. This 90 | // annoying process ensures that amalgamate, CMake, and meson all report the 91 | // correct version. 92 | // 1. /meson.build 93 | // 2. /include/json/version.h 94 | // 3. /CMakeLists.txt 95 | // IMPORTANT: also update the SOVERSION!! 
96 | 97 | #define JSONCPP_VERSION_STRING "1.9.5" 98 | #define JSONCPP_VERSION_MAJOR 1 99 | #define JSONCPP_VERSION_MINOR 9 100 | #define JSONCPP_VERSION_PATCH 5 101 | #define JSONCPP_VERSION_QUALIFIER 102 | #define JSONCPP_VERSION_HEXA \ 103 | ((JSONCPP_VERSION_MAJOR << 24) | (JSONCPP_VERSION_MINOR << 16) | \ 104 | (JSONCPP_VERSION_PATCH << 8)) 105 | 106 | #ifdef JSONCPP_USING_SECURE_MEMORY 107 | #undef JSONCPP_USING_SECURE_MEMORY 108 | #endif 109 | #define JSONCPP_USING_SECURE_MEMORY 0 110 | // If non-zero, the library zeroes any memory that it has allocated before 111 | // it frees its memory. 112 | 113 | #endif // JSON_VERSION_H_INCLUDED 114 | 115 | // ////////////////////////////////////////////////////////////////////// 116 | // End of content of file: include/json/version.h 117 | // ////////////////////////////////////////////////////////////////////// 118 | 119 | 120 | 121 | 122 | 123 | 124 | // ////////////////////////////////////////////////////////////////////// 125 | // Beginning of content of file: include/json/allocator.h 126 | // ////////////////////////////////////////////////////////////////////// 127 | 128 | // Copyright 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 129 | // Distributed under MIT license, or public domain if desired and 130 | // recognized in your jurisdiction. 131 | // See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE 132 | 133 | #ifndef JSON_ALLOCATOR_H_INCLUDED 134 | #define JSON_ALLOCATOR_H_INCLUDED 135 | 136 | #include <cstring> 137 | #include <memory> 138 | 139 | #pragma pack(push) 140 | #pragma pack() 141 | 142 | namespace Json { 143 | template <typename T> class SecureAllocator { 144 | public: 145 | // Type definitions 146 | using value_type = T; 147 | using pointer = T*; 148 | using const_pointer = const T*; 149 | using reference = T&; 150 | using const_reference = const T&; 151 | using size_type = std::size_t; 152 | using difference_type = std::ptrdiff_t; 153 | 154 | /** 155 | * Allocate memory for N items using the standard allocator. 156 | */ 157 | pointer allocate(size_type n) { 158 | // allocate using "global operator new" 159 | return static_cast<pointer>(::operator new(n * sizeof(T))); 160 | } 161 | 162 | /** 163 | * Release memory which was allocated for N items at pointer P. 164 | * 165 | * The memory block is filled with zeroes before being released. 166 | */ 167 | void deallocate(pointer p, size_type n) { 168 | // memset_s is used because memset may be optimized away by the compiler 169 | memset_s(p, n * sizeof(T), 0, n * sizeof(T)); 170 | // free using "global operator delete" 171 | ::operator delete(p); 172 | } 173 | 174 | /** 175 | * Construct an item in-place at pointer P. 176 | */ 177 | template <typename... Args> void construct(pointer p, Args&&... args) { 178 | // construct using "placement new" and "perfect forwarding" 179 | ::new (static_cast<void*>(p)) T(std::forward<Args>(args)...); 180 | } 181 | 182 | size_type max_size() const { return size_t(-1) / sizeof(T); } 183 | 184 | pointer address(reference x) const { return std::addressof(x); } 185 | 186 | const_pointer address(const_reference x) const { return std::addressof(x); } 187 | 188 | /** 189 | * Destroy an item in-place at pointer P.
190 | */ 191 | void destroy(pointer p) { 192 | // destroy using "explicit destructor" 193 | p->~T(); 194 | } 195 | 196 | // Boilerplate 197 | SecureAllocator() {} 198 | template <typename U> SecureAllocator(const SecureAllocator<U>&) {} 199 | template <typename U> struct rebind { using other = SecureAllocator<U>; }; 200 | }; 201 | 202 | template <typename T, typename U> 203 | bool operator==(const SecureAllocator<T>&, const SecureAllocator<U>&) { 204 | return true; 205 | } 206 | 207 | template <typename T, typename U> 208 | bool operator!=(const SecureAllocator<T>&, const SecureAllocator<U>&) { 209 | return false; 210 | } 211 | 212 | } // namespace Json 213 | 214 | #pragma pack(pop) 215 | 216 | #endif // JSON_ALLOCATOR_H_INCLUDED 217 | 218 | // ////////////////////////////////////////////////////////////////////// 219 | // End of content of file: include/json/allocator.h 220 | // ////////////////////////////////////////////////////////////////////// 221 | 222 | 223 | 224 | 225 | 226 | 227 | // ////////////////////////////////////////////////////////////////////// 228 | // Beginning of content of file: include/json/config.h 229 | // ////////////////////////////////////////////////////////////////////// 230 | 231 | // Copyright 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 232 | // Distributed under MIT license, or public domain if desired and 233 | // recognized in your jurisdiction. 234 | // See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE 235 | 236 | #ifndef JSON_CONFIG_H_INCLUDED 237 | #define JSON_CONFIG_H_INCLUDED 238 | #include <cstddef> 239 | #include <cstdint> 240 | #include <istream> 241 | #include <memory> 242 | #include <ostream> 243 | #include <sstream> 244 | #include <string> 245 | #include <type_traits> 246 | 247 | // If non-zero, the library uses exceptions to report bad input instead of C 248 | // assertion macros. The default is to use exceptions. 249 | #ifndef JSON_USE_EXCEPTION 250 | #define JSON_USE_EXCEPTION 1 251 | #endif 252 | 253 | // Temporary, tracked for removal with issue #982. 254 | #ifndef JSON_USE_NULLREF 255 | #define JSON_USE_NULLREF 1 256 | #endif 257 | 258 | /// If defined, indicates that the source file is amalgamated 259 | /// to prevent private header inclusion. 260 | /// Remarks: it is automatically defined in the generated amalgamated header.
261 | // #define JSON_IS_AMALGAMATION 262 | 263 | // Export macros for DLL visibility 264 | #if defined(JSON_DLL_BUILD) 265 | #if defined(_MSC_VER) || defined(__MINGW32__) 266 | #define JSON_API __declspec(dllexport) 267 | #define JSONCPP_DISABLE_DLL_INTERFACE_WARNING 268 | #elif defined(__GNUC__) || defined(__clang__) 269 | #define JSON_API __attribute__((visibility("default"))) 270 | #endif // if defined(_MSC_VER) 271 | 272 | #elif defined(JSON_DLL) 273 | #if defined(_MSC_VER) || defined(__MINGW32__) 274 | #define JSON_API __declspec(dllimport) 275 | #define JSONCPP_DISABLE_DLL_INTERFACE_WARNING 276 | #endif // if defined(_MSC_VER) 277 | #endif // ifdef JSON_DLL_BUILD 278 | 279 | #if !defined(JSON_API) 280 | #define JSON_API 281 | #endif 282 | 283 | #if defined(_MSC_VER) && _MSC_VER < 1800 284 | #error \ 285 | "ERROR: Visual Studio 12 (2013) with _MSC_VER=1800 is the oldest supported compiler with sufficient C++11 capabilities" 286 | #endif 287 | 288 | #if defined(_MSC_VER) && _MSC_VER < 1900 289 | // As recommended at 290 | // https://stackoverflow.com/questions/2915672/snprintf-and-visual-studio-2010 291 | extern JSON_API int msvc_pre1900_c99_snprintf(char* outBuf, size_t size, 292 | const char* format, ...); 293 | #define jsoncpp_snprintf msvc_pre1900_c99_snprintf 294 | #else 295 | #define jsoncpp_snprintf std::snprintf 296 | #endif 297 | 298 | // If JSON_NO_INT64 is defined, then Json only support C++ "int" type for 299 | // integer 300 | // Storages, and 64 bits integer support is disabled. 301 | // #define JSON_NO_INT64 1 302 | 303 | // JSONCPP_OVERRIDE is maintained for backwards compatibility of external tools. 304 | // C++11 should be used directly in JSONCPP. 305 | #define JSONCPP_OVERRIDE override 306 | 307 | #ifdef __clang__ 308 | #if __has_extension(attribute_deprecated_with_message) 309 | #define JSONCPP_DEPRECATED(message) __attribute__((deprecated(message))) 310 | #endif 311 | #elif defined(__GNUC__) // not clang (gcc comes later since clang emulates gcc) 312 | #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) 313 | #define JSONCPP_DEPRECATED(message) __attribute__((deprecated(message))) 314 | #elif (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) 315 | #define JSONCPP_DEPRECATED(message) __attribute__((__deprecated__)) 316 | #endif // GNUC version 317 | #elif defined(_MSC_VER) // MSVC (after clang because clang on Windows emulates 318 | // MSVC) 319 | #define JSONCPP_DEPRECATED(message) __declspec(deprecated(message)) 320 | #endif // __clang__ || __GNUC__ || _MSC_VER 321 | 322 | #if !defined(JSONCPP_DEPRECATED) 323 | #define JSONCPP_DEPRECATED(message) 324 | #endif // if !defined(JSONCPP_DEPRECATED) 325 | 326 | #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 6)) 327 | #define JSON_USE_INT64_DOUBLE_CONVERSION 1 328 | #endif 329 | 330 | #if !defined(JSON_IS_AMALGAMATION) 331 | 332 | #include "allocator.h" 333 | #include "version.h" 334 | 335 | #endif // if !defined(JSON_IS_AMALGAMATION) 336 | 337 | namespace Json { 338 | using Int = int; 339 | using UInt = unsigned int; 340 | #if defined(JSON_NO_INT64) 341 | using LargestInt = int; 342 | using LargestUInt = unsigned int; 343 | #undef JSON_HAS_INT64 344 | #else // if defined(JSON_NO_INT64) 345 | // For Microsoft Visual use specific types as long long is not supported 346 | #if defined(_MSC_VER) // Microsoft Visual Studio 347 | using Int64 = __int64; 348 | using UInt64 = unsigned __int64; 349 | #else // if defined(_MSC_VER) // Other platforms, use long long 350 | using Int64 = int64_t; 351 | 
using UInt64 = uint64_t; 352 | #endif // if defined(_MSC_VER) 353 | using LargestInt = Int64; 354 | using LargestUInt = UInt64; 355 | #define JSON_HAS_INT64 356 | #endif // if defined(JSON_NO_INT64) 357 | 358 | template <typename T> 359 | using Allocator = 360 | typename std::conditional<JSONCPP_USING_SECURE_MEMORY, SecureAllocator<T>, 361 | std::allocator<T>>::type; 362 | using String = std::basic_string<char, std::char_traits<char>, Allocator<char>>; 363 | using IStringStream = 364 | std::basic_istringstream<String::value_type, String::traits_type, 365 | String::allocator_type>; 366 | using OStringStream = 367 | std::basic_ostringstream<String::value_type, String::traits_type, 368 | String::allocator_type>; 369 | using IStream = std::istream; 370 | using OStream = std::ostream; 371 | } // namespace Json 372 | 373 | // Legacy names (formerly macros). 374 | using JSONCPP_STRING = Json::String; 375 | using JSONCPP_ISTRINGSTREAM = Json::IStringStream; 376 | using JSONCPP_OSTRINGSTREAM = Json::OStringStream; 377 | using JSONCPP_ISTREAM = Json::IStream; 378 | using JSONCPP_OSTREAM = Json::OStream; 379 | 380 | #endif // JSON_CONFIG_H_INCLUDED 381 | 382 | // ////////////////////////////////////////////////////////////////////// 383 | // End of content of file: include/json/config.h 384 | // ////////////////////////////////////////////////////////////////////// 385 | 386 | 387 | 388 | 389 | 390 | 391 | // ////////////////////////////////////////////////////////////////////// 392 | // Beginning of content of file: include/json/forwards.h 393 | // ////////////////////////////////////////////////////////////////////// 394 | 395 | // Copyright 2007-2010 Baptiste Lepilleur and The JsonCpp Authors 396 | // Distributed under MIT license, or public domain if desired and 397 | // recognized in your jurisdiction. 398 | // See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE 399 | 400 | #ifndef JSON_FORWARDS_H_INCLUDED 401 | #define JSON_FORWARDS_H_INCLUDED 402 | 403 | #if !defined(JSON_IS_AMALGAMATION) 404 | #include "config.h" 405 | #endif // if !defined(JSON_IS_AMALGAMATION) 406 | 407 | namespace Json { 408 | 409 | // writer.h 410 | class StreamWriter; 411 | class StreamWriterBuilder; 412 | class Writer; 413 | class FastWriter; 414 | class StyledWriter; 415 | class StyledStreamWriter; 416 | 417 | // reader.h 418 | class Reader; 419 | class CharReader; 420 | class CharReaderBuilder; 421 | 422 | // json_features.h 423 | class Features; 424 | 425 | // value.h 426 | using ArrayIndex = unsigned int; 427 | class StaticString; 428 | class Path; 429 | class PathArgument; 430 | class Value; 431 | class ValueIteratorBase; 432 | class ValueIterator; 433 | class ValueConstIterator; 434 | 435 | } // namespace Json 436 | 437 | #endif // JSON_FORWARDS_H_INCLUDED 438 | 439 | // ////////////////////////////////////////////////////////////////////// 440 | // End of content of file: include/json/forwards.h 441 | // ////////////////////////////////////////////////////////////////////// 442 | 443 | 444 | 445 | 446 | 447 | #endif //ifndef JSON_FORWARD_AMALGAMATED_H_INCLUDED 448 | -------------------------------------------------------------------------------- /json_output.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | #include "common.h" 22 | #include "json_output.h" 23 | #include "version.h" 24 | 25 | const std::string NVB_TITLE("nvbandwidth"); 26 | const std::string NVB_HOST_NAME("Hostname"); 27 | const std::string NVB_CUDA_RUNTIME_VERSION("CUDA Runtime Version"); 28 | const std::string NVB_DEVICE_INFO("GPU Device info"); 29 | const std::string NVB_DEVICE_LIST("GPU Device list"); 30 | const std::string NVB_DRIVER_VERSION("Driver Version"); 31 | const std::string NVB_GIT_VERSION("git_version"); 32 | const std::string NVB_VERSION("version"); 33 | const std::string NVB_ERROR("error"); 34 | const std::string NVB_WARNING("warning"); 35 | const std::string NVB_TESTCASES("testcases"); 36 | const std::string NVB_TESTCASE_NAME("name"); 37 | const std::string NVB_TESTCASE_ERROR(NVB_ERROR); 38 | const std::string NVB_STATUS("status"); 39 | const std::string NVB_BW_DESCRIPTION("bandwidth_description"); 40 | const std::string NVB_BW_MATRIX("bandwidth_matrix"); 41 | const std::string NVB_BW_SUM("sum"); 42 | const std::string NVB_BW_MAX("max"); 43 | const std::string NVB_BW_MIN("min"); 44 | const std::string NVB_BW_AVG("average"); 45 | const std::string NVB_BUFFER_SIZE("bufferSize"); 46 | const std::string NVB_TEST_SAMPLES("testSamples"); 47 | const std::string NVB_USE_MEAN("useMean"); 48 | const std::string NVB_PASSED("Passed"); 49 | const std::string NVB_RUNNING("Running"); 50 | const std::string NVB_WAIVED("Waived"); 51 | const std::string NVB_NOT_FOUND("Not Found"); 52 | const std::string NVB_ERROR_STATUS("Error"); 53 | 54 | JsonOutput::JsonOutput(bool _shouldOutput) { 55 | shouldOutput = _shouldOutput; 56 | } 57 | 58 | void JsonOutput::addTestcaseResults(const PeerValueMatrix<double> &matrix, const std::string &description) { 59 | assert(m_root[NVB_TITLE][NVB_TESTCASES].isArray() && m_root[NVB_TITLE][NVB_TESTCASES].size() > 0); 60 | 61 | unsigned int size = m_root[NVB_TITLE][NVB_TESTCASES].size(); 62 | Json::Value &testcase = m_root[NVB_TITLE][NVB_TESTCASES][size-1]; 63 | 64 | double maxVal = std::numeric_limits<double>::min(); 65 | double minVal = std::numeric_limits<double>::max(); 66 | double sum = 0; 67 | int count = 0; 68 | 69 | for (int currentDevice = 0; currentDevice < matrix.m_rows; currentDevice++) { 70 | Json::Value row; 71 | for (int peer = 0; peer < matrix.m_columns; peer++) { 72 | std::optional<double> val = matrix.value(currentDevice, peer); 73 | if (val) { 74 | std::stringstream buf; 75 | buf << val.value(); 76 | row.append(buf.str()); 77 | } else { 78 | row.append("N/A"); 79 | } 80 | sum += val.value_or(0.0); 81 | maxVal = std::max(maxVal, val.value_or(0.0)); 82 | minVal = std::min(minVal, val.value_or(0.0)); 83 | if (val.value_or(0.0) > 0) count++; 84 | } 85 | testcase[NVB_BW_MATRIX].append(row); 86 | } 87 | 88 | testcase[NVB_BW_SUM] = sum; 89 | testcase[NVB_BW_DESCRIPTION] = description; 90 | testcase[NVB_STATUS] = NVB_PASSED; 91 | 92 | if (verbose) { 93 | testcase[NVB_BW_MIN] = minVal; 94 | testcase[NVB_BW_MAX] = maxVal; 95 | testcase[NVB_BW_AVG] = sum/count; 96 | } 97 | } 98 | 99 | void JsonOutput::addTestcase(const std::string
&name, const std::string &status, const std::string &msg) { 100 | Json::Value testcase; 101 | testcase[NVB_TESTCASE_NAME] = name; 102 | testcase[NVB_STATUS] = status; 103 | m_root[NVB_TITLE][NVB_TESTCASES].append(testcase); 104 | } 105 | 106 | void JsonOutput::recordErrorCurrentTest(const std::string &errorPart1, const std::string &errorPart2) { 107 | bool testCaseExists = false; 108 | if (m_root[NVB_TITLE][NVB_TESTCASES].isArray()) { 109 | Json::Value &testcases = m_root[NVB_TITLE][NVB_TESTCASES]; 110 | unsigned int size = testcases.size(); 111 | if (size > 0) { 112 | testcases[size-1][NVB_TESTCASE_ERROR] = errorPart1 + " " + errorPart2; 113 | testCaseExists = true; 114 | } 115 | } 116 | 117 | if (!testCaseExists) { 118 | std::vector<std::string> errors; 119 | errors.emplace_back(errorPart1); 120 | errors.emplace_back(errorPart2); 121 | recordError(errors); 122 | } 123 | } 124 | 125 | void JsonOutput::setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg) { 126 | bool testCaseExists = false; 127 | if (m_root[NVB_TITLE][NVB_TESTCASES].isArray()) { 128 | Json::Value &testcases = m_root[NVB_TITLE][NVB_TESTCASES]; 129 | unsigned int size = testcases.size(); 130 | if (size > 0 && testcases[size-1][NVB_TESTCASE_NAME].asString() == name) { 131 | testcases[size-1][NVB_STATUS] = status; 132 | testCaseExists = true; 133 | } 134 | } 135 | 136 | if (!testCaseExists) { 137 | addTestcase(name, status); 138 | } 139 | } 140 | 141 | void JsonOutput::recordError(const std::string &error) { 142 | m_root[NVB_TITLE][NVB_ERROR] = error; 143 | print(); 144 | } 145 | 146 | void JsonOutput::recordError(const std::vector<std::string> &errorParts) { 147 | std::stringstream buf; 148 | bool first = true; 149 | 150 | for (auto &part : errorParts) { 151 | if (first) { 152 | buf << part << ":"; 153 | first = false; 154 | } else { 155 | buf << " " << part; 156 | } 157 | } 158 | m_root[NVB_TITLE][NVB_ERROR] = buf.str(); 159 | } 160 | 161 | void JsonOutput::recordWarning(const std::string &warning) { 162 | m_root[NVB_TITLE][NVB_WARNING] = warning; 163 | } 164 | 165 | void JsonOutput::addVersionInfo() { 166 | m_root[NVB_TITLE][NVB_VERSION] = NVBANDWIDTH_VERSION; 167 | m_root[NVB_TITLE][NVB_GIT_VERSION] = GIT_VERSION; 168 | } 169 | 170 | void JsonOutput::addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion) { 171 | m_root[NVB_TITLE][NVB_CUDA_RUNTIME_VERSION] = cudaVersion; 172 | m_root[NVB_TITLE][NVB_DRIVER_VERSION] = driverVersion; 173 | } 174 | 175 | void JsonOutput::recordDevices(int deviceCount) { 176 | Json::Value deviceList; 177 | 178 | for (int iDev = 0; iDev < deviceCount; iDev++) { 179 | std::stringstream buf; 180 | buf << iDev << ": " << getDeviceDisplayInfo(iDev) << ": (" << localHostname << ")"; 181 | deviceList.append(buf.str()); 182 | } 183 | m_root[NVB_TITLE][NVB_DEVICE_LIST] = deviceList; 184 | } 185 | 186 | void JsonOutput::print() { 187 | if (shouldOutput) { 188 | std::cout << m_root.toStyledString() << std::endl; 189 | } 190 | } 191 | 192 | void JsonOutput::printInfo() { 193 | // NO-OP 194 | } 195 | -------------------------------------------------------------------------------- /json_output.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef JSON_OUTPUT_H_ 19 | #define JSON_OUTPUT_H_ 20 | 21 | #include 22 | #include 23 | 24 | #include "common.h" 25 | #include "output.h" 26 | 27 | class JsonOutput : public Output { 28 | public: 29 | JsonOutput(bool shouldOutput); 30 | 31 | void addTestcase(const std::string &name, const std::string &status, const std::string &msg = ""); 32 | 33 | void setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg = ""); 34 | 35 | void print(); 36 | 37 | void recordError(const std::string &error); 38 | 39 | void recordError(const std::vector<std::string> &errorParts); 40 | 41 | void recordErrorCurrentTest(const std::string &errorPart1, const std::string &errorPart2); 42 | 43 | void recordWarning(const std::string &warning); 44 | 45 | void addVersionInfo(); 46 | 47 | void addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion); 48 | 49 | void addTestcaseResults(const PeerValueMatrix<double> &matrix, const std::string &description); 50 | 51 | void printInfo(); 52 | 53 | void recordDevices(int deviceCount); 54 | 55 | private: 56 | bool shouldOutput; 57 | Json::Value m_root; 58 | }; 59 | 60 | #endif // JSON_OUTPUT_H_ 61 | -------------------------------------------------------------------------------- /kernels.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #include "kernels.cuh" 19 | 20 | __global__ void simpleCopyKernel(unsigned long long loopCount, uint4 *dst, uint4 *src) { 21 | for (unsigned int i = 0; i < loopCount; i++) { 22 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 23 | size_t offset = idx * sizeof(uint4); 24 | uint4* dst_uint4 = reinterpret_cast<uint4*>((char*)dst + offset); 25 | uint4* src_uint4 = reinterpret_cast<uint4*>((char*)src + offset); 26 | __stcg(dst_uint4, __ldcg(src_uint4)); 27 | } 28 | } 29 | 30 | __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, uint4* dst, uint4* src, size_t chunkSizeInElement) { 31 | unsigned long long from = blockDim.x * blockIdx.x + threadIdx.x; 32 | unsigned long long bigChunkSizeInElement = chunkSizeInElement / 12; 33 | dst += from; 34 | src += from; 35 | uint4* dstBigEnd = dst + (bigChunkSizeInElement * 12) * totalThreadCount; 36 | uint4* dstEnd = dst + chunkSizeInElement * totalThreadCount; 37 | 38 | for (unsigned int i = 0; i < loopCount; i++) { 39 | uint4* cdst = dst; 40 | uint4* csrc = src; 41 | 42 | while (cdst < dstBigEnd) { 43 | uint4 pipe_0 = *csrc; csrc += totalThreadCount; 44 | uint4 pipe_1 = *csrc; csrc += totalThreadCount; 45 | uint4 pipe_2 = *csrc; csrc += totalThreadCount; 46 | uint4 pipe_3 = *csrc; csrc += totalThreadCount; 47 | uint4 pipe_4 = *csrc; csrc += totalThreadCount; 48 | uint4 pipe_5 = *csrc; csrc += totalThreadCount; 49 | uint4 pipe_6 = *csrc; csrc += totalThreadCount; 50 | uint4 pipe_7 = *csrc; csrc += totalThreadCount; 51 | uint4 pipe_8 = *csrc; csrc += totalThreadCount; 52 | uint4 pipe_9 = *csrc; csrc += totalThreadCount; 53 | uint4 pipe_10 = *csrc; csrc += totalThreadCount; 54 | uint4 pipe_11 = *csrc; csrc += totalThreadCount; 55 | 56 | *cdst = pipe_0; cdst += totalThreadCount; 57 | *cdst = pipe_1; cdst += totalThreadCount; 58 | *cdst = pipe_2; cdst += totalThreadCount; 59 | *cdst = pipe_3; cdst += totalThreadCount; 60 | *cdst = pipe_4; cdst += totalThreadCount; 61 | *cdst = pipe_5; cdst += totalThreadCount; 62 | *cdst = pipe_6; cdst += totalThreadCount; 63 | *cdst = pipe_7; cdst += totalThreadCount; 64 | *cdst = pipe_8; cdst += totalThreadCount; 65 | *cdst = pipe_9; cdst += totalThreadCount; 66 | *cdst = pipe_10; cdst += totalThreadCount; 67 | *cdst = pipe_11; cdst += totalThreadCount; 68 | } 69 | 70 | while (cdst < dstEnd) { 71 | *cdst = *csrc; cdst += totalThreadCount; csrc += totalThreadCount; 72 | } 73 | } 74 | } 75 | 76 | // This kernel performs a split warp copy, alternating copy directions across warps.
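// Editor's note (illustrative, not in the original source): with warpSize == 32, odd-numbered warps stream src -> dst while even-numbered warps stream dst -> src; e.g. the lanes of warp 1 read src[32..63] and write dst[32..63], while the lanes of warp 0 read dst[0..31] and write src[0..31]. A single launch therefore drives traffic in both directions over the same link.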
77 | __global__ void splitWarpCopyKernel(unsigned long long loopCount, uint4 *dst, uint4 *src) { 78 | for (unsigned int i = 0; i < loopCount; i++) { 79 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 80 | unsigned int globalWarpId = idx / warpSize; 81 | unsigned int warpLaneId = idx % warpSize; 82 | uint4* dst_uint4; 83 | uint4* src_uint4; 84 | 85 | // alternate copy directions across warps 86 | if (globalWarpId & 0x1) { 87 | // odd warp 88 | dst_uint4 = dst + (globalWarpId * warpSize + warpLaneId); 89 | src_uint4 = src + (globalWarpId * warpSize + warpLaneId); 90 | } else { 91 | // even warp 92 | dst_uint4 = src + (globalWarpId * warpSize + warpLaneId); 93 | src_uint4 = dst + (globalWarpId * warpSize + warpLaneId); 94 | } 95 | 96 | __stcg(dst_uint4, __ldcg(src_uint4)); 97 | } 98 | } 99 | 100 | __global__ void ptrChasingKernel(struct LatencyNode *data, size_t size, unsigned int accesses, unsigned int targetBlock) { 101 | struct LatencyNode *p = data; 102 | if (blockIdx.x != targetBlock) return; 103 | for (auto i = 0; i < accesses; ++i) { 104 | p = p->next; 105 | } 106 | 107 | // avoid compiler optimization 108 | if (p == nullptr) { 109 | __trap(); 110 | } 111 | } 112 | 113 | static __device__ __noinline__ 114 | void mc_st_u32(unsigned int *dst, unsigned int v) { 115 | #if __CUDA_ARCH__ >= 900 116 | asm volatile ("multimem.st.u32 [%0], %1;" :: "l"(dst), "r" (v)); 117 | #endif 118 | } 119 | 120 | static __device__ __noinline__ 121 | void mc_ld_u32(unsigned int *dst, const unsigned int *src) { 122 | #if __CUDA_ARCH__ >= 900 123 | asm volatile ("multimem.ld_reduce.and.b32 %0, [%1];" : "=r"((*dst)) : "l" (src)); 124 | #endif 125 | } 126 | 127 | // Writes from regular memory to multicast memory 128 | __global__ void multicastCopyKernel(unsigned long long loopCount, unsigned int* __restrict__ dst, unsigned int* __restrict__ src, size_t nElems) { 129 | const size_t totalThreadCount = blockDim.x * gridDim.x; 130 | const size_t offset = blockDim.x * blockIdx.x + threadIdx.x; 131 | unsigned int* const enddst = dst + nElems; 132 | dst += offset; 133 | src += offset; 134 | 135 | for (unsigned int i = 0; i < loopCount; i++) { 136 | // Reset pointers to src and dst chunks. 
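// Editorial note (based on PTX multimem semantics; not in the original source): each thread then walks its chunk with a stride of totalThreadCount, and every mc_st_u32() issues a multimem.st, so the fabric replicates each store to all devices bound to the multicast object instead of copying device-by-device.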
137 | unsigned int* cur_src_ptr = src; 138 | unsigned int* cur_dst_ptr = dst; 139 | #pragma unroll(12) 140 | while (cur_dst_ptr < enddst) { 141 | mc_st_u32(cur_dst_ptr, *cur_src_ptr); 142 | cur_dst_ptr += totalThreadCount; 143 | cur_src_ptr += totalThreadCount; 144 | } 145 | } 146 | } 147 | 148 | double latencyPtrChaseKernel(const int srcId, void* data, size_t size, unsigned long long latencyMemAccessCnt, unsigned smCount) { 149 | CUstream stream; 150 | int device, clock_rate_khz; 151 | double latencySum = 0.0, finalLatencyPerAccessNs = 0.0; 152 | CUcontext srcCtx; 153 | cudaEvent_t start, end; 154 | float latencyMs = 0; 155 | 156 | CUDA_ASSERT(cudaEventCreate(&start)); 157 | CUDA_ASSERT(cudaEventCreate(&end)); 158 | 159 | CU_ASSERT(cuDevicePrimaryCtxRetain(&srcCtx, srcId)); 160 | CU_ASSERT(cuCtxSetCurrent(srcCtx)); 161 | 162 | CU_ASSERT(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); 163 | CU_ASSERT(cuCtxGetDevice(&device)); 164 | CU_ASSERT(cuDeviceGetAttribute(&clock_rate_khz, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); 165 | 166 | for (int targetBlock = 0; targetBlock < smCount; ++targetBlock) { 167 | CUDA_ASSERT(cudaEventRecord(start, stream)); 168 | ptrChasingKernel <<< smCount, 1, 0, stream>>> ((struct LatencyNode*) data, size, latencyMemAccessCnt / smCount, targetBlock); 169 | CUDA_ASSERT(cudaEventRecord(end, stream)); 170 | CUDA_ASSERT(cudaGetLastError()); 171 | CU_ASSERT(cuStreamSynchronize(stream)); 172 | cudaEventElapsedTime(&latencyMs, start, end); 173 | latencySum += (latencyMs / 1000); 174 | } 175 | finalLatencyPerAccessNs = (latencySum * 1.0E9) / (latencyMemAccessCnt); 176 | 177 | CUDA_ASSERT(cudaEventDestroy(start)); 178 | CUDA_ASSERT(cudaEventDestroy(end)); 179 | 180 | return finalLatencyPerAccessNs; 181 | } 182 | 183 | size_t copyKernel(MemcpyDescriptor &desc) { 184 | CUdevice dev; 185 | CUcontext ctx; 186 | 187 | CU_ASSERT(cuStreamGetCtx(desc.stream, &ctx)); 188 | CU_ASSERT(cuCtxGetDevice(&dev)); 189 | 190 | int numSm; 191 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 192 | unsigned int totalThreadCount = numSm * numThreadPerBlock; 193 | 194 | // If the user-provided buffer size is smaller than the default buffer size, 195 | // we use the simple copy kernel for our bandwidth test. 196 | // This is done so that no truncation of the buffer size occurs. 197 | // Please note that to achieve peak bandwidth, it is suggested to use the 198 | // default buffer size, which in turn triggers the use of the optimized 199 | // kernel.
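// Illustrative arithmetic (editor's example, not from the original source): assuming smallBufferThreshold * _MiB works out to 16 MiB, a 4 MiB copy takes the branch below with numUint4 = 4 MiB / sizeof(uint4) = 262144, giving block = 1024 threads and grid = 262144 / 1024 = 256 blocks.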
200 | if (desc.copySize < (smallBufferThreshold * _MiB)) { 201 | // copy size is rounded down to 16 bytes 202 | unsigned int numUint4 = desc.copySize / sizeof(uint4); 203 | // we allow max 1024 threads per block, and then scale out the copy across multiple blocks 204 | dim3 block(std::min(numUint4, static_cast<unsigned int>(1024))); 205 | dim3 grid(numUint4/block.x); 206 | simpleCopyKernel <<<grid, block, 0, desc.stream>>> (desc.loopCount, (uint4 *)desc.dst, (uint4 *)desc.src); 207 | return numUint4 * sizeof(uint4); 208 | } 209 | 210 | // adjust size to elements (size is multiple of MB, so no truncation here) 211 | size_t sizeInElement = desc.copySize / sizeof(uint4); 212 | // this truncates the copy 213 | sizeInElement = totalThreadCount * (sizeInElement / totalThreadCount); 214 | 215 | size_t chunkSizeInElement = sizeInElement / totalThreadCount; 216 | 217 | dim3 gridDim(numSm, 1, 1); 218 | dim3 blockDim(numThreadPerBlock, 1, 1); 219 | stridingMemcpyKernel<<<gridDim, blockDim, 0, desc.stream>>> (totalThreadCount, desc.loopCount, (uint4 *)desc.dst, (uint4 *)desc.src, chunkSizeInElement); 220 | 221 | return sizeInElement * sizeof(uint4); 222 | } 223 | 224 | size_t copyKernelSplitWarp(MemcpyDescriptor &desc) { 225 | CUdevice dev; 226 | CUcontext ctx; 227 | 228 | CU_ASSERT(cuStreamGetCtx(desc.stream, &ctx)); 229 | CU_ASSERT(cuCtxGetDevice(&dev)); 230 | 231 | int numSm; 232 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 233 | 234 | // copy size is rounded down to 16 bytes 235 | unsigned int numUint4 = desc.copySize / sizeof(uint4); 236 | 237 | // we allow max 1024 threads per block, and then scale out the copy across multiple blocks 238 | dim3 block(std::min(numUint4, static_cast<unsigned int>(1024))); 239 | dim3 grid(numUint4/block.x); 240 | splitWarpCopyKernel <<<grid, block, 0, desc.stream>>> (desc.loopCount, (uint4 *)desc.dst, (uint4 *)desc.src); 241 | return numUint4 * sizeof(uint4); 242 | } 243 | 244 | size_t multicastCopy(CUdeviceptr dstBuffer, CUdeviceptr srcBuffer, size_t size, CUstream stream, unsigned long long loopCount) { 245 | CUdevice dev; 246 | CUcontext ctx; 247 | 248 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 249 | CU_ASSERT(cuCtxGetDevice(&dev)); 250 | 251 | int numSm; 252 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 253 | // adjust size to elements (size is multiple of MB, so no truncation here) 254 | size_t sizeInElement = size / sizeof(unsigned); 255 | dim3 gridDim(numSm, 1, 1); 256 | dim3 blockDim(numThreadPerBlock, 1, 1); 257 | multicastCopyKernel<<<gridDim, blockDim, 0, stream>>> (loopCount, (unsigned *)dstBuffer, (unsigned *)srcBuffer, sizeInElement); 258 | return sizeInElement * sizeof(unsigned); 259 | } 260 | 261 | __global__ void spinKernelDevice(volatile int *latch, const unsigned long long timeoutClocks) { 262 | unsigned long long endTime = clock64() + timeoutClocks; 263 | while (!*latch) { 264 | if (timeoutClocks != ~0ULL && clock64() > endTime) { 265 | break; 266 | } 267 | } 268 | } 269 | 270 | CUresult spinKernel(volatile int *latch, CUstream stream, unsigned long long timeoutMs) { 271 | int clocksPerMs = 0; 272 | CUcontext ctx; 273 | CUdevice dev; 274 | 275 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 276 | CU_ASSERT(cuCtxGetDevice(&dev)); 277 | 278 | CU_ASSERT(cuDeviceGetAttribute(&clocksPerMs, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev)); 279 | 280 | unsigned long long timeoutClocks = clocksPerMs * timeoutMs; 281 | 282 | spinKernelDevice<<<1, 1, 0, stream>>>(latch, timeoutClocks); 283 | 284 | return CUDA_SUCCESS; 285 | } 286 | 287 | __global__ void spinKernelDeviceMultistage(volatile int *latch1, volatile int *latch2,
const unsigned long long timeoutClocks) { 288 | if (latch1) { 289 | unsigned long long endTime = clock64() + timeoutClocks; 290 | while (!*latch1) { 291 | if (timeoutClocks != ~0ULL && clock64() > endTime) { 292 | return; 293 | } 294 | } 295 | 296 | *latch2 = 1; 297 | } 298 | 299 | unsigned long long endTime = clock64() + timeoutClocks; 300 | while (!*latch2) { 301 | if (timeoutClocks != ~0ULL && clock64() > endTime) { 302 | break; 303 | } 304 | } 305 | } 306 | 307 | // Implements a 2-stage spin kernel for multi-node synchronization. 308 | // One of the host nodes releases the first latch; the kernel then 309 | // releases the second latch, which is polled by all other devices. 310 | // The latch1 argument is optional: if non-null, the kernel spins on it until it is released, and then releases latch2. 311 | // The latch2 argument is mandatory; the kernel spins on it until it is released. 312 | // The timeoutMs argument applies to each stage separately. 313 | // However, since each kernel spins on only one stage, total runtime is still bounded by timeoutMs. 314 | CUresult spinKernelMultistage(volatile int *latch1, volatile int *latch2, CUstream stream, unsigned long long timeoutMs) { 315 | int clocksPerMs = 0; 316 | CUcontext ctx; 317 | CUdevice dev; 318 | 319 | ASSERT(latch2 != nullptr); 320 | 321 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 322 | CU_ASSERT(cuCtxGetDevice(&dev)); 323 | 324 | CU_ASSERT(cuDeviceGetAttribute(&clocksPerMs, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev)); 325 | 326 | unsigned long long timeoutClocks = clocksPerMs * timeoutMs; 327 | 328 | spinKernelDeviceMultistage<<<1, 1, 0, stream>>>(latch1, latch2, timeoutClocks); 329 | 330 | return CUDA_SUCCESS; 331 | } 332 | 333 | __global__ void memsetKernelDevice(CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements) { 334 | unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; 335 | unsigned int* buf = reinterpret_cast<unsigned int*>(buffer); 336 | unsigned int* pat = reinterpret_cast<unsigned int*>(pattern); 337 | 338 | if (idx < num_elements) { 339 | buf[idx] = pat[idx % num_pattern_elements]; 340 | } 341 | } 342 | 343 | // This kernel clears memory locations in the buffer based on warp parity. 344 | // If clearOddWarpIndexed is true, it clears buffer locations indexed by odd warps. 345 | // Otherwise, it clears buffer locations indexed by even warps.
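// Editorial example (not in the original source): with warpSize == 32 and clearOddWarpIndexed == true, warp 1 zeroes buf[32..63] and warp 3 zeroes buf[96..127], while even-indexed warps leave their uint4 elements intact; a second call with clearOddWarpIndexed == false clears the complement.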
346 | __global__ void memclearKernelByWarpParityDevice(CUdeviceptr buffer, bool clearOddWarpIndexed) { 347 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 348 | uint4* buf = reinterpret_cast<uint4*>(buffer); 349 | unsigned int globalWarpId = idx / warpSize; 350 | unsigned int thread_idx_in_warp = idx % warpSize; 351 | 352 | if (clearOddWarpIndexed) { 353 | // clear memory locations in buffer indexed by odd warps 354 | if (globalWarpId & 0x1) { 355 | buf[globalWarpId * warpSize + thread_idx_in_warp] = make_uint4(0x0, 0x0, 0x0, 0x0); 356 | } 357 | } else { 358 | // clear memory locations in buffer indexed by even warps 359 | if (!(globalWarpId & 0x1)) { 360 | buf[globalWarpId * warpSize + thread_idx_in_warp] = make_uint4(0x0, 0x0, 0x0, 0x0); 361 | } 362 | } 363 | } 364 | 365 | __global__ void memcmpKernelDevice(CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 366 | unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; 367 | unsigned int* buf = reinterpret_cast<unsigned int*>(buffer); 368 | unsigned int* pat = reinterpret_cast<unsigned int*>(pattern); 369 | 370 | if (idx < num_elements) { 371 | if (buf[idx] != pat[idx % num_pattern_elements]) { 372 | if (atomicCAS((int*)errorFlag, 0, 1) == 0) { 373 | // have the first thread that detects a mismatch print the error message 374 | printf(" Invalid value when checking the pattern at %p\n", (void*)((char*)buffer)); 375 | printf(" Current offset : %llu \n", idx); 376 | return; 377 | } 378 | } 379 | } 380 | } 381 | 382 | __global__ void multicastMemcmpKernelDevice(CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 383 | unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; 384 | unsigned int* buf = reinterpret_cast<unsigned int*>(buffer); 385 | unsigned int* pat = reinterpret_cast<unsigned int*>(pattern); 386 | 387 | if (idx < num_elements) { 388 | unsigned buf_val; 389 | mc_ld_u32(&buf_val, &buf[idx]); 390 | if (buf_val != pat[idx % num_pattern_elements]) { 391 | if (atomicCAS((int*)errorFlag, 0, 1) == 0) { 392 | // have the first thread that detects a mismatch print the error message 393 | printf(" Invalid value when checking the pattern at %p\n", (void*)((char*)buffer)); 394 | printf(" Current offset : %llu \n", idx); 395 | return; 396 | } 397 | } 398 | } 399 | } 400 | 401 | CUresult memsetKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements) { 402 | unsigned threadsPerBlock = 1024; 403 | unsigned long long blocks = (num_elements + threadsPerBlock - 1) / threadsPerBlock; 404 | 405 | memsetKernelDevice<<<blocks, threadsPerBlock, 0, stream>>>(buffer, pattern, num_elements, num_pattern_elements); 406 | CUDA_ASSERT(cudaGetLastError()); 407 | return CUDA_SUCCESS; 408 | } 409 | 410 | CUresult memclearKernelByWarpParity(CUstream stream, CUdeviceptr buffer, size_t size, bool clearOddWarpIndexed) { 411 | CUdevice dev; 412 | CUcontext ctx; 413 | 414 | CU_ASSERT(cuStreamGetCtx(stream, &ctx)); 415 | CU_ASSERT(cuCtxGetDevice(&dev)); 416 | 417 | int numSm; 418 | CU_ASSERT(cuDeviceGetAttribute(&numSm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); 419 | // copy size is rounded down to 16 bytes 420 | unsigned int numUint4 = size / sizeof(uint4); 421 | 422 | // we allow max 1024 threads per block, and then scale out the copy across multiple blocks 423 | dim3 block(std::min(numUint4, static_cast<unsigned int>(1024))); 424 | 425 | dim3 grid(numUint4/block.x); 426 | memclearKernelByWarpParityDevice <<<grid, block, 0, stream>>>
(buffer, clearOddWarpIndexed); 427 | CUDA_ASSERT(cudaGetLastError()); 428 | return CUDA_SUCCESS; 429 | } 430 | 431 | CUresult memcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 432 | unsigned threadsPerBlock = 1024; 433 | unsigned long long blocks = (num_elements + threadsPerBlock - 1) / threadsPerBlock; 434 | 435 | memcmpKernelDevice<<<blocks, threadsPerBlock, 0, stream>>>(buffer, pattern, num_elements, num_pattern_elements, errorFlag); 436 | CUDA_ASSERT(cudaGetLastError()); 437 | return CUDA_SUCCESS; 438 | } 439 | 440 | CUresult multicastMemcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag) { 441 | unsigned threadsPerBlock = 1024; 442 | unsigned long long blocks = (num_elements + threadsPerBlock - 1) / threadsPerBlock; 443 | 444 | multicastMemcmpKernelDevice<<<blocks, threadsPerBlock, 0, stream>>>(buffer, pattern, num_elements, num_pattern_elements, errorFlag); 445 | CUDA_ASSERT(cudaGetLastError()); 446 | return CUDA_SUCCESS; 447 | } 448 | 449 | void preloadKernels(int deviceCount) { 450 | cudaFuncAttributes unused; 451 | for (int iDev = 0; iDev < deviceCount; iDev++) { 452 | cudaSetDevice(iDev); 453 | cudaFuncGetAttributes(&unused, &stridingMemcpyKernel); 454 | cudaFuncGetAttributes(&unused, &spinKernelDevice); 455 | cudaFuncGetAttributes(&unused, &spinKernelDeviceMultistage); 456 | cudaFuncGetAttributes(&unused, &simpleCopyKernel); 457 | cudaFuncGetAttributes(&unused, &splitWarpCopyKernel); 458 | cudaFuncGetAttributes(&unused, &multicastCopyKernel); 459 | cudaFuncGetAttributes(&unused, &ptrChasingKernel); 460 | cudaFuncGetAttributes(&unused, &multicastCopyKernel); 461 | cudaFuncGetAttributes(&unused, &memsetKernelDevice); 462 | cudaFuncGetAttributes(&unused, &memcmpKernelDevice); 463 | cudaFuncGetAttributes(&unused, &multicastMemcmpKernelDevice); 464 | } 465 | } 466 | 467 | 468 | 469 | -------------------------------------------------------------------------------- /kernels.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #ifndef KERNELS_CUH_ 19 | #define KERNELS_CUH_ 20 | 21 | #include 22 | #include "common.h" 23 | #include "inline_common.h" 24 | 25 | const unsigned long long DEFAULT_SPIN_KERNEL_TIMEOUT_MS = 10000ULL; // 10 seconds 26 | 27 | size_t copyKernel(MemcpyDescriptor &desc); 28 | size_t copyKernelSplitWarp(MemcpyDescriptor &desc); 29 | size_t multicastCopy(CUdeviceptr dstBuffer, CUdeviceptr srcBuffer, size_t size, CUstream stream, unsigned long long loopCount); 30 | CUresult spinKernel(volatile int *latch, CUstream stream, unsigned long long timeoutMs = DEFAULT_SPIN_KERNEL_TIMEOUT_MS); 31 | CUresult spinKernelMultistage(volatile int *latch1, volatile int *latch2, CUstream stream, unsigned long long timeoutMs = DEFAULT_SPIN_KERNEL_TIMEOUT_MS); 32 | void preloadKernels(int deviceCount); 33 | double latencyPtrChaseKernel(const int srcId, void* data, size_t size, unsigned long long latencyMemAccessCnt, unsigned smCount); 34 | CUresult memsetKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements); 35 | CUresult memcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag); 36 | CUresult multicastMemcmpKernel(CUstream stream, CUdeviceptr buffer, CUdeviceptr pattern, unsigned long long num_elements, unsigned int num_pattern_elements, CUdeviceptr errorFlag); 37 | 38 | CUresult memclearKernelByWarpParity(CUstream stream, CUdeviceptr buffer, size_t size, bool clearOddWarpIndexed); 39 | #endif // KERNELS_CUH_ 40 | -------------------------------------------------------------------------------- /memcpy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #ifndef MEMCPY_H_ 19 | #define MEMCPY_H_ 20 | 21 | #include 22 | #include "common.h" 23 | 24 | class MemcpyBuffer { 25 | protected: 26 | void* buffer{}; 27 | size_t bufferSize; 28 | public: 29 | MemcpyBuffer(size_t bufferSize); 30 | virtual ~MemcpyBuffer() {} 31 | CUdeviceptr getBuffer() const; 32 | size_t getBufferSize() const; 33 | 34 | virtual int getBufferIdx() const = 0; 35 | virtual CUcontext getPrimaryCtx() const = 0; 36 | virtual std::string getBufferString() const = 0; 37 | // In MPI configuration we want to avoid using blocking functions such as cuStreamSynchronize to adhere to MPI notion of progress 38 | // For more details see https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#mpi-progress 39 | virtual CUresult streamSynchronizeWrapper(CUstream stream) const; 40 | virtual int getMPIRank() const; 41 | }; 42 | 43 | // Represents the host buffer abstraction 44 | class HostBuffer : public MemcpyBuffer { 45 | public: 46 | // NUMA affinity is set here through allocation of memory in the socket group where `targetDeviceId` resides 47 | HostBuffer(size_t bufferSize, int targetDeviceId); 48 | ~HostBuffer(); 49 | 50 | int getBufferIdx() const override; 51 | CUcontext getPrimaryCtx() const override; 52 | virtual std::string getBufferString() const override; 53 | }; 54 | 55 | // Represents the device buffer and context abstraction 56 | class DeviceBuffer : public MemcpyBuffer { 57 | private: 58 | int deviceIdx; 59 | CUcontext primaryCtx{}; 60 | public: 61 | DeviceBuffer(size_t bufferSize, int deviceIdx); 62 | ~DeviceBuffer(); 63 | 64 | int getBufferIdx() const override; 65 | CUcontext getPrimaryCtx() const override; 66 | virtual std::string getBufferString() const override; 67 | 68 | bool enablePeerAcess(const DeviceBuffer &peerBuffer); 69 | }; 70 | 71 | // Specifies the preferred node's context to do the operation from 72 | // It's only a preference because if the preferred node is a HostBuffer, it has no context and will fall back to the other node 73 | enum ContextPreference { 74 | PREFER_SRC_CONTEXT, // Prefer the source buffer's context if available 75 | PREFER_DST_CONTEXT // Prefer the destination buffer's context if available 76 | }; 77 | 78 | class MemcpyOperation; 79 | 80 | // forward declaration 81 | class NodeHelper; 82 | 83 | class MemcpyDispatchInfo { 84 | public: 85 | std::vector<CUcontext> contexts; 86 | std::vector<CUstream> streams; 87 | std::vector<const MemcpyBuffer*> srcBuffers; 88 | std::vector<const MemcpyBuffer*> dstBuffers; 89 | std::vector<int> originalRanks; 90 | std::vector<size_t> adjustedCopySizes; 91 | std::shared_ptr<NodeHelper> nodeHelper; 92 | MemcpyDispatchInfo(std::vector<const MemcpyBuffer*> srcBuffers, std::vector<const MemcpyBuffer*> dstBuffers, std::vector<CUcontext> contexts, std::vector<int> originalRanks = {}); 93 | }; 94 | 95 | class NodeHelper { 96 | public: 97 | virtual MemcpyDispatchInfo dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers, ContextPreference ctxPreference) = 0; 98 | virtual double calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount) = 0; 99 | virtual double calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) = 0; 100 | virtual double calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) = 0; 101 | virtual std::vector<double> calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks) = 0; 102 | virtual void synchronizeProcess() = 0; 103 | // In MPI configuration we want to avoid using blocking functions such as cuStreamSynchronize to adhere to MPI notion of progress 104 | // For more details see https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#mpi-progress 105 |
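// (Editorial note: a typical non-blocking implementation busy-polls cuStreamQuery() while interleaving MPI_Iprobe() so the MPI runtime can make progress; see MPIstreamSyncHelper() in multinode_memcpy.cpp below.)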
virtual CUresult streamSynchronizeWrapper(CUstream stream) const = 0; 106 | 107 | // stream blocking functions 108 | virtual void streamBlockerReset() = 0; 109 | virtual void streamBlockerRelease() = 0; 110 | virtual void streamBlockerBlock(CUstream stream) = 0; 111 | }; 112 | 113 | class NodeHelperSingle : public NodeHelper { 114 | private: 115 | volatile int* blockingVarHost; 116 | public: 117 | NodeHelperSingle(); 118 | ~NodeHelperSingle(); 119 | MemcpyDispatchInfo dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers, ContextPreference ctxPreference); 120 | double calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount); 121 | double calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 122 | double calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 123 | std::vector<double> calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks); 124 | void synchronizeProcess(); 125 | CUresult streamSynchronizeWrapper(CUstream stream) const; 126 | 127 | // stream blocking functions 128 | void streamBlockerReset(); 129 | void streamBlockerRelease(); 130 | void streamBlockerBlock(CUstream stream); 131 | }; 132 | 133 | class MemcpyInitiator { 134 | public: 135 | // Pure virtual function for implementation of the actual memcpy function 136 | // return actual bytes copied 137 | // This can vary from copySize because SM copies truncate the copy to achieve max bandwidth 138 | virtual size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor) = 0; 139 | // Calculate the truncated sizes used by copy kernels 140 | virtual size_t getAdjustedCopySize(size_t size, CUstream stream) = 0; 141 | // Fill buffer with a pattern 142 | virtual void memsetPattern(MemcpyDispatchInfo &info) const = 0; 143 | // Compare buffer with a pattern 144 | virtual void memcmpPattern(MemcpyDispatchInfo &info) const = 0; 145 | // Adjust the bandwidth before final reporting 146 | virtual unsigned long long getAdjustedBandwidth(unsigned long long bandwidth) = 0; 147 | }; 148 | 149 | class MemcpyInitiatorSM : public MemcpyInitiator { 150 | public: 151 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 152 | // Calculate the truncated sizes used by copy kernels 153 | size_t getAdjustedCopySize(size_t size, CUstream stream); 154 | // Fill buffer with a pattern 155 | void memsetPattern(MemcpyDispatchInfo &info) const; 156 | // Compare buffer with a pattern 157 | void memcmpPattern(MemcpyDispatchInfo &info) const; 158 | // Adjust the bandwidth before final reporting 159 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 160 | }; 161 | 162 | class MemcpyInitiatorCE : public MemcpyInitiator { 163 | public: 164 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 165 | // Calculate the truncated sizes used by copy kernels 166 | size_t getAdjustedCopySize(size_t size, CUstream stream); 167 | // Fill buffer with a pattern 168 | void memsetPattern(MemcpyDispatchInfo &info) const; 169 | // Compare buffer with a pattern 170 | void memcmpPattern(MemcpyDispatchInfo &info) const; 171 | // Adjust the bandwidth before final reporting 172 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 173 | }; 174 | 175 | class MemcpyInitiatorMulticastWrite : public MemcpyInitiator { 176 | public: 177 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 178 | // Calculate the truncated sizes used by copy kernels 179 | size_t getAdjustedCopySize(size_t size, CUstream stream); 180 | // Fill buffer with a pattern 181 | void memsetPattern(MemcpyDispatchInfo &info) const;
182 | // Compare buffer with a pattern 183 | void memcmpPattern(MemcpyDispatchInfo &info) const; 184 | // Adjust the bandwidth before final reporting 185 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 186 | }; 187 | 188 | class MemcpyInitiatorSMSplitWarp : public MemcpyInitiatorSM { 189 | public: 190 | size_t memcpyFunc(MemcpyDescriptor &memcpyDescriptor); 191 | // Fill buffer with a pattern 192 | void memsetPattern(MemcpyDispatchInfo &info) const; 193 | // Compare buffer with a pattern 194 | void memcmpPattern(MemcpyDispatchInfo &info) const; 195 | // Adjust the bandwidth before final reporting 196 | unsigned long long getAdjustedBandwidth(unsigned long long bandwidth); 197 | }; 198 | 199 | // Abstraction of a memory Operation. 200 | class MemoryOperation { 201 | public: 202 | MemoryOperation() = default; 203 | ~MemoryOperation() = default; 204 | }; 205 | 206 | // Abstraction of a memcpy operation 207 | class MemcpyOperation : public MemoryOperation { 208 | public: 209 | // Specifies which bandwidths to use for the final result of simultaneous copies 210 | enum BandwidthValue { 211 | USE_FIRST_BW, // Use the bandwidth of the first copy in the simultaneous copy list 212 | SUM_BW, // Use the sum of all bandwidths from the simultaneous copy list 213 | TOTAL_BW, // Use the total bandwidth of all copies, based on total time and total bytes copied 214 | VECTOR_BW, // Return bandwidths of each copy separately 215 | }; 216 | 217 | ContextPreference ctxPreference; 218 | 219 | private: 220 | unsigned long long loopCount; 221 | 222 | protected: 223 | size_t *procMask; 224 | BandwidthValue bandwidthValue; 225 | 226 | std::shared_ptr<NodeHelper> nodeHelper; 227 | std::shared_ptr<MemcpyInitiator> memcpyInitiator; 228 | 229 | public: 230 | MemcpyOperation(unsigned long long loopCount, MemcpyInitiator *_memcpyInitiator, ContextPreference ctxPreference = ContextPreference::PREFER_SRC_CONTEXT, BandwidthValue bandwidthValue = BandwidthValue::USE_FIRST_BW); 231 | MemcpyOperation(unsigned long long loopCount, MemcpyInitiator *_memcpyInitiator, NodeHelper *_nodeHelper, ContextPreference ctxPreference = ContextPreference::PREFER_SRC_CONTEXT, BandwidthValue bandwidthValue = BandwidthValue::USE_FIRST_BW); 232 | virtual ~MemcpyOperation(); 233 | 234 | // Lists of paired nodes will be executed simultaneously 235 | // context of srcBuffers is preferred (if not host) unless otherwise specified 236 | std::vector<double> doMemcpyCore(MemcpyDispatchInfo &info); 237 | std::vector<double> doMemcpyVector(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers); 238 | double doMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers); 239 | double doMemcpy(const MemcpyBuffer &srcBuffer, const MemcpyBuffer &dstBuffer); 240 | }; 241 | 242 | class MemPtrChaseOperation : public MemoryOperation { 243 | public: 244 | MemPtrChaseOperation(unsigned long long loopCount); 245 | ~MemPtrChaseOperation() = default; 246 | double doPtrChase(const int srcId, const MemcpyBuffer &peerBuffer); 247 | private: 248 | unsigned long long loopCount; 249 | unsigned int smCount; 250 | }; 251 | 252 | #endif // MEMCPY_H_ 253 | -------------------------------------------------------------------------------- /multinode_memcpy.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifdef MULTINODE 19 | #include 20 | #include 21 | 22 | #include "kernels.cuh" 23 | #include "multinode_memcpy.h" 24 | 25 | MultinodeMemoryAllocation::MultinodeMemoryAllocation(size_t bufferSize, int MPI_rank): bufferSize(bufferSize), MPI_rank(MPI_rank) { 26 | cudaSetDevice(localDevice); 27 | } 28 | 29 | static CUresult MPIstreamSyncHelper(CUstream stream) { 30 | CUresult err = CUDA_ERROR_NOT_READY; 31 | int flag; 32 | while (err == CUDA_ERROR_NOT_READY) { 33 | err = cuStreamQuery(stream); 34 | MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE); 35 | } 36 | return err; 37 | } 38 | 39 | CUresult MultinodeMemoryAllocation::streamSynchronizeWrapper(CUstream stream) const { 40 | return MPIstreamSyncHelper(stream); 41 | } 42 | 43 | MultinodeMemoryAllocationUnicast::MultinodeMemoryAllocationUnicast(size_t bufferSize, int MPI_rank): MultinodeMemoryAllocation(bufferSize, MPI_rank) { 44 | handleType = CU_MEM_HANDLE_TYPE_FABRIC; 45 | prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 46 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 47 | prop.location.id = localDevice; 48 | prop.requestedHandleTypes = handleType; 49 | 50 | size_t granularity = 0; 51 | CU_ASSERT(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); 52 | 53 | roundedUpAllocationSize = ROUND_UP(bufferSize, granularity); 54 | 55 | if (MPI_rank == worldRank) { 56 | // Allocate the memory 57 | CU_ASSERT(cuMemCreate(&handle, roundedUpAllocationSize, &prop, 0 /*flags*/)); 58 | 59 | // Export the allocation to the importing process 60 | CU_ASSERT(cuMemExportToShareableHandle(&fh, handle, handleType, 0 /*flags*/)); 61 | } 62 | 63 | MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, MPI_rank, MPI_COMM_WORLD); 64 | 65 | if (MPI_rank != worldRank) { 66 | CU_ASSERT(cuMemImportFromShareableHandle(&handle, (void *)&fh, handleType)); 67 | } 68 | 69 | // Map the memory 70 | CU_ASSERT(cuMemAddressReserve((CUdeviceptr *) &buffer, roundedUpAllocationSize, 0, 0 /*baseVA*/, 0 /*flags*/)); 71 | 72 | CU_ASSERT(cuMemMap((CUdeviceptr) buffer, roundedUpAllocationSize, 0 /*offset*/, handle, 0 /*flags*/)); 73 | desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 74 | desc.location.id = localDevice; 75 | desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; 76 | CU_ASSERT(cuMemSetAccess((CUdeviceptr) buffer, roundedUpAllocationSize, &desc, 1 /*count*/)); 77 | 78 | // Make sure that everyone is done with mapping the fabric allocation 79 | MPI_Barrier(MPI_COMM_WORLD); 80 | } 81 | 82 | MultinodeMemoryAllocationUnicast::~MultinodeMemoryAllocationUnicast() { 83 | // Make sure that everyone is done using the memory 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | 86 | CU_ASSERT(cuMemUnmap((CUdeviceptr) buffer, roundedUpAllocationSize)); 87 | CU_ASSERT(cuMemRelease(handle)); 88 | CU_ASSERT(cuMemAddressFree((CUdeviceptr) buffer, roundedUpAllocationSize)); 89 | } 90 | 91 | 
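// --------------------------------------------------------------------------- // Editor's sketch (illustrative only, not part of the original sources): // condensed per-rank control flow of the unicast fabric handshake above, // assuming the same worldRank/owner convention and the driver APIs used there: // // if (worldRank == owner) { // owner materializes the memory // CU_ASSERT(cuMemCreate(&handle, size, &prop, 0)); // CU_ASSERT(cuMemExportToShareableHandle(&fh, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); // } // MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, owner, MPI_COMM_WORLD); // if (worldRank != owner) { // other ranks import it // CU_ASSERT(cuMemImportFromShareableHandle(&handle, (void *)&fh, CU_MEM_HANDLE_TYPE_FABRIC)); // } // cuMemAddressReserve(...); cuMemMap(...); cuMemSetAccess(...); // every rank maps // MPI_Barrier(MPI_COMM_WORLD); // all ranks mapped before first use // ---------------------------------------------------------------------------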
MultinodeMemoryAllocationMulticast::MultinodeMemoryAllocationMulticast(size_t bufferSize, int MPI_rank): MultinodeMemoryAllocation(bufferSize, MPI_rank) { 92 | handleType = CU_MEM_HANDLE_TYPE_FABRIC; 93 | multicastProp.numDevices = worldSize; 94 | multicastProp.handleTypes = handleType; 95 | size_t gran; 96 | CU_ASSERT(cuMulticastGetGranularity(&gran, &multicastProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); 97 | roundedUpAllocationSize = ROUND_UP(bufferSize, gran); 98 | multicastProp.size = roundedUpAllocationSize; 99 | 100 | if (MPI_rank == worldRank) { 101 | // Allocate the memory 102 | CU_ASSERT(cuMulticastCreate(&multicastHandle, &multicastProp)); 103 | 104 | // Export the allocation to the importing process 105 | CU_ASSERT(cuMemExportToShareableHandle(&fh, multicastHandle, handleType, 0 /*flags*/)); 106 | } 107 | 108 | MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, MPI_rank, MPI_COMM_WORLD); 109 | 110 | if (MPI_rank != worldRank) { 111 | CU_ASSERT(cuMemImportFromShareableHandle(&multicastHandle, (void *)&fh, handleType)); 112 | } 113 | 114 | CUdevice dev; 115 | CU_ASSERT(cuDeviceGet(&dev, localDevice)); 116 | CU_ASSERT(cuMulticastAddDevice(multicastHandle, dev)); 117 | 118 | // Ensure all devices in this process are added BEFORE binding mem on any device 119 | MPI_Barrier(MPI_COMM_WORLD); 120 | 121 | // Allocate the memory (same as unicast) and bind to MC handle 122 | CUmemAllocationProp prop = {}; 123 | prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 124 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 125 | prop.location.id = localDevice; 126 | prop.requestedHandleTypes = handleType; 127 | CU_ASSERT(cuMemCreate(&handle, roundedUpAllocationSize, &prop, 0 /*flags*/)); 128 | CU_ASSERT(cuMulticastBindMem(multicastHandle, 0, handle, 0, roundedUpAllocationSize, 0)); 129 | 130 | // Map the memory 131 | CU_ASSERT(cuMemAddressReserve((CUdeviceptr *) &buffer, roundedUpAllocationSize, 0, 0 /*baseVA*/, 0 /*flags*/)); 132 | 133 | CU_ASSERT(cuMemMap((CUdeviceptr) buffer, roundedUpAllocationSize, 0 /*offset*/, multicastHandle, 0 /*flags*/)); 134 | desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 135 | desc.location.id = localDevice; 136 | desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; 137 | CU_ASSERT(cuMemSetAccess((CUdeviceptr) buffer, roundedUpAllocationSize, &desc, 1 /*count*/)); 138 | 139 | // Make sure that everyone is done with mapping the fabric allocation 140 | MPI_Barrier(MPI_COMM_WORLD); 141 | } 142 | 143 | MultinodeMemoryAllocationMulticast::~MultinodeMemoryAllocationMulticast() { 144 | // Make sure that everyone is done using the memory 145 | MPI_Barrier(MPI_COMM_WORLD); 146 | 147 | CUdevice dev; 148 | CU_ASSERT(cuDeviceGet(&dev, localDevice)); 149 | CU_ASSERT(cuMulticastUnbind(multicastHandle, dev, 0, roundedUpAllocationSize)); 150 | CU_ASSERT(cuMemRelease(handle)); 151 | 152 | CU_ASSERT(cuMemUnmap((CUdeviceptr) buffer, roundedUpAllocationSize)); 153 | CU_ASSERT(cuMemRelease(multicastHandle)); 154 | CU_ASSERT(cuMemAddressFree((CUdeviceptr) buffer, roundedUpAllocationSize)); 155 | } 156 | 157 | MultinodeDeviceBuffer::MultinodeDeviceBuffer(size_t bufferSize, int MPI_rank): 158 | MPI_rank(MPI_rank), 159 | MemcpyBuffer(bufferSize) { 160 | } 161 | 162 | int MultinodeDeviceBuffer::getBufferIdx() const { 163 | // only single-GPU supported for now 164 | return 0; 165 | } 166 | 167 | std::string MultinodeDeviceBuffer::getBufferString() const { 168 | return "Multinode node " + std::to_string(MPI_rank); 169 | } 170 | 171 | CUcontext MultinodeDeviceBuffer::getPrimaryCtx() const { 172 | CUcontext 
primaryCtx; 173 | CU_ASSERT(cuDevicePrimaryCtxRetain(&primaryCtx, localDevice)); 174 | return primaryCtx; 175 | } 176 | 177 | int MultinodeDeviceBuffer::getMPIRank() const { 178 | return MPI_rank; 179 | } 180 | 181 | MultinodeDeviceBufferUnicast::MultinodeDeviceBufferUnicast(size_t bufferSize, int MPI_rank): 182 | MultinodeDeviceBuffer(bufferSize, MPI_rank), 183 | MemoryAllocation(bufferSize, MPI_rank) { 184 | buffer = MemoryAllocation.getBuffer(); 185 | } 186 | 187 | MultinodeDeviceBufferMulticast::MultinodeDeviceBufferMulticast(size_t bufferSize, int MPI_rank): 188 | MultinodeDeviceBuffer(bufferSize, MPI_rank), 189 | MemoryAllocation(bufferSize, MPI_rank) { 190 | buffer = MemoryAllocation.getBuffer(); 191 | } 192 | 193 | MultinodeDeviceBufferLocal::MultinodeDeviceBufferLocal(size_t bufferSize, int MPI_rank): 194 | MultinodeDeviceBuffer(bufferSize, MPI_rank) { 195 | buffer = nullptr; 196 | if (worldRank == MPI_rank) { 197 | CU_ASSERT(cuDevicePrimaryCtxRetain(&primaryCtx, localDevice)); 198 | CU_ASSERT(cuCtxSetCurrent(primaryCtx)); 199 | if (bufferSize) { 200 | CU_ASSERT(cuMemAlloc((CUdeviceptr*)&buffer, bufferSize)); 201 | } 202 | } 203 | } 204 | 205 | MultinodeDeviceBufferLocal::~MultinodeDeviceBufferLocal() { 206 | if (buffer) { 207 | CU_ASSERT(cuCtxSetCurrent(primaryCtx)); 208 | CU_ASSERT(cuMemFree((CUdeviceptr)buffer)); 209 | CU_ASSERT(cuDevicePrimaryCtxRelease(localDevice)); 210 | } 211 | } 212 | 213 | NodeHelperMulti::NodeHelperMulti() : blockingVarDeviceAllocation(sizeof(*blockingVarDevice), 0) { 214 | CU_ASSERT(cuMemHostAlloc((void **)&blockingVarHost, sizeof(*blockingVarHost), CU_MEMHOSTALLOC_PORTABLE)); 215 | blockingVarDevice = (volatile int*) blockingVarDeviceAllocation.getBuffer(); 216 | } 217 | 218 | NodeHelperMulti::~NodeHelperMulti() { 219 | CU_ASSERT(cuMemFreeHost((void*)blockingVarHost)); 220 | } 221 | 222 | MemcpyDispatchInfo NodeHelperMulti::dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcNodesUnfiltered, const std::vector<const MemcpyBuffer*> &dstNodesUnfiltered, ContextPreference ctxPreference) { 223 | std::vector<int> ranksUnfiltered(srcNodesUnfiltered.size(), -1); 224 | std::vector<CUcontext> contextsUnfiltered(srcNodesUnfiltered.size()); 225 | std::vector<const MemcpyBuffer*> srcNodes; 226 | std::vector<const MemcpyBuffer*> dstNodes; 227 | std::vector<CUcontext> contexts; 228 | 229 | for (int i = 0; i < srcNodesUnfiltered.size(); i++) { 230 | // prefer source context 231 | // determine which rank executes a given operation 232 | if (ctxPreference == PREFER_SRC_CONTEXT && srcNodesUnfiltered[i]->getPrimaryCtx() != nullptr) { 233 | contextsUnfiltered[i] = srcNodesUnfiltered[i]->getPrimaryCtx(); 234 | ranksUnfiltered[i] = srcNodesUnfiltered[i]->getMPIRank(); 235 | } else if (dstNodesUnfiltered[i]->getPrimaryCtx() != nullptr) { 236 | contextsUnfiltered[i] = dstNodesUnfiltered[i]->getPrimaryCtx(); 237 | ranksUnfiltered[i] = dstNodesUnfiltered[i]->getMPIRank(); 238 | } 239 | } 240 | 241 | for (int i = 0; i < srcNodesUnfiltered.size(); i++) { 242 | if (ranksUnfiltered[i] == worldRank) { 243 | srcNodes.push_back(srcNodesUnfiltered[i]); 244 | dstNodes.push_back(dstNodesUnfiltered[i]); 245 | contexts.push_back(contextsUnfiltered[i]); 246 | } 247 | } 248 | 249 | // Don't crash if there are no memcopies to do 250 | if (ranksUnfiltered.size() > 0) { 251 | rankOfFirstMemcpy = ranksUnfiltered[0]; 252 | } 253 | 254 | return MemcpyDispatchInfo(srcNodes, dstNodes, contexts, ranksUnfiltered); 255 | } 256 | 257 | double NodeHelperMulti::calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount) { 258 | double totalMax = 0; 259 | MPI_Allreduce(&totalTime, &totalMax,
1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 260 | totalTime = totalMax; 261 | 262 | double totalSum = 0; 263 | MPI_Allreduce(&totalSize, &totalSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 264 | totalSize = totalSum; 265 | 266 | return (totalSize * loopCount * 1000ull * 1000ull) / totalTime; 267 | } 268 | 269 | double NodeHelperMulti::calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) { 270 | double sum = 0.0; 271 | for (auto stat : bandwidthStats) { 272 | sum += stat.returnAppropriateMetric() * 1e-9; 273 | } 274 | // Calculate total BW sum across all nodes and memcopies 275 | double totalSum = 0; 276 | MPI_Allreduce(&sum, &totalSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 277 | return totalSum; 278 | } 279 | 280 | double NodeHelperMulti::calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats) { 281 | // Broadcast bandwidth of "first" memcopy to other nodes 282 | double retval = 0; 283 | if (worldRank == rankOfFirstMemcpy) { 284 | retval = bandwidthStats[0].returnAppropriateMetric() * 1e-9; 285 | } 286 | MPI_Bcast(&retval, 1, MPI_DOUBLE, rankOfFirstMemcpy, MPI_COMM_WORLD); 287 | return retval; 288 | } 289 | 290 | std::vector<double> NodeHelperMulti::calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks) { 291 | std::vector<double> retval; 292 | int current_local_elem = 0; 293 | for (int i = 0; i < originalRanks.size(); i++) { 294 | double tmp = 0; 295 | if (worldRank == originalRanks[i]) { 296 | tmp = results[current_local_elem]; 297 | current_local_elem++; 298 | } 299 | MPI_Bcast(&tmp, 1, MPI_DOUBLE, originalRanks[i], MPI_COMM_WORLD); 300 | retval.push_back(tmp); 301 | } 302 | return retval; 303 | } 304 | 305 | void NodeHelperMulti::synchronizeProcess() { 306 | MPI_Barrier(MPI_COMM_WORLD); 307 | } 308 | 309 | CUresult NodeHelperMulti::streamSynchronizeWrapper(CUstream stream) const { 310 | return MPIstreamSyncHelper(stream); 311 | } 312 | 313 | void NodeHelperMulti::streamBlockerReset() { 314 | *blockingVarHost = 0; 315 | CU_ASSERT(cuMemsetD32((CUdeviceptr) blockingVarDevice, 0, 1)); 316 | } 317 | 318 | void NodeHelperMulti::streamBlockerRelease() { 319 | *blockingVarHost = 1; 320 | } 321 | 322 | void NodeHelperMulti::streamBlockerBlock(CUstream stream) { 323 | // The rank owning the first memcpy is released via blockingVarHost; it then writes to blockingVarDevice, releasing the remaining ranks 324 | CU_ASSERT(spinKernelMultistage((worldRank == rankOfFirstMemcpy) ? blockingVarHost : nullptr, blockingVarDevice, stream)); 325 | } 326 | 327 | #endif // MULTINODE 328 | -------------------------------------------------------------------------------- /multinode_memcpy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #ifndef MULTINODE_MEMCPY_H_ 19 | #define MULTINODE_MEMCPY_H_ 20 | #ifdef MULTINODE 21 | 22 | #include 23 | #include 24 | 25 | #include "common.h" 26 | #include "memcpy.h" 27 | 28 | class MultinodeMemoryAllocation { 29 | protected: 30 | void* buffer = nullptr; 31 | size_t bufferSize; 32 | int MPI_rank; 33 | 34 | public: 35 | MultinodeMemoryAllocation(size_t bufferSize, int MPI_rank); 36 | void *getBuffer() { return (void *) buffer; } 37 | CUresult streamSynchronizeWrapper(CUstream stream) const; 38 | }; 39 | 40 | // Class responsible for allocating memory that is shareable on a NVLink system, following RAII principles 41 | // Constructor takes as parameters: 42 | // - bufferSize: size of requested allocation 43 | // - MPI_rank: node on which the allocation physically resides. 44 | // All other nodes will have this allocation mapped and accessible remotely. 45 | class MultinodeMemoryAllocationUnicast : public MultinodeMemoryAllocation { 46 | private: 47 | CUmemGenericAllocationHandle handle = {}; 48 | CUmemFabricHandle fh = {}; 49 | CUmemAllocationHandleType handleType = {}; 50 | CUmemAllocationProp prop = {}; 51 | CUmemAccessDesc desc = {}; 52 | size_t roundedUpAllocationSize; 53 | 54 | public: 55 | MultinodeMemoryAllocationUnicast(size_t bufferSize, int MPI_rank); 56 | ~MultinodeMemoryAllocationUnicast(); 57 | }; 58 | 59 | // Class responsible for allocating multicast object, following RAII principles 60 | // Constructor takes as parameters: 61 | // - bufferSize: size of requested allocation 62 | // - MPI_rank: node driving the allocation process and exporting memory handle. 63 | // All nodes will have this allocation mapped and accessible. 64 | class MultinodeMemoryAllocationMulticast : public MultinodeMemoryAllocation { 65 | private: 66 | CUmemGenericAllocationHandle handle = {}; 67 | CUmemGenericAllocationHandle multicastHandle = {}; 68 | CUmemFabricHandle fh = {}; 69 | CUmemAllocationHandleType handleType = {}; 70 | CUmulticastObjectProp multicastProp = {}; 71 | CUmemAccessDesc desc = {}; 72 | size_t roundedUpAllocationSize; 73 | public: 74 | MultinodeMemoryAllocationMulticast(size_t bufferSize, int MPI_rank); 75 | ~MultinodeMemoryAllocationMulticast(); 76 | }; 77 | 78 | // Class responsible for implementing Multinode MemcpyBuffer 79 | // Each instance has information about which node owns the memory 80 | class MultinodeDeviceBuffer : public MemcpyBuffer { 81 | private: 82 | int MPI_rank; 83 | public: 84 | MultinodeDeviceBuffer(size_t bufferSize, int MPI_rank); 85 | 86 | virtual CUcontext getPrimaryCtx() const override; 87 | virtual int getBufferIdx() const override; 88 | virtual std::string getBufferString() const override; 89 | virtual int getMPIRank() const override; 90 | }; 91 | 92 | // MemcpyBuffer containing memory accessible from a different node in a multi-node NVLink connected system 93 | // MPI_rank node owns the memory allocation, other nodes have it mapped 94 | // Writes/reads to that memory from other nodes happen over NVLink 95 | class MultinodeDeviceBufferUnicast : public MultinodeDeviceBuffer { 96 | private: 97 | MultinodeMemoryAllocationUnicast MemoryAllocation; 98 | public: 99 | MultinodeDeviceBufferUnicast(size_t bufferSize, int MPI_rank); 100 | }; 101 | 102 | // MemcpyBuffer containing memory bound to multicast object 103 | // Each node has its own copy of the memory, and the copies are the same 104 | // Writes to this memory are instantly propagated to other nodes (conforming to P2P writes memory model) 105 | class MultinodeDeviceBufferMulticast : 
public MultinodeDeviceBuffer { 106 | private: 107 | MultinodeMemoryAllocationMulticast MemoryAllocation; 108 | public: 109 | MultinodeDeviceBufferMulticast(size_t bufferSize, int MPI_rank); 110 | }; 111 | 112 | // MemcpyBuffer containing regular device memory 113 | // Only available on one node, exists primarily to simplify writing testcases 114 | class MultinodeDeviceBufferLocal : public MultinodeDeviceBuffer { 115 | private: 116 | CUcontext primaryCtx {}; 117 | public: 118 | MultinodeDeviceBufferLocal(size_t bufferSize, int MPI_rank); 119 | ~MultinodeDeviceBufferLocal(); 120 | }; 121 | 122 | class NodeHelperMulti : public NodeHelper { 123 | private: 124 | int rankOfFirstMemcpy; 125 | 126 | // streamBlocker 127 | volatile int* blockingVarHost; 128 | volatile int* blockingVarDevice; 129 | MultinodeMemoryAllocationUnicast blockingVarDeviceAllocation; 130 | public: 131 | NodeHelperMulti(); 132 | ~NodeHelperMulti(); 133 | MemcpyDispatchInfo dispatchMemcpy(const std::vector<const MemcpyBuffer*> &srcBuffers, const std::vector<const MemcpyBuffer*> &dstBuffers, ContextPreference ctxPreference); 134 | double calculateTotalBandwidth(double totalTime, double totalSize, size_t loopCount); 135 | double calculateSumBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 136 | double calculateFirstBandwidth(std::vector<PerformanceStatistic> &bandwidthStats); 137 | std::vector<double> calculateVectorBandwidth(std::vector<double> &results, std::vector<int> originalRanks); 138 | void synchronizeProcess(); 139 | CUresult streamSynchronizeWrapper(CUstream stream) const; 140 | 141 | // stream blocking functions 142 | void streamBlockerReset(); 143 | void streamBlockerRelease(); 144 | void streamBlockerBlock(CUstream stream); 145 | }; 146 | 147 | #endif // MULTINODE 148 | #endif // MULTINODE_MEMCPY_H_ 149 | -------------------------------------------------------------------------------- /multinode_testcases.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #include 19 | 20 | #include "testcase.h" 21 | #include "memcpy.h" 22 | #include "common.h" 23 | #include "output.h" 24 | #ifdef MULTINODE 25 | #include 26 | #include "multinode_memcpy.h" 27 | 28 | // DtoD Read test - copy from dst to src (backwards) using src context 29 | void MultinodeDeviceToDeviceReadCE::run(unsigned long long size, unsigned long long loopCount) { 30 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 31 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_DST_CONTEXT); 32 | 33 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 34 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 35 | if (peerDeviceId == srcDeviceId) { 36 | continue; 37 | } 38 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 39 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 40 | 41 | // swap src and peer nodes, but use srcNode's (the copy's destination) context 42 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerNode, srcNode); 43 | } 44 | } 45 | 46 | output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)"); 47 | } 48 | 49 | 50 | // DtoD Write test - copy from src to dst using src context 51 | void MultinodeDeviceToDeviceWriteCE::run(unsigned long long size, unsigned long long loopCount) { 52 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 53 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti()); 54 | 55 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 56 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 57 | if (peerDeviceId == srcDeviceId) { 58 | continue; 59 | } 60 | 61 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 62 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 63 | 64 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcNode, peerNode); 65 | } 66 | } 67 | 68 | output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)"); 69 | } 70 | 71 | // DtoD Bidir Read test - copy from dst to src (backwards) using src context 72 | void MultinodeDeviceToDeviceBidirReadCE::run(unsigned long long size, unsigned long long loopCount) { 73 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW); 74 | PeerValueMatrix<double> bandwidthValuesRead1(worldSize, worldSize, key + "_read1"); 75 | PeerValueMatrix<double> bandwidthValuesRead2(worldSize, worldSize, key + "_read2"); 76 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 77 | 78 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 79 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 80 | if (peerDeviceId == srcDeviceId) { 81 | continue; 82 | } 83 | 84 | // Allocate a separate buffer pair for each direction so the two copies run concurrently 85 | MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 86 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 87 | 88 | // swap src and peer nodes, so each copy is a read performed by its destination device 89 | std::vector<const MemcpyBuffer*> srcNodes = {&peer1, &src2}; 90 | std::vector<const MemcpyBuffer*> peerNodes = {&src1, &peer2}; 91 | 92 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 93 | bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0]; 94 |
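// doMemcpyVector returns one bandwidth per submitted copy, in the order the
// buffers were passed: results[0] is the peer1 -> src1 read and results[1] is
// the concurrent src2 -> peer2 read.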
bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1]; 95 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 96 | } 97 | } 98 | 99 | output->addTestcaseResults(bandwidthValuesRead1, "memcpy CE GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)"); 100 | output->addTestcaseResults(bandwidthValuesRead2, "memcpy CE GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)"); 101 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 102 | } 103 | 104 | // DtoD Bidir Write test - copy from src to dst using src context 105 | void MultinodeDeviceToDeviceBidirWriteCE::run(unsigned long long size, unsigned long long loopCount) { 106 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); 107 | PeerValueMatrix<double> bandwidthValuesWrite1(worldSize, worldSize, key + "_write1"); 108 | PeerValueMatrix<double> bandwidthValuesWrite2(worldSize, worldSize, key + "_write2"); 109 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 110 | 111 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 112 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 113 | if (peerDeviceId == srcDeviceId) { 114 | continue; 115 | } 116 | 117 | // Allocate a separate buffer pair for each direction so the two copies run concurrently 118 | MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 119 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 120 | 121 | std::vector<const MemcpyBuffer*> srcNodes = {&src1, &peer2}; 122 | std::vector<const MemcpyBuffer*> peerNodes = {&peer1, &src2}; 123 | 124 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 125 | bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0]; 126 | bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1]; 127 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 128 | } 129 | } 130 | 131 | output->addTestcaseResults(bandwidthValuesWrite1, "memcpy CE GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)"); 132 | output->addTestcaseResults(bandwidthValuesWrite2, "memcpy CE GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)"); 133 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 134 | } 135 | 136 | 137 | // DtoD Read test - copy from dst to src (backwards) using src context 138 | void MultinodeDeviceToDeviceReadSM::run(unsigned long long size, unsigned long long loopCount) { 139 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 140 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_DST_CONTEXT); 141 | 142 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 143 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 144 | if (peerDeviceId == srcDeviceId) { 145 | continue; 146 | } 147 | 148 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 149 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 150 | 151 | // swap src and peer nodes, but use srcNode's (the copy's destination) context 152 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerNode, srcNode); 153 | } 154 | } 155 | 156 | output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)"); 157 | } 158 | 159 | // DtoD Write test - copy from src to dst using src context 160 | void
MultinodeDeviceToDeviceWriteSM::run(unsigned long long size, unsigned long long loopCount) { 161 | PeerValueMatrix<double> bandwidthValues(worldSize, worldSize, key); 162 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti()); 163 | 164 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 165 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 166 | if (peerDeviceId == srcDeviceId) { 167 | continue; 168 | } 169 | 170 | MultinodeDeviceBufferUnicast srcNode(size, srcDeviceId); 171 | MultinodeDeviceBufferUnicast peerNode(size, peerDeviceId); 172 | 173 | bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcNode, peerNode); 174 | } 175 | } 176 | 177 | output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)"); 178 | } 179 | 180 | // DtoD Bidir Read test - copy from dst to src (backwards) using src context 181 | void MultinodeDeviceToDeviceBidirReadSM::run(unsigned long long size, unsigned long long loopCount) { 182 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW); 183 | PeerValueMatrix<double> bandwidthValuesRead1(worldSize, worldSize, key + "_read1"); 184 | PeerValueMatrix<double> bandwidthValuesRead2(worldSize, worldSize, key + "_read2"); 185 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 186 | 187 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 188 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 189 | if (peerDeviceId == srcDeviceId) { 190 | continue; 191 | } 192 | 193 | MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 194 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 195 | 196 | // swap src and peer nodes, so each copy is a read performed by its destination device 197 | std::vector<const MemcpyBuffer*> srcNodes = {&peer1, &src2}; 198 | std::vector<const MemcpyBuffer*> peerNodes = {&src1, &peer2}; 199 | 200 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 201 | bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0]; 202 | bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1]; 203 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 204 | } 205 | } 206 | 207 | output->addTestcaseResults(bandwidthValuesRead1, "memcpy SM GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)"); 208 | output->addTestcaseResults(bandwidthValuesRead2, "memcpy SM GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)"); 209 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 210 | } 211 | 212 | // DtoD Bidir Write test - copy from src to dst using src context 213 | void MultinodeDeviceToDeviceBidirWriteSM::run(unsigned long long size, unsigned long long loopCount) { 214 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); 215 | PeerValueMatrix<double> bandwidthValuesWrite1(worldSize, worldSize, key + "_write1"); 216 | PeerValueMatrix<double> bandwidthValuesWrite2(worldSize, worldSize, key + "_write2"); 217 | PeerValueMatrix<double> bandwidthValuesTotal(worldSize, worldSize, key + "_total"); 218 | 219 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 220 | for (int peerDeviceId = 0; peerDeviceId < worldSize; peerDeviceId++) { 221 | if (peerDeviceId == srcDeviceId) { 222 | continue; 223 | } 224 | 225 |
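// Two buffer pairs per GPU pair: copy 1 writes src1 -> peer1 while copy 2
// writes peer2 -> src2, so write traffic flows in both directions at once.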
MultinodeDeviceBufferUnicast src1(size, srcDeviceId), src2(size, srcDeviceId); 226 | MultinodeDeviceBufferUnicast peer1(size, peerDeviceId), peer2(size, peerDeviceId); 227 | 228 | std::vector<const MemcpyBuffer*> srcNodes = {&src1, &peer2}; 229 | std::vector<const MemcpyBuffer*> peerNodes = {&peer1, &src2}; 230 | 231 | auto results = memcpyInstance.doMemcpyVector(srcNodes, peerNodes); 232 | bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0]; 233 | bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1]; 234 | bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1]; 235 | } 236 | } 237 | 238 | output->addTestcaseResults(bandwidthValuesWrite1, "memcpy SM GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)"); 239 | output->addTestcaseResults(bandwidthValuesWrite2, "memcpy SM GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)"); 240 | output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)"); 241 | } 242 | 243 | void MultinodeAllToOneWriteSM::run(unsigned long long size, unsigned long long loopCount) { 244 | PeerValueMatrix<double> bandwidthValues(1, worldSize, key); 245 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::SUM_BW); 246 | 247 | for (int dstDeviceId = 0; dstDeviceId < worldSize; dstDeviceId++) { 248 | std::vector<const MemcpyBuffer*> srcNodes; 249 | std::vector<const MemcpyBuffer*> dstNodes; 250 | 251 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 252 | if (dstDeviceId == srcDeviceId) { 253 | continue; 254 | } 255 | 256 | srcNodes.push_back(new MultinodeDeviceBufferLocal(size, srcDeviceId)); 257 | dstNodes.push_back(new MultinodeDeviceBufferUnicast(size, dstDeviceId)); 258 | } 259 | 260 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 261 | 262 | for (auto node : dstNodes) { 263 | delete node; 264 | } 265 | for (auto node : srcNodes) { 266 | delete node; 267 | } 268 | } 269 | 270 | output->addTestcaseResults(bandwidthValues, "memcpy SM All GPUs -> GPU(column) total bandwidth (GB/s)"); 271 | } 272 | 273 | void MultinodeAllFromOneReadSM::run(unsigned long long size, unsigned long long loopCount) { 274 | PeerValueMatrix<double> bandwidthValues(1, worldSize, key); 275 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::SUM_BW); 276 | 277 | for (int srcDeviceId = 0; srcDeviceId < worldSize; srcDeviceId++) { 278 | std::vector<const MemcpyBuffer*> srcNodes; 279 | std::vector<const MemcpyBuffer*> dstNodes; 280 | 281 | for (int dstDeviceId = 0; dstDeviceId < worldSize; dstDeviceId++) { 282 | if (dstDeviceId == srcDeviceId) { 283 | continue; 284 | } 285 | 286 | srcNodes.push_back(new MultinodeDeviceBufferUnicast(size, srcDeviceId)); 287 | dstNodes.push_back(new MultinodeDeviceBufferLocal(size, dstDeviceId)); 288 | } 289 | 290 | bandwidthValues.value(0, srcDeviceId) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 291 | 292 | for (auto node : dstNodes) { 293 | delete node; 294 | } 295 | for (auto node : srcNodes) { 296 | delete node; 297 | } 298 | } 299 | 300 | output->addTestcaseResults(bandwidthValues, "memcpy SM All GPUs <- GPU(column) total bandwidth (GB/s)"); 301 | } 302 | 303 | void MultinodeBroadcastOneToAllSM::run(unsigned long long size, unsigned long long loopCount) { 304 | PeerValueMatrix<double> bandwidthValues(1, worldSize, key); 305 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorMulticastWrite(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::SUM_BW); 306 | 307 | for (int dstDeviceId = 0;
dstDeviceId < worldSize; dstDeviceId++) { 308 | std::vector<const MemcpyBuffer*> srcNodes; 309 | std::vector<const MemcpyBuffer*> dstNodes; 310 | 311 | srcNodes.push_back(new MultinodeDeviceBufferLocal(size, dstDeviceId)); 312 | dstNodes.push_back(new MultinodeDeviceBufferMulticast(size, dstDeviceId)); 313 | 314 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 315 | 316 | for (auto node : dstNodes) { 317 | delete node; 318 | } 319 | for (auto node : srcNodes) { 320 | delete node; 321 | } 322 | } 323 | 324 | output->addTestcaseResults(bandwidthValues, "multicast SM GPU(column) -> All GPUs total bandwidth (GB/s)"); 325 | } 326 | 327 | void MultinodeBroadcastAllToAllSM::run(unsigned long long size, unsigned long long loopCount) { 328 | PeerValueMatrix<double> bandwidthValues(1, 1, key); 329 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorMulticastWrite(), new NodeHelperMulti(), PREFER_DST_CONTEXT, MemcpyOperation::SUM_BW); 330 | std::vector<const MemcpyBuffer*> srcNodes; 331 | std::vector<const MemcpyBuffer*> dstNodes; 332 | 333 | for (int dstDeviceId = 0; dstDeviceId < worldSize; dstDeviceId++) { 334 | srcNodes.push_back(new MultinodeDeviceBufferLocal(size, dstDeviceId)); 335 | dstNodes.push_back(new MultinodeDeviceBufferMulticast(size, dstDeviceId)); 336 | } 337 | 338 | bandwidthValues.value(0, 0) = memcpyInstance.doMemcpy(srcNodes, dstNodes); 339 | 340 | for (auto node : dstNodes) { 341 | delete node; 342 | } 343 | for (auto node : srcNodes) { 344 | delete node; 345 | } 346 | 347 | output->addTestcaseResults(bandwidthValues, "multicast SM All -> All GPUs total bandwidth (GB/s)"); 348 | } 349 | 350 | void MultinodeBisectWriteCE::run(unsigned long long size, unsigned long long loopCount) { 351 | PeerValueMatrix<double> bandwidthValues(worldSize, 1, key); 352 | MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), new NodeHelperMulti(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); 353 | std::vector<std::string> rowLabels; 354 | std::vector<const MemcpyBuffer*> srcNodes, dstNodes; 355 | 356 | for (int i = 0; i < worldSize; i++) { 357 | int peer = (i + worldSize / 2) % worldSize; 358 | srcNodes.push_back(new MultinodeDeviceBufferUnicast(size, i)); 359 | dstNodes.push_back(new MultinodeDeviceBufferUnicast(size, peer)); 360 | 361 | std::stringstream s; 362 | s << getPaddedProcessId(i) << "->" << getPaddedProcessId(peer); 363 | rowLabels.push_back(s.str()); 364 | } 365 | 366 | auto results = memcpyInstance.doMemcpyVector(dstNodes, srcNodes); 367 | 368 | for (int i = 0; i < results.size(); i++) { 369 | bandwidthValues.value(i, 0) = results[i]; 370 | } 371 | bandwidthValues.setRowLabels(rowLabels); 372 | 373 | for (auto node : dstNodes) { 374 | delete node; 375 | } 376 | for (auto node : srcNodes) { 377 | delete node; 378 | } 379 | 380 | output->addTestcaseResults(bandwidthValues, "Bisect benchmarking, simultaneous write CE BW"); 381 | } 382 | 383 | #endif 384 | -------------------------------------------------------------------------------- /nvbandwidth.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #ifdef MULTINODE 25 | #include <mpi.h> 26 | #endif 27 | 28 | #include "json_output.h" 29 | #include "kernels.cuh" 30 | #include "output.h" 31 | #include "testcase.h" 32 | #include "version.h" 33 | #include "inline_common.h" 34 | 35 | namespace opt = boost::program_options; 36 | 37 | int deviceCount; 38 | unsigned int averageLoopCount; 39 | unsigned long long bufferSize; 40 | unsigned long long loopCount; 41 | bool verbose; 42 | bool shouldOutput = true; 43 | bool disableAffinity; 44 | bool skipVerification; 45 | bool useMean; 46 | bool perfFormatter; 47 | 48 | Verbosity VERBOSE(verbose); 49 | Verbosity OUTPUT(shouldOutput); 50 | 51 | #ifdef MULTINODE 52 | // Process rank within one OS 53 | int localRank; 54 | // Device ordinal of the GPU owned by the process 55 | int localDevice; 56 | int worldRank; 57 | int worldSize; 58 | #endif 59 | char localHostname[STRING_LENGTH]; 60 | bool jsonOutput; 61 | Output *output; 62 | 63 | // Define testcases here 64 | std::vector<Testcase*> createTestcases() { 65 | return { 66 | new HostToDeviceCE(), 67 | new DeviceToHostCE(), 68 | new HostToDeviceBidirCE(), 69 | new DeviceToHostBidirCE(), 70 | new DeviceToDeviceReadCE(), 71 | new DeviceToDeviceWriteCE(), 72 | new DeviceToDeviceBidirReadCE(), 73 | new DeviceToDeviceBidirWriteCE(), 74 | new AllToHostCE(), 75 | new AllToHostBidirCE(), 76 | new HostToAllCE(), 77 | new HostToAllBidirCE(), 78 | new AllToOneWriteCE(), 79 | new AllToOneReadCE(), 80 | new OneToAllWriteCE(), 81 | new OneToAllReadCE(), 82 | new HostToDeviceSM(), 83 | new DeviceToHostSM(), 84 | new HostToDeviceBidirSM(), 85 | new DeviceToHostBidirSM(), 86 | new DeviceToDeviceReadSM(), 87 | new DeviceToDeviceWriteSM(), 88 | new DeviceToDeviceBidirReadSM(), 89 | new DeviceToDeviceBidirWriteSM(), 90 | new AllToHostSM(), 91 | new AllToHostBidirSM(), 92 | new HostToAllSM(), 93 | new HostToAllBidirSM(), 94 | new AllToOneWriteSM(), 95 | new AllToOneReadSM(), 96 | new OneToAllWriteSM(), 97 | new OneToAllReadSM(), 98 | new HostDeviceLatencySM(), 99 | new DeviceToDeviceLatencySM(), 100 | new DeviceLocalCopy(), 101 | #ifdef MULTINODE 102 | new MultinodeDeviceToDeviceReadCE(), 103 | new MultinodeDeviceToDeviceWriteCE(), 104 | new MultinodeDeviceToDeviceBidirReadCE(), 105 | new MultinodeDeviceToDeviceBidirWriteCE(), 106 | new MultinodeDeviceToDeviceReadSM(), 107 | new MultinodeDeviceToDeviceWriteSM(), 108 | new MultinodeDeviceToDeviceBidirReadSM(), 109 | new MultinodeDeviceToDeviceBidirWriteSM(), 110 | new MultinodeAllToOneWriteSM(), 111 | new MultinodeAllFromOneReadSM(), 112 | new MultinodeBroadcastOneToAllSM(), 113 | new MultinodeBroadcastAllToAllSM(), 114 | new MultinodeBisectWriteCE(), 115 | #endif 116 | }; 117 | } 118 | 119 | Testcase* findTestcase(std::vector<Testcase*> &testcases, std::string id) { 120 | // Check if testcase ID is index 121 | char* p; 122 | long index = strtol(id.c_str(), &p, 10); 123 | if (*p) { 124 | // Conversion failed so key is ID 125 | auto it = find_if(testcases.begin(), testcases.end(), [&id](Testcase* test) {return
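// compare against the registered test key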
test->testKey() == id;}); 126 | if (it != testcases.end()) { 127 | return testcases.at(std::distance(testcases.begin(), it)); 128 | } else { 129 | throw "Testcase " + id + " not found!"; 130 | } 131 | } else { 132 | // ID is index 133 | if (index < 0 || index >= static_cast<long>(testcases.size())) throw "Testcase index " + id + " out of bounds!"; 134 | return testcases.at(index); 135 | } 136 | } 137 | 138 | std::vector<std::string> expandTestcases(std::vector<Testcase*> &testcases, std::vector<std::string> prefixes) { 139 | std::vector<std::string> testcasesToRun; 140 | for (auto testcase : testcases) { 141 | auto it = find_if(prefixes.begin(), prefixes.end(), [&testcase](std::string prefix) {return testcase->testKey().compare(0, prefix.size(), prefix) == 0;}); 142 | if (it != prefixes.end()) { 143 | testcasesToRun.push_back(testcase->testKey()); 144 | } 145 | } 146 | return testcasesToRun; 147 | } 148 | 149 | void runTestcase(std::vector<Testcase*> &testcases, const std::string &testcaseID) { 150 | Testcase* test{nullptr}; 151 | try { 152 | test = findTestcase(testcases, testcaseID); 153 | } catch (std::string &s) { 154 | output->addTestcase(testcaseID, "ERROR", s); 155 | return; 156 | } 157 | 158 | try { 159 | if (!test->filter()) { 160 | output->addTestcase(test->testKey(), NVB_WAIVED); 161 | return; 162 | } 163 | 164 | output->addTestcase(test->testKey(), NVB_RUNNING); 165 | 166 | // Run the testcase 167 | if (test->testKey() == "host_device_latency_sm" || test->testKey() == "device_to_device_latency_sm") { 168 | // use a fixed-size buffer for latency tests 169 | test->run(2 * _MiB, loopCount); 170 | } else { 171 | test->run(bufferSize * _MiB, loopCount); 172 | } 173 | } catch (std::string &s) { 174 | output->setTestcaseStatusAndAddIfNeeded(test->testKey(), NVB_ERROR_STATUS, s); 175 | } 176 | } 177 | 178 | int main(int argc, char **argv) { 179 | std::vector<Testcase*> testcases = createTestcases(); 180 | std::vector<std::string> testcasesToRun; 181 | std::vector<std::string> testcasePrefixes; 182 | output = new Output(); 183 | 184 | #ifdef _WIN32 185 | // Read the hostname from the environment, falling back to "unknown" 186 | const char* computername = getenv("COMPUTERNAME"); 187 | if (computername && computername[0] != '\0') { 188 | snprintf(localHostname, STRING_LENGTH, "%s", computername); 189 | } else { 190 | snprintf(localHostname, STRING_LENGTH, "%s", "unknown"); 191 | } 192 | #else 193 | ASSERT(0 == gethostname(localHostname, STRING_LENGTH - 1)); 194 | #endif 195 | #ifdef MULTINODE 196 | // Set up MPI 197 | MPI_Init(NULL, NULL); 198 | MPI_Comm_size(MPI_COMM_WORLD, &worldSize); 199 | MPI_Comm_rank(MPI_COMM_WORLD, &worldRank); 200 | 201 | // Avoid excessive output by limiting it to rank 0 202 | shouldOutput = (worldRank == 0); 203 | #endif 204 | 205 | // Args parsing 206 | opt::options_description visible_opts("nvbandwidth CLI"); 207 | visible_opts.add_options() 208 | ("help,h", "Produce help message") 209 | ("bufferSize,b", opt::value<unsigned long long>(&bufferSize)->default_value(defaultBufferSize), "Memcpy buffer size in MiB") 210 | ("list,l", "List available testcases") 211 | ("testcase,t", opt::value<std::vector<std::string>>(&testcasesToRun)->multitoken(), "Testcase(s) to run (by name or index)") 212 | ("testcasePrefixes,p", opt::value<std::vector<std::string>>(&testcasePrefixes)->multitoken(), "Testcase(s) to run (by prefix)") 213 | ("verbose,v", opt::bool_switch(&verbose)->default_value(false), "Verbose output") 214 | ("skipVerification,s", opt::bool_switch(&skipVerification)->default_value(false), "Skips data verification after copy") 215 | ("disableAffinity,d", opt::bool_switch(&disableAffinity)->default_value(false), "Disable automatic CPU
affinity control") 216 | ("testSamples,i", opt::value(&averageLoopCount)->default_value(defaultAverageLoopCount), "Iterations of the benchmark") 217 | ("useMean,m", opt::bool_switch(&useMean)->default_value(false), "Use mean instead of median for results") 218 | ("json,j", opt::bool_switch(&jsonOutput)->default_value(false), "Print output in json format instead of plain text."); 219 | 220 | opt::options_description all_opts(""); 221 | all_opts.add(visible_opts); 222 | all_opts.add_options() 223 | ("loopCount", opt::value(&loopCount)->default_value(defaultLoopCount), "Iterations of memcpy to be performed within a test sample") 224 | ("perfFormatter", opt::bool_switch(&perfFormatter)->default_value(false), "Use perf formatter prefix (&&&& PERF) in output"); 225 | 226 | opt::variables_map vm; 227 | try { 228 | opt::store(opt::parse_command_line(argc, argv, all_opts), vm); 229 | opt::notify(vm); 230 | } catch (...) { 231 | output->addVersionInfo(); 232 | 233 | std::stringstream errmsg; 234 | errmsg << "ERROR: Invalid Arguments " << std::endl; 235 | for (int i = 0; i < argc; i++) { 236 | errmsg << argv[i] << " "; 237 | } 238 | std::vector messageParts; 239 | std::stringstream buf; 240 | buf << visible_opts; 241 | messageParts.emplace_back(errmsg.str()); 242 | messageParts.emplace_back(buf.str()); 243 | output->recordError(messageParts); 244 | return 1; 245 | } 246 | 247 | if (jsonOutput) { 248 | delete output; 249 | output = new JsonOutput(shouldOutput); 250 | } 251 | 252 | output->addVersionInfo(); 253 | 254 | if (vm.count("help")) { 255 | OUTPUT << visible_opts << "\n"; 256 | return 0; 257 | } 258 | 259 | if (vm.count("list")) { 260 | output->listTestcases(testcases); 261 | return 0; 262 | } 263 | 264 | if (testcasePrefixes.size() != 0 && testcasesToRun.size() != 0) { 265 | output->recordError("You cannot specify both testcase and testcasePrefix options at the same time"); 266 | return 1; 267 | } 268 | 269 | 270 | CU_ASSERT(cuInit(0)); 271 | NVML_ASSERT(nvmlInit()); 272 | CU_ASSERT(cuDeviceGetCount(&deviceCount)); 273 | if (bufferSize < defaultBufferSize) { 274 | output->recordWarning("NOTE: You have chosen a buffer size that is smaller than the default buffer size. It is suggested to use the default buffer size (64MB) to achieve maximal peak bandwidth."); 275 | } 276 | 277 | int cudaVersion; 278 | cudaRuntimeGetVersion(&cudaVersion); 279 | 280 | CU_ASSERT(cuDriverGetVersion(&cudaVersion)); 281 | 282 | char driverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; 283 | NVML_ASSERT(nvmlSystemGetDriverVersion(driverVersion, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)); 284 | 285 | output->addCudaAndDriverInfo(cudaVersion, driverVersion); 286 | 287 | output->recordDevices(deviceCount); 288 | 289 | if (testcasePrefixes.size() > 0) { 290 | testcasesToRun = expandTestcases(testcases, testcasePrefixes); 291 | if (testcasesToRun.size() == 0) { 292 | output->recordError("Specified list of testcase prefixes did not match any testcases"); 293 | return 1; 294 | } 295 | } 296 | 297 | // This triggers the loading of all kernels on all devices, even with lazy loading enabled. 298 | // Some tests can create complex dependencies between devices and function loading requires a 299 | // device synchronization, so loading in the middle of a test can deadlock. 
300 | preloadKernels(deviceCount); 301 | 302 | if (testcasesToRun.size() == 0) { 303 | // run all testcases 304 | for (auto testcase : testcases) { 305 | runTestcase(testcases, testcase->testKey()); 306 | } 307 | } else { 308 | for (const auto& testcaseIndex : testcasesToRun) { 309 | runTestcase(testcases, testcaseIndex); 310 | } 311 | } 312 | 313 | output->print(); 314 | 315 | for (auto testcase : testcases) { 316 | delete testcase; 317 | } 318 | 319 | #ifdef MULTINODE 320 | MPI_Finalize(); 321 | #endif 322 | 323 | output->printInfo(); 324 | return 0; 325 | } 326 | -------------------------------------------------------------------------------- /output.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include "inline_common.h" 19 | #include "output.h" 20 | #include "version.h" 21 | 22 | #include 23 | 24 | #ifdef MULTINODE 25 | #include 26 | #include 27 | #include 28 | #endif 29 | 30 | void Output::addVersionInfo() { 31 | OUTPUT << "nvbandwidth Version: " << NVBANDWIDTH_VERSION << std::endl; 32 | OUTPUT << "Built from Git version: " << GIT_VERSION << std::endl << std::endl; 33 | 34 | #ifdef MULTINODE 35 | char MPIVersion[MPI_MAX_LIBRARY_VERSION_STRING]; 36 | int MPIVersionLen; 37 | MPI_Get_library_version(MPIVersion, &MPIVersionLen); 38 | 39 | OUTPUT << "MPI version: " << MPIVersion << std::endl; 40 | #endif 41 | } 42 | 43 | void Output::printInfo() { 44 | OUTPUT << "NOTE: The reported results may not reflect the full capabilities of the platform." << std::endl 45 | << "Performance can vary with software drivers, hardware clocks, and system topology." 
<< std::endl << std::endl; 46 | } 47 | 48 | void Output::addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion) { 49 | OUTPUT << "CUDA Runtime Version: " << cudaVersion << std::endl; 50 | OUTPUT << "CUDA Driver Version: " << cudaVersion << std::endl; 51 | OUTPUT << "Driver Version: " << driverVersion << std::endl << std::endl; 52 | } 53 | 54 | void Output::recordError(const std::string &error) { 55 | std::cerr << error << std::endl; 56 | } 57 | 58 | void Output::recordError(const std::vector<std::string> &errorParts) { 59 | bool first = true; 60 | for (auto &part : errorParts) { 61 | if (first) { 62 | OUTPUT << part << ":\n\n"; 63 | first = false; 64 | } else { 65 | OUTPUT << part << std::endl; 66 | } 67 | } 68 | } 69 | 70 | void Output::listTestcases(const std::vector<Testcase*> &testcases) { 71 | size_t numTestcases = testcases.size(); 72 | OUTPUT << "Index, Name:\n\tDescription\n"; 73 | OUTPUT << "=======================\n"; 74 | for (unsigned int i = 0; i < numTestcases; i++) { 75 | OUTPUT << i << ", " << testcases.at(i)->testKey() << ":\n" << testcases.at(i)->testDesc() << "\n\n"; 76 | } 77 | } 78 | 79 | std::string getDeviceDisplayInfo(int deviceOrdinal) { 80 | std::stringstream sstream; 81 | CUdevice dev; 82 | char name[STRING_LENGTH]; 83 | int busId, deviceId, domainId; 84 | 85 | CU_ASSERT(cuDeviceGet(&dev, deviceOrdinal)); 86 | CU_ASSERT(cuDeviceGetName(name, STRING_LENGTH, dev)); 87 | CU_ASSERT(cuDeviceGetAttribute(&domainId, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev)); 88 | CU_ASSERT(cuDeviceGetAttribute(&busId, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev)); 89 | CU_ASSERT(cuDeviceGetAttribute(&deviceId, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev)); 90 | sstream << name << " (" << 91 | std::hex << std::setw(8) << std::setfill('0') << domainId << ":" << 92 | std::hex << std::setw(2) << std::setfill('0') << busId << ":" << 93 | std::hex << std::setw(2) << std::setfill('0') << deviceId << ")" << 94 | std::dec << std::setfill(' ') << std::setw(0); // reset formatting 95 | 96 | return sstream.str(); 97 | } 98 | 99 | #ifdef MULTINODE 100 | // Exchange and print information about all devices in the MPI world. 101 | // Through this exchange each process learns about the GPUs of the other processes 102 | // and determines its own GPU index. 103 | // Each process is allocated a dedicated GPU, so it is advisable to launch NUM_GPU processes per system, 104 | // with each process autonomously selecting a GPU to utilize. To make this selection, 105 | // processes exchange their hostnames and look for duplicates of their own hostname among processes with a lower worldRank. 106 | // localRank equals the number of processes with the same hostname but a lower worldRank.
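// Worked example (hypothetical hostnames): with ranks 0-3 on "nodeA" and ranks
// 4-7 on "nodeB", rank 6 finds two earlier ranks (4 and 5) sharing its hostname,
// so its localRank is 2 and it binds device (localRank % deviceCount) below.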
107 | static void printGPUsMultinode(int deviceCount) { 108 | // Exchange hostnames 109 | std::vector<char> hostnameExchange(worldSize * STRING_LENGTH); 110 | MPI_Allgather(localHostname, STRING_LENGTH, MPI_BYTE, &hostnameExchange[0], STRING_LENGTH, MPI_BYTE, MPI_COMM_WORLD); 111 | 112 | // Find local rank based on hostnames 113 | localRank = 0; 114 | for (int i = 0; i < worldRank; i++) { 115 | if (strncmp(localHostname, &hostnameExchange[i * STRING_LENGTH], STRING_LENGTH) == 0) { 116 | localRank++; 117 | } 118 | } 119 | 120 | std::vector<int> deviceCountExchange(worldSize); 121 | MPI_Allgather(&deviceCount, 1, MPI_INT, &deviceCountExchange[0], 1, MPI_INT, MPI_COMM_WORLD); 122 | 123 | localDevice = localRank % deviceCount; 124 | 125 | // It's not recommended to run more ranks per node than the GPU count, but we want to make sure we handle it gracefully 126 | std::map<std::string, int> gpuCounts; 127 | for (int i = 0; i < worldSize; i++) { 128 | std::string host(&hostnameExchange[i * STRING_LENGTH]); 129 | gpuCounts[host]++; 130 | if (gpuCounts[host] == deviceCountExchange[i] + 1) { 131 | // Emit the warning exactly once per oversubscribed node 132 | std::stringstream warning; 133 | warning << "Warning: there are more processes than GPUs on " << host << ". Please reduce the number of processes to match the GPU count."; 134 | output->recordWarning(warning.str()); 135 | } 136 | } 137 | 138 | // Exchange device names 139 | std::string localDeviceName = getDeviceDisplayInfo(localDevice); 140 | ASSERT(localDeviceName.size() < STRING_LENGTH); 141 | localDeviceName.resize(STRING_LENGTH); 142 | 143 | std::vector<char> deviceNameExchange(worldSize * STRING_LENGTH, 0); 144 | MPI_Allgather(&localDeviceName[0], STRING_LENGTH, MPI_BYTE, &deviceNameExchange[0], STRING_LENGTH, MPI_BYTE, MPI_COMM_WORLD); 145 | 146 | // Exchange device ids 147 | std::vector<int> localDeviceIdExchange(worldSize, -1); 148 | MPI_Allgather(&localDevice, 1, MPI_INT, &localDeviceIdExchange[0], 1, MPI_INT, MPI_COMM_WORLD); 149 | 150 | // Print gathered info 151 | for (int i = 0; i < worldSize; i++) { 152 | char *deviceName = &deviceNameExchange[i * STRING_LENGTH]; 153 | OUTPUT << "Process " << getPaddedProcessId(i) << " (" << &hostnameExchange[i * STRING_LENGTH] << "): device " << localDeviceIdExchange[i] << ": " << deviceName << std::endl; 154 | } 155 | OUTPUT << std::endl; 156 | } 157 | #endif 158 | 159 | static void printGPUs() { 160 | OUTPUT << localHostname << std::endl; 161 | for (int iDev = 0; iDev < deviceCount; iDev++) { 162 | OUTPUT << "Device " << iDev << ": " << getDeviceDisplayInfo(iDev) << std::endl; 163 | } 164 | OUTPUT << std::endl; 165 | } 166 | 167 | void Output::recordDevices(int deviceCount) { 168 | #ifdef MULTINODE 169 | printGPUsMultinode(deviceCount); 170 | #else 171 | printGPUs(); 172 | #endif 173 | } 174 | 175 | void Output::addTestcase(const std::string &name, const std::string &status, const std::string &msg) { 176 | if (status == NVB_RUNNING) { 177 | OUTPUT << status << " " << name << ".\n"; 178 | } else { 179 | OUTPUT << status << ": " << msg << std::endl; 180 | } 181 | } 182 | 183 | void Output::setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg) { 184 | // For plain text output, the name has always been printed already and therefore isn't needed here 185 | OUTPUT << status << ": " << msg << std::endl; 186 | } 187 | 188 | void Output::addTestcaseResults(const PeerValueMatrix<double> &bandwidthValues, const std::string &description) { 189 | OUTPUT << description << std::endl; 190 | OUTPUT << std::fixed <<
std::setprecision(2) << bandwidthValues << std::endl; 191 | } 192 | 193 | void Output::print() { 194 | // NO-OP 195 | } 196 | 197 | void Output::recordErrorCurrentTest(const std::string &errorLine1, const std::string &errorLine2) { 198 | OUTPUT << errorLine1 << std::endl << errorLine2 << std::endl; 199 | } 200 | 201 | void Output::recordWarning(const std::string &warning) { 202 | OUTPUT << warning << std::endl; 203 | } 204 | 205 | void RecordError(const std::stringstream &errmsg) { 206 | output->recordError(errmsg.str()); 207 | } 208 | -------------------------------------------------------------------------------- /output.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef OUTPUT_H_ 19 | #define OUTPUT_H_ 20 | 21 | #include 22 | #include 23 | 24 | #include "testcase.h" 25 | 26 | extern const std::string NVB_TITLE; 27 | extern const std::string NVB_CUDA_RUNTIME_VERSION; 28 | extern const std::string NVB_DRIVER_VERSION; 29 | extern const std::string NVB_GIT_VERSION; 30 | extern const std::string NVB_ERROR; 31 | extern const std::string NVB_WARNING; 32 | extern const std::string NVB_TESTCASES; 33 | extern const std::string NVB_TESTCASE_NAME; 34 | extern const std::string NVB_STATUS; 35 | extern const std::string NVB_BW_DESCRIPTION; 36 | extern const std::string NVB_BW_MATRIX; 37 | extern const std::string NVB_BW_SUM; 38 | extern const std::string NVB_BUFFER_SIZE; 39 | extern const std::string NVB_TEST_SAMPLES; 40 | extern const std::string NVB_USE_MEAN; 41 | extern const std::string NVB_PASSED; 42 | extern const std::string NVB_RUNNING; 43 | extern const std::string NVB_WAIVED; 44 | extern const std::string NVB_NOT_FOUND; 45 | extern const std::string NVB_ERROR_STATUS; 46 | 47 | class Output { 48 | public: 49 | virtual void addTestcase(const std::string &name, const std::string &status, const std::string &msg = ""); 50 | 51 | /* 52 | * If a test case matching the specified name exists, then update the status. If no testcase with that name exists, 53 | * then add a new one and set the status. 54 | * 55 | * @param name - the name of the test case 56 | * @param status - the status (PASS, FAIL, WAIVED, NOT FOUND) 57 | * @param msg - additional details if specified 58 | */ 59 | virtual void setTestcaseStatusAndAddIfNeeded(const std::string &name, const std::string &status, const std::string &msg = ""); 60 | 61 | virtual void print(); 62 | 63 | /* 64 | * Records a global error 65 | * 66 | * @param errorParts - each entry in this vector is one line of an error. In JSON output, all lines are combined. 
67 | */ 68 | virtual void recordError(const std::vector<std::string> &errorParts); 69 | 70 | /* 71 | * Records a global error 72 | */ 73 | virtual void recordError(const std::string &error); 74 | 75 | /* 76 | * Records a test error 77 | * 78 | * @param errorPart1 - the first part of the error. For plain text output, this is printed on line 1. 79 | * @param errorPart2 - the second part of the error. For plain text output, this is printed on line 2. 80 | * NOTE: in JSON output, these are combined on a single line 81 | */ 82 | virtual void recordErrorCurrentTest(const std::string &errorPart1, const std::string &errorPart2); 83 | 84 | virtual void recordWarning(const std::string &warning); 85 | 86 | virtual void addCudaAndDriverInfo(int cudaVersion, const std::string &driverVersion); 87 | 88 | virtual void addTestcaseResults(const PeerValueMatrix<double> &matrix, const std::string &description); 89 | 90 | virtual void addVersionInfo(); 91 | 92 | virtual void printInfo(); 93 | 94 | virtual void recordDevices(int deviceCount); 95 | 96 | void listTestcases(const std::vector<Testcase*> &testcases); 97 | }; 98 | 99 | extern Output *output; 100 | 101 | std::string getDeviceDisplayInfo(int deviceOrdinal); 102 | 103 | #endif // OUTPUT_H_ 104 | -------------------------------------------------------------------------------- /testcase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | #include "common.h" 19 | #include "output.h" 20 | #include "testcase.h" 21 | #include "inline_common.h" 22 | 23 | Testcase::Testcase(std::string key, std::string desc) : 24 | key(std::move(key)), desc(std::move(desc)) 25 | {} 26 | 27 | std::string Testcase::testKey() { return key; } 28 | std::string Testcase::testDesc() { return desc; } 29 | 30 | bool Testcase::filterHasAccessiblePeerPairs() { 31 | int deviceCount = 0; 32 | CU_ASSERT(cuDeviceGetCount(&deviceCount)); 33 | 34 | for (int currentDevice = 0; currentDevice < deviceCount; currentDevice++) { 35 | for (int peer = 0; peer < deviceCount; peer++) { 36 | int canAccessPeer = 0; 37 | 38 | if (peer == currentDevice) { 39 | continue; 40 | } 41 | 42 | CU_ASSERT(cuDeviceCanAccessPeer(&canAccessPeer, currentDevice, peer)); 43 | if (canAccessPeer) { 44 | return true; 45 | } 46 | } 47 | } 48 | 49 | return false; 50 | } 51 | 52 | bool Testcase::filterSupportsMulticast() { 53 | int deviceCount = 0; 54 | CU_ASSERT(cuDeviceGetCount(&deviceCount)); 55 | 56 | for (int currentDevice = 0; currentDevice < deviceCount; currentDevice++) { 57 | CUdevice dev; 58 | CU_ASSERT(cuDeviceGet(&dev, currentDevice)); 59 | int supportsMulticast = 0; 60 | 61 | CU_ASSERT(cuDeviceGetAttribute(&supportsMulticast, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); 62 | if (!supportsMulticast) { 63 | return false; 64 | } 65 | } 66 | 67 | return true; 68 | } 69 | 70 | #ifdef MULTINODE 71 | // Each MPI rank handles one GPU, so we simply have to check if we have more than 1 process 72 | bool Testcase::filterHasMultipleGPUsMultinode() { 73 | return worldSize > 1; 74 | } 75 | #endif 76 | 77 | void Testcase::latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency) { 78 | uint64_t n_ptrs = dataBuffer.getBufferSize() / sizeof(struct LatencyNode); 79 | 80 | if (measureDeviceToDeviceLatency) { 81 | // For device-to-device latency, create and initialize pattern on device 82 | for (uint64_t i = 0; i < n_ptrs; i++) { 83 | struct LatencyNode node; 84 | size_t nextOffset = ((i + strideLen) % n_ptrs) * sizeof(struct LatencyNode); 85 | // Set up pattern with device addresses 86 | node.next = (struct LatencyNode*)(dataBuffer.getBuffer() + nextOffset); 87 | CU_ASSERT(cuMemcpyHtoD(dataBuffer.getBuffer() + i*sizeof(struct LatencyNode), 88 | &node, sizeof(struct LatencyNode))); 89 | } 90 | } else { 91 | // For host-device latency, initialize pattern with host addresses 92 | struct LatencyNode* hostMem = (struct LatencyNode*)dataBuffer.getBuffer(); 93 | for (uint64_t i = 0; i < n_ptrs; i++) { 94 | hostMem[i].next = &hostMem[(i + strideLen) % n_ptrs]; 95 | } 96 | } 97 | } 98 | 99 | void Testcase::allToOneHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix &bandwidthValues, bool isRead) { 100 | std::vector allSrcBuffers; 101 | 102 | // allocate all src nodes up front, re-use to avoid reallocation 103 | for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 104 | allSrcBuffers.push_back(new DeviceBuffer(size, deviceId)); 105 | } 106 | 107 | for (int dstDeviceId = 0; dstDeviceId < deviceCount; dstDeviceId++) { 108 | std::vector dstBuffers; 109 | std::vector srcBuffers; 110 | 111 | for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) { 112 | if (srcDeviceId == dstDeviceId) { 113 | continue; 114 | } 115 | 116 | DeviceBuffer* dstBuffer = new DeviceBuffer(size, dstDeviceId); 117 | 118 | if (!dstBuffer->enablePeerAcess(*allSrcBuffers[srcDeviceId])) { 119 | delete dstBuffer; 120 | continue; 121 | } 122 | 123 | 
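// Peer access is available for this pair, so reuse the preallocated source
// buffer; the per-pair dstBuffer allocated above is freed after the measurement.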
srcBuffers.push_back(allSrcBuffers[srcDeviceId]); 124 | dstBuffers.push_back(dstBuffer); 125 | } 126 | // If no peer GPUs, skip measurements. 127 | if (!srcBuffers.empty()) { 128 | if (isRead) { 129 | // swap dst and src for read tests 130 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(dstBuffers, srcBuffers); 131 | } else { 132 | bandwidthValues.value(0, dstDeviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers); 133 | } 134 | } 135 | 136 | for (auto node : dstBuffers) { 137 | delete node; 138 | } 139 | } 140 | 141 | for (auto node : allSrcBuffers) { 142 | delete node; 143 | } 144 | } 145 | 146 | void Testcase::oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead) { 147 | std::vector<const MemcpyBuffer*> allDstBuffers; 148 | 149 | // allocate all dst nodes up front, re-use to avoid reallocation 150 | for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 151 | allDstBuffers.push_back(new DeviceBuffer(size, deviceId)); 152 | } 153 | 154 | for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) { 155 | std::vector<const MemcpyBuffer*> dstBuffers; 156 | std::vector<const MemcpyBuffer*> srcBuffers; 157 | 158 | for (int dstDeviceId = 0; dstDeviceId < deviceCount; dstDeviceId++) { 159 | if (srcDeviceId == dstDeviceId) { 160 | continue; 161 | } 162 | 163 | DeviceBuffer* srcBuffer = new DeviceBuffer(size, srcDeviceId); 164 | 165 | if (!srcBuffer->enablePeerAcess(*allDstBuffers[dstDeviceId])) { 166 | delete srcBuffer; 167 | continue; 168 | } 169 | 170 | srcBuffers.push_back(srcBuffer); 171 | dstBuffers.push_back(allDstBuffers[dstDeviceId]); 172 | } 173 | // If no peer GPUs, skip measurements. 174 | if (!srcBuffers.empty()) { 175 | if (isRead) { 176 | // swap dst and src for read tests 177 | bandwidthValues.value(0, srcDeviceId) = memcpyInstance.doMemcpy(dstBuffers, srcBuffers); 178 | } else { 179 | bandwidthValues.value(0, srcDeviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers); 180 | } 181 | } 182 | 183 | for (auto node : srcBuffers) { 184 | delete node; 185 | } 186 | } 187 | 188 | for (auto node : allDstBuffers) { 189 | delete node; 190 | } 191 | } 192 | 193 | void Testcase::allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) { 194 | for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 195 | std::vector<const MemcpyBuffer*> deviceBuffers; 196 | std::vector<const MemcpyBuffer*> hostBuffers; 197 | 198 | deviceBuffers.push_back(new DeviceBuffer(size, deviceId)); 199 | hostBuffers.push_back(new HostBuffer(size, deviceId)); 200 | 201 | for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) { 202 | if (interferenceDeviceId == deviceId) { 203 | continue; 204 | } 205 | 206 | // Double the size of the interference copy to ensure it interferes correctly 207 | deviceBuffers.push_back(new DeviceBuffer(size * 2, interferenceDeviceId)); 208 | hostBuffers.push_back(new HostBuffer(size * 2, interferenceDeviceId)); 209 | } 210 | 211 | if (sourceIsHost) { 212 | bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffers, deviceBuffers); 213 | } else { 214 | bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffers, hostBuffers); 215 | } 216 | 217 | for (auto node : deviceBuffers) { 218 | delete node; 219 | } 220 | 221 | for (auto node : hostBuffers) { 222 | delete node; 223 | } 224 | } 225 | } 226 | 227 | void Testcase::allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) { 228
| for (int deviceId = 0; deviceId < deviceCount; deviceId++) { 229 | std::vector srcBuffers; 230 | std::vector dstBuffers; 231 | 232 | if (sourceIsHost) { 233 | srcBuffers.push_back(new HostBuffer(size, deviceId)); 234 | dstBuffers.push_back(new DeviceBuffer(size, deviceId)); 235 | 236 | // Double the size of the interference copy to ensure it interferes correctly 237 | srcBuffers.push_back(new DeviceBuffer(size * 2, deviceId)); 238 | dstBuffers.push_back(new HostBuffer(size * 2, deviceId)); 239 | } else { 240 | srcBuffers.push_back(new DeviceBuffer(size, deviceId)); 241 | dstBuffers.push_back(new HostBuffer(size, deviceId)); 242 | 243 | // Double the size of the interference copy to ensure it interferes correctly 244 | srcBuffers.push_back(new HostBuffer(size * 2, deviceId)); 245 | dstBuffers.push_back(new DeviceBuffer(size * 2, deviceId)); 246 | } 247 | 248 | for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) { 249 | if (interferenceDeviceId == deviceId) { 250 | continue; 251 | } 252 | 253 | // Double the size of the interference copy to ensure it interferes correctly 254 | srcBuffers.push_back(new DeviceBuffer(size * 2, interferenceDeviceId)); 255 | dstBuffers.push_back(new HostBuffer(size * 2, interferenceDeviceId)); 256 | 257 | srcBuffers.push_back(new HostBuffer(size * 2, interferenceDeviceId)); 258 | dstBuffers.push_back(new DeviceBuffer(size * 2, interferenceDeviceId)); 259 | } 260 | 261 | bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers); 262 | 263 | for (auto node : srcBuffers) { 264 | delete node; 265 | } 266 | 267 | for (auto node : dstBuffers) { 268 | delete node; 269 | } 270 | } 271 | } 272 | 273 | -------------------------------------------------------------------------------- /testcases_ce.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/testcases_ce.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #include <vector>
19 | 
20 | #include "common.h"
21 | #include "output.h"
22 | #include "testcase.h"
23 | #include "memcpy.h"
24 | 
25 | void HostToDeviceCE::run(unsigned long long size, unsigned long long loopCount) {
26 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
27 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
28 | 
29 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
30 |         HostBuffer hostBuffer(size, deviceId);
31 |         DeviceBuffer deviceBuffer(size, deviceId);
32 | 
33 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
34 |     }
35 | 
36 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)");
37 | }
38 | 
39 | void DeviceToHostCE::run(unsigned long long size, unsigned long long loopCount) {
40 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
41 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
42 | 
43 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
44 |         HostBuffer hostBuffer(size, deviceId);
45 |         DeviceBuffer deviceBuffer(size, deviceId);
46 | 
47 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffer, hostBuffer);
48 |     }
49 | 
50 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)");
51 | }
52 | 
53 | void HostToDeviceBidirCE::run(unsigned long long size, unsigned long long loopCount) {
54 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
55 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
56 | 
57 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
58 |         // Double the size of the interference copy to ensure it interferes correctly
59 |         HostBuffer host1(size, deviceId), host2(size * 2, deviceId);
60 |         DeviceBuffer dev1(size, deviceId), dev2(size * 2, deviceId);
61 | 
62 |         std::vector<const MemcpyBuffer*> srcBuffers = {&host1, &dev2};
63 |         std::vector<const MemcpyBuffer*> dstBuffers = {&dev1, &host2};
64 | 
65 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
66 |     }
67 | 
68 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
69 | }
70 | 
71 | void DeviceToHostBidirCE::run(unsigned long long size, unsigned long long loopCount) {
72 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
73 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
74 | 
75 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
76 |         // Double the size of the interference copy to ensure it interferes correctly
77 |         HostBuffer host1(size, deviceId), host2(size * 2, deviceId);
78 |         DeviceBuffer dev1(size, deviceId), dev2(size * 2, deviceId);
79 | 
80 |         std::vector<const MemcpyBuffer*> srcBuffers = {&dev1, &host2};
81 |         std::vector<const MemcpyBuffer*> dstBuffers = {&host1, &dev2};
82 | 
83 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
84 |     }
85 | 
86 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
87 | }
88 | 
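
In the bidirectional tests above, the reverse-direction copy exists only to keep the link busy while the forward copy is timed; it is sized at 2x so it cannot drain early and leave the tail of the timed copy running uncontended. A standalone sketch of that pattern for a single GPU, using the CUDA runtime API with a single untimed-warmup-free iteration and no error checking (all simplifications relative to nvbandwidth itself):

    #include <cstdio>

    #include <cuda_runtime.h>

    int main() {
        const size_t size = 64ull << 20;   // timed H->D copy
        void *h1, *h2, *d1, *d2;
        cudaMallocHost(&h1, size);
        cudaMallocHost(&h2, size * 2);     // reverse-direction buffers, doubled
        cudaMalloc(&d1, size);
        cudaMalloc(&d2, size * 2);

        cudaStream_t fwd, rev;
        cudaStreamCreate(&fwd);
        cudaStreamCreate(&rev);
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Launch the D->H interference copy first so the opposite direction
        // is already busy for the whole timed transfer.
        cudaMemcpyAsync(h2, d2, size * 2, cudaMemcpyDeviceToHost, rev);

        cudaEventRecord(start, fwd);
        cudaMemcpyAsync(d1, h1, size, cudaMemcpyHostToDevice, fwd);
        cudaEventRecord(stop, fwd);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        std::printf("H->D with D->H traffic: %.2f GB/s\n", (size / 1e9) / (ms / 1e3));

        cudaDeviceSynchronize();
        return 0;
    }
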
89 | // DtoD Read test - copy from dst to src (backwards) using src context
90 | void DeviceToDeviceReadCE::run(unsigned long long size, unsigned long long loopCount) {
91 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
92 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT);
93 | 
94 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
95 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
96 |             if (peerDeviceId == srcDeviceId) {
97 |                 continue;
98 |             }
99 | 
100 |             DeviceBuffer srcBuffer(size, srcDeviceId);
101 |             DeviceBuffer peerBuffer(size, peerDeviceId);
102 | 
103 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
104 |                 continue;
105 |             }
106 | 
107 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
108 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerBuffer, srcBuffer);
109 |         }
110 |     }
111 | 
112 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)");
113 | }
114 | 
115 | // DtoD Write test - copy from src to dst using src context
116 | void DeviceToDeviceWriteCE::run(unsigned long long size, unsigned long long loopCount) {
117 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
118 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
119 | 
120 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
121 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
122 |             if (peerDeviceId == srcDeviceId) {
123 |                 continue;
124 |             }
125 | 
126 |             DeviceBuffer srcBuffer(size, srcDeviceId);
127 |             DeviceBuffer peerBuffer(size, peerDeviceId);
128 | 
129 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
130 |                 continue;
131 |             }
132 | 
133 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcBuffer, peerBuffer);
134 |         }
135 |     }
136 | 
137 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)");
138 | }
139 | 
140 | // DtoD Bidir Read test - copy from dst to src (backwards) using src context
141 | void DeviceToDeviceBidirReadCE::run(unsigned long long size, unsigned long long loopCount) {
142 |     PeerValueMatrix<double> bandwidthValuesRead1(deviceCount, deviceCount, key + "_read1");
143 |     PeerValueMatrix<double> bandwidthValuesRead2(deviceCount, deviceCount, key + "_read2");
144 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
145 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW);
146 | 
147 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
148 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
149 |             if (peerDeviceId == srcDeviceId) {
150 |                 continue;
151 |             }
152 | 
153 |             // Allocate a second buffer on each device for the reverse-direction copy
154 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
155 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
156 | 
157 |             if (!src1.enablePeerAcess(peer1)) {
158 |                 continue;
159 |             }
160 | 
161 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
162 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
163 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
164 | 
165 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
166 |             bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0];
167 |             bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1];
168 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
169 |         }
170 |     }
171 | 
172 |     output->addTestcaseResults(bandwidthValuesRead1, "memcpy CE GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)");
173 |     output->addTestcaseResults(bandwidthValuesRead2, "memcpy CE GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)");
174 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
175 | }
176 | 
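
The read/write distinction in these tests is about which device issues the copy, not which way the bytes flow: a write is pushed by the data's source device, while a read swaps the buffers and is pulled by the data's destination (PREFER_DST_CONTEXT above). A sketch of the same idea with the CUDA runtime API, where d2dCopy is an illustrative name rather than anything in this project:

    #include <cuda_runtime.h>

    // pull == true mirrors the read tests: the copy's destination device
    // issues the transfer. pull == false mirrors the write tests: the
    // data's source device pushes it. The bytes move src -> dst either
    // way; only the issuing context changes.
    void d2dCopy(void* dst, int dstDev, const void* src, int srcDev,
                 size_t bytes, bool pull) {
        cudaSetDevice(pull ? dstDev : srcDev);   // pick the issuing device
        cudaMemcpyPeer(dst, dstDev, src, srcDev, bytes);
    }

On asymmetric links the two variants can report noticeably different bandwidths, which is why both matrices are produced.
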
177 | // DtoD Bidir Write test - copy from src to dst using src context
178 | void DeviceToDeviceBidirWriteCE::run(unsigned long long size, unsigned long long loopCount) {
179 |     PeerValueMatrix<double> bandwidthValuesWrite1(deviceCount, deviceCount, key + "_write1");
180 |     PeerValueMatrix<double> bandwidthValuesWrite2(deviceCount, deviceCount, key + "_write2");
181 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
182 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
183 | 
184 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
185 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
186 |             if (peerDeviceId == srcDeviceId) {
187 |                 continue;
188 |             }
189 | 
190 |             // Allocate a second buffer on each device for the reverse-direction copy
191 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
192 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
193 | 
194 |             if (!src1.enablePeerAcess(peer1)) {
195 |                 continue;
196 |             }
197 | 
198 |             // swap src and peer nodes; with PREFER_SRC_CONTEXT each copy is issued from its source buffer's context
199 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
200 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
201 | 
202 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
203 |             bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0];
204 |             bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1];
205 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
206 |         }
207 |     }
208 | 
209 |     output->addTestcaseResults(bandwidthValuesWrite1, "memcpy CE GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)");
210 |     output->addTestcaseResults(bandwidthValuesWrite2, "memcpy CE GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)");
211 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy CE GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
212 | }
213 | 
214 | void DeviceLocalCopy::run(unsigned long long size, unsigned long long loopCount) {
215 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
216 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
217 | 
218 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
219 |         DeviceBuffer deviceBuffer1(size, deviceId);
220 |         DeviceBuffer deviceBuffer2(size, deviceId);
221 | 
222 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffer2, deviceBuffer1);
223 |     }
224 | 
225 |     output->addTestcaseResults(bandwidthValues, "memcpy local GPU(column) bandwidth (GB/s)");
226 | }
227 | 
228 | void AllToHostCE::run(unsigned long long size, unsigned long long loopCount) {
229 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
230 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
231 | 
232 |     allHostHelper(size, memcpyInstance, bandwidthValues, false);
233 | 
234 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)");
235 | }
236 | 
237 | void AllToHostBidirCE::run(unsigned long long size, unsigned long long loopCount) {
238 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
239 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
240 | 
241 |     allHostBidirHelper(size, memcpyInstance, bandwidthValues, false);
242 | 
243 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
244 | }
245 | 
246 | void HostToAllCE::run(unsigned long long size, unsigned long long loopCount) {
247 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
248 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
249 | 
250 |     allHostHelper(size, memcpyInstance, bandwidthValues, true);
251 | 
252 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)");
253 | }
254 | 
255 | void HostToAllBidirCE::run(unsigned long long size, unsigned long long loopCount) {
256 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
257 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE());
258 | 
259 |     allHostBidirHelper(size, memcpyInstance, bandwidthValues, true);
260 | 
261 |     output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)");
262 | }
263 | 
264 | // Write test - copy from src to dst using src context
265 | void AllToOneWriteCE::run(unsigned long long size, unsigned long long loopCount) {
266 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
267 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
268 |     allToOneHelper(size, memcpyInstance, bandwidthValues, false);
269 | 
270 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)");
271 | }
272 | 
273 | // Read test - copy from dst to src (backwards) using src context
274 | void AllToOneReadCE::run(unsigned long long size, unsigned long long loopCount) {
275 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
276 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT, MemcpyOperation::TOTAL_BW);
277 |     allToOneHelper(size, memcpyInstance, bandwidthValues, true);
278 | 
279 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)");
280 | }
281 | 
282 | // Write test - copy from src to dst using src context
283 | void OneToAllWriteCE::run(unsigned long long size, unsigned long long loopCount) {
284 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
285 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
286 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, false);
287 | 
288 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) -> GPU(column) bandwidth (GB/s)");
289 | }
290 | 
291 | // Read test - copy from dst to src (backwards) using src context
292 | void OneToAllReadCE::run(unsigned long long size, unsigned long long loopCount) {
293 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
294 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_DST_CONTEXT, MemcpyOperation::TOTAL_BW);
295 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, true);
296 | 
297 |     output->addTestcaseResults(bandwidthValues, "memcpy CE GPU(row) <- GPU(column) bandwidth (GB/s)");
298 | }
299 | 
--------------------------------------------------------------------------------
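
testcases_ce.cpp above drives the GPU's dedicated copy engines (DMA hardware); testcases_sm.cpp below moves the same data with copy kernels, so its numbers reflect what the streaming multiprocessors themselves can push over a link. The project's real kernels live in kernels.cu; the sketch below is a minimal grid-stride copy kernel in the same spirit, with illustrative grid dimensions.

    #include <cuda_runtime.h>

    // Grid-stride copy over 16-byte elements for fully coalesced traffic;
    // assumes 'bytes' is a multiple of sizeof(int4).
    __global__ void copyKernel(int4* __restrict__ dst, const int4* __restrict__ src, size_t n) {
        for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < n;
             i += (size_t)gridDim.x * blockDim.x) {
            dst[i] = src[i];
        }
    }

    // Issue an SM-driven copy on 'stream'.
    void smCopy(int4* dst, const int4* src, size_t bytes, cudaStream_t stream) {
        copyKernel<<<256, 512, 0, stream>>>(dst, src, bytes / sizeof(int4));
    }
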
/testcases_sm.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #include "testcase.h"
19 | #include "kernels.cuh"
20 | #include "memcpy.h"
21 | #include "common.h"
22 | #include "output.h"
23 | 
24 | void HostDeviceLatencySM::run(unsigned long long size, unsigned long long loopCount) {
25 |     PeerValueMatrix<double> latencyValues(1, deviceCount, key, perfFormatter, LATENCY);
26 |     MemPtrChaseOperation ptrChaseOp(latencyMemAccessCnt);
27 | 
28 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
29 |         HostBuffer dataBuffer(size, deviceId);
30 |         latencyHelper(dataBuffer, false);
31 |         latencyValues.value(0, deviceId) = ptrChaseOp.doPtrChase(deviceId, dataBuffer);
32 |     }
33 | 
34 |     output->addTestcaseResults(latencyValues, "memory latency SM CPU(row) <-> GPU(column) (ns)");
35 | }
36 | 
37 | void HostToDeviceSM::run(unsigned long long size, unsigned long long loopCount) {
38 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
39 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM());
40 | 
41 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
42 |         HostBuffer hostBuffer(size, deviceId);
43 |         DeviceBuffer deviceBuffer(size, deviceId);
44 | 
45 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
46 |     }
47 | 
48 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)");
49 | }
50 | 
51 | void DeviceToHostSM::run(unsigned long long size, unsigned long long loopCount) {
52 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
53 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM());
54 | 
55 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
56 |         HostBuffer hostBuffer(size, deviceId);
57 |         DeviceBuffer deviceBuffer(size, deviceId);
58 | 
59 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(deviceBuffer, hostBuffer);
60 |     }
61 | 
62 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)");
63 | }
64 | 
65 | void HostToDeviceBidirSM::run(unsigned long long size, unsigned long long loopCount) {
66 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
67 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp());
68 | 
69 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
70 |         HostBuffer hostBuffer(size, deviceId);
71 |         DeviceBuffer deviceBuffer(size, deviceId);
72 | 
73 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
74 |     }
75 | 
76 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
77 | }
78 | 
79 | void DeviceToHostBidirSM::run(unsigned long long size, unsigned long long loopCount) {
80 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
81 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp());
82 | 
83 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
84 |         HostBuffer hostBuffer(size, deviceId);
85 |         DeviceBuffer deviceBuffer(size, deviceId);
86 | 
87 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(hostBuffer, deviceBuffer);
88 |     }
89 | 
90 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
91 | }
92 | 
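
HostDeviceLatencySM above (and DeviceToDeviceLatencySM below) measure latency by pointer chasing: a single thread walks a chain in which each load's address depends on the result of the previous load, so no memory-level parallelism can hide the round trip, and elapsed time divided by the access count approximates per-access latency. The project's actual implementation lives in kernels.cu; a minimal sketch of such a kernel:

    #include <cuda_runtime.h>

    // 'next' is a permutation chain: next[i] gives the index to load after i.
    // A single thread issues 'steps' fully serialized, dependent loads.
    __global__ void ptrChase(const unsigned int* next, unsigned int steps, unsigned int* sink) {
        unsigned int idx = 0;
        for (unsigned int i = 0; i < steps; i++) {
            idx = next[idx];   // next address depends on this load's result
        }
        *sink = idx;           // keep the compiler from removing the chain
    }

Launched as ptrChase<<<1, 1>>>(chain, steps, sink), the kernel's elapsed time divided by steps approximates the per-access latency reported in the "(ns)" tables.
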
93 | // DtoD Read test - copy from dst to src (backwards) using src context
94 | void DeviceToDeviceReadSM::run(unsigned long long size, unsigned long long loopCount) {
95 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
96 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_DST_CONTEXT);
97 | 
98 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
99 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
100 |             if (peerDeviceId == srcDeviceId) {
101 |                 continue;
102 |             }
103 | 
104 |             DeviceBuffer srcBuffer(size, srcDeviceId);
105 |             DeviceBuffer peerBuffer(size, peerDeviceId);
106 | 
107 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
108 |                 continue;
109 |             }
110 | 
111 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
112 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(peerBuffer, srcBuffer);
113 |         }
114 |     }
115 | 
116 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)");
117 | }
118 | 
119 | void DeviceToDeviceLatencySM::run(unsigned long long size, unsigned long long loopCount) {
120 |     PeerValueMatrix<double> latencyValues(deviceCount, deviceCount, key, perfFormatter, LATENCY);
121 |     MemPtrChaseOperation ptrChaseOp(latencyMemAccessCnt);
122 | 
123 |     for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
124 |         DeviceBuffer peerBuffer(size, peerDeviceId);
125 |         latencyHelper(peerBuffer, true);
126 | 
127 |         for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
128 |             if (peerDeviceId == srcDeviceId) {
129 |                 continue;
130 |             }
131 | 
132 |             // Note: srcBuffer is not used in the pointer chase operation.
133 |             // It is simply used here to enable peer access.
134 |             DeviceBuffer srcBuffer(size, srcDeviceId);
135 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
136 |                 continue;
137 |             }
138 |             latencyValues.value(srcDeviceId, peerDeviceId) = ptrChaseOp.doPtrChase(srcDeviceId, peerBuffer);
139 |         }
140 |     }
141 |     output->addTestcaseResults(latencyValues, "Device to Device Latency SM GPU(row) <-> GPU(column) (ns)");
142 | }
143 | 
144 | // DtoD Write test - copy from src to dst using src context
145 | void DeviceToDeviceWriteSM::run(unsigned long long size, unsigned long long loopCount) {
146 |     PeerValueMatrix<double> bandwidthValues(deviceCount, deviceCount, key);
147 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM());
148 | 
149 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
150 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
151 |             if (peerDeviceId == srcDeviceId) {
152 |                 continue;
153 |             }
154 | 
155 |             DeviceBuffer srcBuffer(size, srcDeviceId);
156 |             DeviceBuffer peerBuffer(size, peerDeviceId);
157 | 
158 |             if (!srcBuffer.enablePeerAcess(peerBuffer)) {
159 |                 continue;
160 |             }
161 | 
162 |             bandwidthValues.value(srcDeviceId, peerDeviceId) = memcpyInstance.doMemcpy(srcBuffer, peerBuffer);
163 |         }
164 |     }
165 | 
166 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)");
167 | }
168 | 
169 | // DtoD Bidir Read test - copy to dst from src (backwards) using dst context
170 | void DeviceToDeviceBidirReadSM::run(unsigned long long size, unsigned long long loopCount) {
171 |     PeerValueMatrix<double> bandwidthValuesRead1(deviceCount, deviceCount, key + "_read1");
172 |     PeerValueMatrix<double> bandwidthValuesRead2(deviceCount, deviceCount, key + "_read2");
173 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
174 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_DST_CONTEXT, MemcpyOperation::VECTOR_BW);
175 | 
176 | 
177 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
178 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
179 |             if (peerDeviceId == srcDeviceId) {
180 |                 continue;
181 |             }
182 | 
183 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
184 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
185 | 
186 |             if (!src1.enablePeerAcess(peer1)) {
187 |                 continue;
188 |             }
189 | 
190 |             // swap src and peer nodes, but use srcBuffer's (the copy's destination) context
191 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
192 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
193 | 
194 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
195 |             bandwidthValuesRead1.value(srcDeviceId, peerDeviceId) = results[0];
196 |             bandwidthValuesRead2.value(srcDeviceId, peerDeviceId) = results[1];
197 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
198 |         }
199 |     }
200 | 
201 |     output->addTestcaseResults(bandwidthValuesRead1, "memcpy SM GPU(row) <-> GPU(column) Read1 bandwidth (GB/s)");
202 |     output->addTestcaseResults(bandwidthValuesRead2, "memcpy SM GPU(row) <-> GPU(column) Read2 bandwidth (GB/s)");
203 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
204 | }
205 | 
206 | // DtoD Bidir Write test - copy from src to dst using src context
207 | void DeviceToDeviceBidirWriteSM::run(unsigned long long size, unsigned long long loopCount) {
208 |     PeerValueMatrix<double> bandwidthValuesWrite1(deviceCount, deviceCount, key + "_write1");
209 |     PeerValueMatrix<double> bandwidthValuesWrite2(deviceCount, deviceCount, key + "_write2");
210 |     PeerValueMatrix<double> bandwidthValuesTotal(deviceCount, deviceCount, key + "_total");
211 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
212 | 
213 | 
214 |     for (int srcDeviceId = 0; srcDeviceId < deviceCount; srcDeviceId++) {
215 |         for (int peerDeviceId = 0; peerDeviceId < deviceCount; peerDeviceId++) {
216 |             if (peerDeviceId == srcDeviceId) {
217 |                 continue;
218 |             }
219 | 
220 |             DeviceBuffer src1(size, srcDeviceId), src2(size, srcDeviceId);
221 |             DeviceBuffer peer1(size, peerDeviceId), peer2(size, peerDeviceId);
222 | 
223 |             if (!src1.enablePeerAcess(peer1)) {
224 |                 continue;
225 |             }
226 | 
227 |             // swap src and peer nodes; with PREFER_SRC_CONTEXT each copy is issued from its source buffer's context
228 |             std::vector<const MemcpyBuffer*> srcBuffers = {&peer1, &src2};
229 |             std::vector<const MemcpyBuffer*> peerBuffers = {&src1, &peer2};
230 | 
231 |             auto results = memcpyInstance.doMemcpyVector(srcBuffers, peerBuffers);
232 |             bandwidthValuesWrite1.value(srcDeviceId, peerDeviceId) = results[0];
233 |             bandwidthValuesWrite2.value(srcDeviceId, peerDeviceId) = results[1];
234 |             bandwidthValuesTotal.value(srcDeviceId, peerDeviceId) = results[0] + results[1];
235 |         }
236 |     }
237 | 
238 |     output->addTestcaseResults(bandwidthValuesWrite1, "memcpy SM GPU(row) <-> GPU(column) Write1 bandwidth (GB/s)");
239 |     output->addTestcaseResults(bandwidthValuesWrite2, "memcpy SM GPU(row) <-> GPU(column) Write2 bandwidth (GB/s)");
240 |     output->addTestcaseResults(bandwidthValuesTotal, "memcpy SM GPU(row) <-> GPU(column) Total bandwidth (GB/s)");
241 | }
242 | 
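
The bidirectional tests above run with VECTOR_BW and report each direction plus the sum; the all-to-host and host-to-all tests that follow use USE_FIRST_BW, so only the measured device's copy counts and the rest is background load; the all-to-one and one-to-all tests use TOTAL_BW. A sketch of the three reductions over per-copy results, where the free functions are illustrative rather than this project's API:

    #include <numeric>
    #include <vector>

    // bw[i] is the bandwidth measured for copy i, in GB/s.
    double totalBW(const std::vector<double>& bw) {      // TOTAL_BW: sum of all copies
        return std::accumulate(bw.begin(), bw.end(), 0.0);
    }

    double firstBW(const std::vector<double>& bw) {      // USE_FIRST_BW: only the
        return bw.front();                               // measured device's copy
    }

    // VECTOR_BW performs no reduction: each copy's bandwidth is reported
    // separately, filling the Read1/Read2 and Write1/Write2 matrices above.
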
243 | void AllToHostSM::run(unsigned long long size, unsigned long long loopCount) {
244 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
245 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
246 | 
247 |     allHostHelper(size, memcpyInstance, bandwidthValues, false);
248 | 
249 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)");
250 | }
251 | 
252 | void AllToHostBidirSM::run(unsigned long long size, unsigned long long loopCount) {
253 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
254 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
255 | 
256 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
257 |         std::vector<const MemcpyBuffer*> srcBuffers;
258 |         std::vector<const MemcpyBuffer*> dstBuffers;
259 | 
260 |         srcBuffers.push_back(new DeviceBuffer(size, deviceId));
261 |         dstBuffers.push_back(new HostBuffer(size, deviceId));
262 | 
263 |         for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) {
264 |             if (interferenceDeviceId == deviceId) {
265 |                 continue;
266 |             }
267 | 
268 |             srcBuffers.push_back(new DeviceBuffer(size, interferenceDeviceId));
269 |             dstBuffers.push_back(new HostBuffer(size, interferenceDeviceId));
270 |         }
271 | 
272 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
273 | 
274 |         for (auto node : srcBuffers) {
275 |             delete node;
276 |         }
277 | 
278 |         for (auto node : dstBuffers) {
279 |             delete node;
280 |         }
281 |     }
282 | 
283 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
284 | }
285 | 
286 | void HostToAllSM::run(unsigned long long size, unsigned long long loopCount) {
287 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
288 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
289 | 
290 |     allHostHelper(size, memcpyInstance, bandwidthValues, true);
291 | 
292 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)");
293 | }
294 | 
295 | void HostToAllBidirSM::run(unsigned long long size, unsigned long long loopCount) {
296 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
297 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSMSplitWarp(), PREFER_SRC_CONTEXT, MemcpyOperation::USE_FIRST_BW);
298 | 
299 |     for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
300 |         std::vector<const MemcpyBuffer*> srcBuffers;
301 |         std::vector<const MemcpyBuffer*> dstBuffers;
302 | 
303 |         srcBuffers.push_back(new HostBuffer(size, deviceId));
304 |         dstBuffers.push_back(new DeviceBuffer(size, deviceId));
305 | 
306 |         for (int interferenceDeviceId = 0; interferenceDeviceId < deviceCount; interferenceDeviceId++) {
307 |             if (interferenceDeviceId == deviceId) {
308 |                 continue;
309 |             }
310 | 
311 |             srcBuffers.push_back(new DeviceBuffer(size, interferenceDeviceId));
312 |             dstBuffers.push_back(new HostBuffer(size, interferenceDeviceId));
313 |         }
314 | 
315 |         bandwidthValues.value(0, deviceId) = memcpyInstance.doMemcpy(srcBuffers, dstBuffers);
316 | 
317 |         for (auto node : srcBuffers) {
318 |             delete node;
319 |         }
320 | 
321 |         for (auto node : dstBuffers) {
322 |             delete node;
323 |         }
324 |     }
325 | 
326 |     output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)");
327 | }
328 | 
329 | // Write test - copy from src to dst using src context
330 | void AllToOneWriteSM::run(unsigned long long size, unsigned long long loopCount) {
331 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
332 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
333 | 
334 |     allToOneHelper(size, memcpyInstance, bandwidthValues, false);
335 | 
336 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)");
337 | }
338 | 
339 | // Read test - copy from dst to src (backwards) using src context
340 | void AllToOneReadSM::run(unsigned long long size, unsigned long long loopCount) {
341 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
342 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
343 |     allToOneHelper(size, memcpyInstance, bandwidthValues, true);
344 | 
345 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)");
346 | }
347 | 
348 | // Write test - copy from src to dst using src context
349 | void OneToAllWriteSM::run(unsigned long long size, unsigned long long loopCount) {
350 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
351 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
352 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, false);
353 | 
354 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) -> GPU(column) bandwidth (GB/s)");
355 | }
356 | 
357 | // Read test - copy from dst to src (backwards) using src context
358 | void OneToAllReadSM::run(unsigned long long size, unsigned long long loopCount) {
359 |     PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
360 |     MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::TOTAL_BW);
361 |     oneToAllHelper(size, memcpyInstance, bandwidthValues, true);
362 | 
363 |     output->addTestcaseResults(bandwidthValues, "memcpy SM GPU(row) <- GPU(column) bandwidth (GB/s)");
364 | }
365 | 
--------------------------------------------------------------------------------
/version.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #ifndef VERSION_H_
19 | #define VERSION_H_
20 | 
21 | #define NVBANDWIDTH_VERSION "v0.8"
22 | #ifndef GIT_VERSION
23 | #define GIT_VERSION "unknown"
24 | #endif
25 | 
26 | #endif // VERSION_H_
27 | 
--------------------------------------------------------------------------------
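
Every "(GB/s)" figure in the tables produced by these testcases is bytes moved divided by measured time, using decimal gigabytes. A sketch of the arithmetic, assuming a copy of size bytes repeated loopCount times completed in ms milliseconds:

    // GB/s with decimal gigabytes (1e9 bytes), matching the "(GB/s)" labels.
    double bandwidthGBps(unsigned long long size, unsigned long long loopCount, double ms) {
        return static_cast<double>(size) * loopCount / (ms / 1e3) / 1e9;
    }
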