├── CODE_OF_CONDUCT.md ├── CONTRIBUTE.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── msccl_samples ├── README.md ├── npkit_launcher.sh ├── npkit_runner.sh └── npkit_trace_generator.py ├── mscclpp_samples └── README.md ├── nccl_samples ├── README.md ├── npkit-for-nccl-2.17.1-1.diff ├── npkit_launcher.sh ├── npkit_runner.sh └── npkit_trace_generator.py ├── npkit_result_example.png └── rccl_samples ├── README.md ├── npkit_launcher.sh ├── npkit_runner.sh └── npkit_trace_generator.py /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. Microsoft makes certain 5 | open source code available at https://3rdpartysource.microsoft.com, or you may 6 | send a check or money order for US $5.00, including the product name, the open 7 | source component name, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the 16 | extent required to debug changes to any libraries licensed under the GNU Lesser 17 | General Public License. 18 | 19 | NCCL 20 | 21 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 22 | 23 | Redistribution and use in source and binary forms, with or without 24 | modification, are permitted provided that the following conditions 25 | are met: 26 | * Redistributions of source code must retain the above copyright 27 | notice, this list of conditions and the following disclaimer. 28 | * Redistributions in binary form must reproduce the above copyright 29 | notice, this list of conditions and the following disclaimer in the 30 | documentation and/or other materials provided with the distribution. 31 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 32 | Laboratory, the U.S. Department of Energy, nor the names of their 33 | contributors may be used to endorse or promote products derived 34 | from this software without specific prior written permission. 35 | 36 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 37 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 38 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 39 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 40 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 41 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 42 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 43 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 44 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 45 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 46 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 47 | 48 | The U.S. Department of Energy funded the development of this software 49 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 50 | 51 | 52 | This code also includes files from the NVIDIA Tools Extension SDK project. 53 | 54 | See: 55 | 56 | https://github.com/NVIDIA/NVTX 57 | 58 | for more information and license details. 59 | 60 | RCCL 61 | 62 | Attributions 63 | 64 | Contains contributions from NVIDIA. 65 | 66 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 67 | copyright = 'Copyright 2019-2021 Advanced Micro Devices'. 68 | 2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2021 Advanced Micro Devices. 
69 | 70 | Redistribution and use in source and binary forms, with or without 71 | modification, are permitted provided that the following conditions 72 | are met: 73 | * Redistributions of source code must retain the above copyright 74 | notice, this list of conditions and the following disclaimer. 75 | * Redistributions in binary form must reproduce the above copyright 76 | notice, this list of conditions and the following disclaimer in the 77 | documentation and/or other materials provided with the distribution. 78 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 79 | Laboratory, the U.S. Department of Energy, nor the names of their 80 | contributors may be used to endorse or promote products derived 81 | from this software without specific prior written permission. 82 | 83 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 84 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 85 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 86 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 87 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 88 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 89 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 90 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 91 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 92 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 93 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 94 | 95 | The U.S. Department of Energy funded the development of this software 96 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 97 | 98 | 99 | This code also includes files from the NVIDIA Tools Extension SDK project. 100 | 101 | See: 102 | 103 | https://github.com/NVIDIA/NVTX 104 | 105 | for more information and license details. 106 | 107 | Notices and Licenses file 108 | _______________________________________________________________ 109 | 110 | Dependencies on nvidia-nccl v2.3.7-1 (BSD3) 111 | Copyright (c) 2015-2018, NVIDIA CORPORATION. 112 | Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. 113 | 114 | Redistribution and use in source and binary forms, with or without 115 | modification, are permitted provided that the following conditions 116 | are met: 117 | * Redistributions of source code must retain the above copyright 118 | notice, this list of conditions and the following disclaimer. 119 | * Redistributions in binary form must reproduce the above copyright 120 | notice, this list of conditions and the following disclaimer in the 121 | documentation and/or other materials provided with the distribution. 122 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 123 | Laboratory, the U.S. Department of Energy, nor the names of their 124 | contributors may be used to endorse or promote products derived 125 | from this software without specific prior written permission. 126 | 127 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 128 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 129 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 130 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 131 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 132 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 133 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 134 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 135 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 136 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 137 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 138 | 139 | The U.S. Department of Energy funded the development of this software 140 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 141 | 142 | 143 | nvidia-nccl v2.3.7-1 (BSD2) 144 | Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 145 | 146 | Redistribution and use in source and binary forms, with or without 147 | modification, are permitted provided that the following conditions 148 | are met: 149 | * Redistributions of source code must retain the above copyright 150 | notice, this list of conditions and the following disclaimer. 151 | * Redistributions in binary form must reproduce the above copyright 152 | notice, this list of conditions and the following disclaimer in the 153 | documentation and/or other materials provided with the distribution. 154 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 155 | Laboratory, the U.S. Department of Energy, nor the names of their 156 | contributors may be used to endorse or promote products derived 157 | from this software without specific prior written permission. 158 | 159 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 160 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 161 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 162 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 163 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 164 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 165 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 166 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 167 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 168 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 169 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 170 | 171 | The U.S. Department of Energy funded the development of this software 172 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 173 | 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | NPKit (Networking Profiling Kit) is a profiling framework designed for popular collective communication libraries (CCLs), including [Microsoft MSCCL](https://github.com/Azure/msccl/), [Microsoft MSCCL++](https://github.com/microsoft/mscclpp/), [NVIDIA NCCL](https://github.com/NVIDIA/nccl) and [AMD RCCL](https://github.com/ROCmSoftwarePlatform/rccl/). It enables users to insert customized profiling events into different CCL components, especially into giant GPU kernels. 
These events are then automatically placed onto a unified timeline in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview), which users can load into a trace viewer to understand CCLs' workflow and performance. 4 | 5 | NPKit is easy to use. It works with any workload that uses CCLs. Users only need to dynamically link their workload binary against a CCL built with NPKit enabled, and the unified timeline with profiling events is generated automatically. 6 | 7 | NPKit is lightweight. During each run, users can choose to enable only the profiling events they care about to minimize the overhead caused by NPKit. 8 | 9 | Below is an example of an NPKit timeline result. Green blocks are LL128 data transfer times on the GPU, and each line represents an independent data flow (typically mapped to a channel or thread block). Red/purple blocks are net send/recv times on the CPU. Each block contains other attributes, including data size, channel ID, etc. 10 | 11 | ![NPKit Result Example](./npkit_result_example.png) 12 | 13 | ## Quick Start 14 | 15 | Please check `msccl_samples` for the MSCCL quick start, `mscclpp_samples` for the MSCCL++ quick start, `nccl_samples` for the NCCL quick start and `rccl_samples` for the RCCL quick start. 16 | 17 | ## Trademarks 18 | 19 | This project may contain trademarks or logos for projects, products, or services. 20 | Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 21 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 22 | Any use of third-party trademarks or logos is subject to those third parties' policies. 23 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message.
Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses [GitHub Issues] to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new issue. 8 | 9 | For help and questions about using this project, please create a new post in [GitHub Discussions]. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | 15 | [GitHub Issues]: https://github.com/microsoft/npkit/issues 16 | [GitHub Discussions]: https://github.com/microsoft/npkit/discussions 17 | -------------------------------------------------------------------------------- /msccl_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This folder contains scripts for the NPKit sample workflow for [MSCCL](https://github.com/Azure/msccl). The sample workflow first builds MSCCL with NPKit enabled, then runs msccl-tests-nccl to collect NPKit event dump files, and finally generates the NPKit trace file. 4 | 5 | ## Dependencies 6 | 7 | [MSCCL executor](https://github.com/Azure/msccl-executor-nccl) (with NPKit integrated) and [MSCCL tests](https://github.com/Azure/msccl-tests-nccl). 8 | 9 | ## Usage 10 | 11 | 1) Make sure the parameters in `npkit_launcher.sh` are valid. Also note that currently NPKit only supports collecting non-overlapped events on the GPU, and `NPKIT_FLAGS` should follow this rule. 12 | 13 | 2) Make sure the `msccl_test` function in `npkit_runner.sh` issues a valid command to run the `msccl-tests-nccl` binary. Also note that currently NPKit only supports 1 GPU per process, so `-g 1` mode is required in `msccl-tests-nccl` commands. 14 | 15 | 3) Run the command `bash npkit_launcher.sh`. After the run finishes, the dump, trace and result directories appear under the `NPKIT_RUN_DIR` set in `npkit_launcher.sh`, as sketched below.
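The following is a minimal sketch of how to inspect the outputs of step 3. The directory names come from `npkit_runner.sh` and the dump file patterns from `npkit_trace_generator.py`; the `NPKIT_RUN_DIR` value and the exact files listed are examples only, so adjust them to your run.

```
# npkit_runner.sh creates three subdirectories under NPKIT_RUN_DIR, keyed by a per-run tag:
#   npkit_dump/   - raw per-rank dumps (gpu_events_rank_*_buf_*, cpu_events_rank_*_channel_*, clock files)
#   npkit_trace/  - the generated npkit_event_trace.json
#   npkit_result/ - the msccl-tests-nccl log.txt and npkit_result.tar.gz
$ export NPKIT_RUN_DIR="/mnt/npkit_run"   # same value as in npkit_launcher.sh
$ find "${NPKIT_RUN_DIR}/npkit_dump" -type f | head
$ find "${NPKIT_RUN_DIR}/npkit_result" -name npkit_result.tar.gz
```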
16 | 17 | 4) The generated trace file `npkit_event_trace.json` (zipped in `npkit_result.tar.gz`) is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers. 18 | -------------------------------------------------------------------------------- /msccl_samples/npkit_launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # MSCCL source directory. 7 | export MSCCL_SRC_DIR="/mnt/msccl-executor-nccl" 8 | 9 | # NPKit source directory. 10 | export NPKIT_SRC_DIR="/mnt/npkit" 11 | 12 | # Path to msccl-tests-nccl binary being profiled. 13 | export NCCL_TEST_BIN="/mnt/msccl-tests-nccl/build/all_reduce_perf" 14 | # export NCCL_TEST_BIN="/mnt/msccl-tests-nccl/build/alltoall_perf" 15 | 16 | # NPKit runtime directory, used to store logs and results. 17 | export NPKIT_RUN_DIR="/mnt/npkit_run" 18 | 19 | # Message size of MSCCL operation. 20 | export MSCCL_MSG_SIZE="16K" 21 | 22 | # MSCCL communication algorithm. 23 | export MSCCL_ALGO="Ring" 24 | # export MSCCL_ALGO="Tree,MSCCL" 25 | 26 | # MSCCL communication protocol. Simple and LL are supported. 27 | # export MSCCL_PROTO="Simple" 28 | export MSCCL_PROTO="LL" 29 | # export MSCCL_PROTO="LL128" 30 | 31 | # Number of msccl-tests-nccl warmups. 32 | export MSCCL_NUM_WARMUPS="0" 33 | 34 | # Number of msccl-tests-nccl iterations. 35 | export MSCCL_NUM_ITERS="10" 36 | 37 | NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" 38 | NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" 39 | 40 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" 41 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" 42 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" 43 | 44 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" 45 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" 46 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" 47 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" 48 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" 49 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" 50 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" 51 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" 52 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" 53 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 
-DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" 54 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" 55 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" 56 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" 57 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" 58 | 59 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" 60 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" 61 | 62 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" 63 | export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" 64 | 65 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" 66 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" 67 | 68 | # export NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" 69 | 70 | bash npkit_runner.sh 71 | -------------------------------------------------------------------------------- /msccl_samples/npkit_runner.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # Function that runs msccl-tests-nccl and collect NPKit traces. 7 | # msccl_test 8 | # 9 | # 10 | function msccl_test() { 11 | mpirun --allow-run-as-root \ 12 | -map-by ppr:8:node --bind-to numa \ 13 | -x LD_PRELOAD=$2/build/lib/libnccl.so:$LD_PRELOAD \ 14 | -x NCCL_DEBUG=WARN \ 15 | -x NCCL_ALGO=$4 \ 16 | -x NCCL_PROTO=$5 \ 17 | -x NPKIT_DUMP_DIR=$8 \ 18 | $1 -b $3 -e $3 -f 2 -g 1 -c 1 -w $6 -n $7 | tee $9/log.txt 19 | } 20 | 21 | # Tag of this NPKit run. 22 | npkit_run_tag=`basename ${NCCL_TEST_BIN}`"/${msg_size}/${MSCCL_ALGO}/${MSCCL_PROTO}" 23 | 24 | # Path to NPKit dump directory. 25 | npkit_dump_dir="${NPKIT_RUN_DIR}/npkit_dump/${npkit_run_tag}" 26 | 27 | # Path to NPKit post-process directory. 28 | npkit_trace_dir="${NPKIT_RUN_DIR}/npkit_trace/${npkit_run_tag}" 29 | 30 | # Path to NPKit result directory. 31 | npkit_result_dir="${NPKIT_RUN_DIR}/npkit_result/${npkit_run_tag}" 32 | 33 | # Build MSCCL with NPKit 34 | cd ${MSCCL_SRC_DIR} 35 | make clean 36 | make -j src.build NPKIT_FLAGS="${NPKIT_FLAGS}" 37 | 38 | # Clean existing results 39 | rm -rf ${NPKIT_RUN_DIR} 40 | mkdir -p ${npkit_dump_dir} 41 | mkdir -p ${npkit_trace_dir} 42 | mkdir -p ${npkit_result_dir} 43 | 44 | # Run NPKit on all nodes. 
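# Positional arguments of msccl_test, in the order passed on the next line:
#   $1 = nccl-tests binary (NCCL_TEST_BIN)      $2 = MSCCL source dir (provides build/lib/libnccl.so for LD_PRELOAD)
#   $3 = message size (-b/-e)                   $4 = NCCL_ALGO                  $5 = NCCL_PROTO
#   $6 = number of warmups (-w)                 $7 = number of iterations (-n)
#   $8 = NPKIT_DUMP_DIR for raw event dumps     $9 = result dir for log.txt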
45 | msccl_test ${NCCL_TEST_BIN} ${MSCCL_SRC_DIR} ${MSCCL_MSG_SIZE} ${MSCCL_ALGO} ${MSCCL_PROTO} ${MSCCL_NUM_WARMUPS} ${MSCCL_NUM_ITERS} ${npkit_dump_dir} ${npkit_result_dir} 46 | 47 | # Generate trace file 48 | cd ${NPKIT_SRC_DIR}/msccl_samples 49 | python3 npkit_trace_generator.py --npkit_dump_dir=${npkit_dump_dir} --npkit_event_header_path=${MSCCL_SRC_DIR}/src/include/npkit/npkit_event.h --output_dir=${npkit_trace_dir} 50 | cd ${npkit_trace_dir} 51 | tar cvzf npkit_result.tar.gz npkit_event_trace.json 52 | mv npkit_result.tar.gz ${npkit_result_dir} 53 | -------------------------------------------------------------------------------- /msccl_samples/npkit_trace_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | import json 7 | 8 | from queue import Queue 9 | 10 | def parse_npkit_event_header(npkit_event_header_path): 11 | npkit_event_def = {'id_to_type': {}, 'type_to_id': {}} 12 | with open(npkit_event_header_path, 'r') as f: 13 | lines = [x.strip() for x in f.readlines() if len(x.strip()) != 0] 14 | line_idx = 0 15 | while line_idx < len(lines): 16 | if lines[line_idx].startswith('#define NPKIT_EVENT_'): 17 | fields = lines[line_idx].split() 18 | if len(fields) == 3: 19 | event_type = fields[1] 20 | event_id = int(fields[2], 0) 21 | npkit_event_def['type_to_id'][event_type] = event_id 22 | npkit_event_def['id_to_type'][event_id] = event_type 23 | line_idx += 1 24 | return npkit_event_def 25 | 26 | def parse_gpu_clock_scale(gpu_clock_file_path): 27 | with open(gpu_clock_file_path, 'r') as f: 28 | freq_in_khz = f.read() 29 | return float(freq_in_khz) * 1e3 / 1e6 30 | 31 | def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): 32 | with open(cpu_clock_num_file_path, 'r') as f: 33 | num = float(f.read()) 34 | with open(cpu_clock_den_file_path, 'r') as f: 35 | den = float(f.read()) 36 | return den / num / 1e6 37 | 38 | def parse_gpu_event(event_bytes): 39 | return { 40 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 41 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 42 | 'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 43 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 44 | } 45 | 46 | def parse_cpu_event(event_bytes): 47 | return { 48 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 49 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 50 | 'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 51 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 52 | } 53 | 54 | def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): 55 | gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) 56 | raw_event_size = 16 57 | curr_cpu_base_time = None 58 | curr_gpu_base_time = None 59 | gpu_events = [] 60 | event_type_to_seq = {} 61 | with open(gpu_event_file_path, 'rb') as f: 62 | raw_content = f.read() 63 | raw_content_size = len(raw_content) 64 | raw_content_idx = 0 65 | while raw_content_idx < raw_content_size: 66 | parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 67 | if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 
'NPKIT_EVENT_TIME_SYNC_CPU': 68 | curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale 69 | curr_gpu_base_time = None 70 | elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': 71 | if curr_gpu_base_time is None: 72 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 73 | else: 74 | if curr_gpu_base_time is None: 75 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 76 | event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] 77 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 78 | gpu_events.append({ 79 | 'ph': phase, 80 | 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, 81 | 'pid': rank, 82 | 'tid': buf_idx + 1 83 | }) 84 | if phase == 'B': 85 | if event_type not in event_type_to_seq: 86 | event_type_to_seq[event_type] = 0 87 | gpu_events[-1].update({ 88 | 'name': event_type, 89 | 'cat': 'GPU', 90 | 'args': { 91 | 'rank': rank, 92 | 'buf_idx': buf_idx, 93 | 'seq': event_type_to_seq[event_type], 94 | 'rsvd_0': parsed_gpu_event['rsvd'], 95 | 'size_0': parsed_gpu_event['size'] 96 | } 97 | }) 98 | event_type_to_seq[event_type] += 1 99 | else: 100 | gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} 101 | delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] 102 | gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3 103 | raw_content_idx += raw_event_size 104 | return gpu_events 105 | 106 | def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): 107 | cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) 108 | raw_event_size = 16 109 | cpu_events = [] 110 | event_type_to_seq = {} 111 | 112 | fiber_is_usable = [] 113 | fiber_open_ts = [] 114 | slot_to_fiber_id = {} 115 | channel_shift = 1000 116 | 117 | with open(cpu_event_file_path, 'rb') as f: 118 | raw_content = f.read() 119 | raw_content_size = len(raw_content) 120 | raw_content_idx = 0 121 | while raw_content_idx < raw_content_size: 122 | parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 123 | event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']] 124 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 125 | cpu_events.append({ 126 | 'ph': phase, 127 | 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 128 | 'pid': rank 129 | }) 130 | slot = parsed_cpu_event['slot'] 131 | if phase == 'B': 132 | # Open fiber event 133 | fiber_id = 0 134 | while fiber_id < len(fiber_is_usable): 135 | if fiber_is_usable[fiber_id]: 136 | break 137 | fiber_id += 1 138 | if fiber_id == len(fiber_is_usable): 139 | fiber_is_usable.append(True) 140 | fiber_open_ts.append(0.0) 141 | slot_to_fiber_id[slot] = fiber_id 142 | fiber_open_ts[fiber_id] = cpu_events[-1]['ts'] 143 | fiber_is_usable[fiber_id] = False 144 | 145 | if event_type not in event_type_to_seq: 146 | event_type_to_seq[event_type] = 0 147 | cpu_events[-1].update({ 148 | 'name': event_type, 149 | 'cat': 'CPU', 150 | 'args': { 151 | 'rank': rank, 152 | 'channel': channel, 153 | 'slot': parsed_cpu_event['slot'], 154 | 'seq': event_type_to_seq[event_type], 155 | 'size_0': parsed_cpu_event['size'] 156 | } 157 | }) 158 | event_type_to_seq[event_type] += 1 159 | else: 160 | # Close fiber event 161 | fiber_id = slot_to_fiber_id[slot] 162 | slot_to_fiber_id.pop(slot) 163 | last_ts = 
fiber_open_ts[fiber_id] 164 | fiber_is_usable[fiber_id] = True 165 | 166 | delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts) 167 | cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']} 168 | cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else cpu_events[-1]['args']['size'] / delta_time / 1e3 169 | 170 | cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift 171 | 172 | raw_content_idx += raw_event_size 173 | return cpu_events 174 | 175 | def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): 176 | files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] 177 | gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] 178 | cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')] 179 | 180 | ranks = list(set([int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files])) 181 | buf_indices = list(set([int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files])) 182 | channels = list(set([int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files])) 183 | 184 | trace = {'traceEvents': []} 185 | 186 | for rank in ranks: 187 | cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank) 188 | cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank) 189 | cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path) 190 | 191 | gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) 192 | gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) 193 | 194 | for buf_idx in buf_indices: 195 | gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) 196 | trace['traceEvents'].extend(gpu_events) 197 | 198 | for channel in channels: 199 | cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) 200 | trace['traceEvents'].extend(cpu_events) 201 | 202 | trace['traceEvents'].sort(key=lambda x : x['ts']) 203 | trace['displayTimeUnit'] = 'ns' 204 | 205 | os.makedirs(output_dir, exist_ok=True) 206 | with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f: 207 | json.dump(trace, f) 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser() 211 | parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.') 212 | parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.') 213 | parser.add_argument('--output_dir', type=str, required=True, help='Path to output directory.') 214 | args = parser.parse_args() 215 | 216 | npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path) 217 | convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def) 218 | -------------------------------------------------------------------------------- /mscclpp_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This file describes for NPKit sample workflow for [MSCCL++](https://github.com/microsoft/mscclpp). The sample workflow first builds MSCCL++ with NPKit enabled, then runs MSCCL++ executor test to collect NPKit event dump files, and finally generates NPKit trace file. 4 | 5 | ## Dependencies 6 | 7 | [MSCCL++](https://github.com/microsoft/mscclpp) (with NPKit integrated). 8 | 9 | ## Usage 10 | 11 | 1) Build MSCCL++ with NPKit enabled. 
12 | 13 | ``` 14 | $ git clone https://github.com/microsoft/mscclpp && cd mscclpp 15 | $ mkdir build && cd build 16 | $ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_LOCAL_GPU_TARGET_ONLY=ON -DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT" .. && make -j 17 | ``` 18 | 19 | 2) Create a directory to store NPKit dump files and trace files. 20 | 21 | ``` 22 | $ mkdir /path/to/npkit_dump 23 | $ mkdir /path/to/npkit_trace 24 | ``` 25 | 26 | 3) Run the MSCCL++ executor test with `NPKIT_DUMP_DIR` specified. 27 | 28 | ``` 29 | $ mpirun -tag-output -np 2 -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NPKIT_DUMP_DIR=/path/to/npkit_dump -x LD_PRELOAD=/path/to/mscclpp/build/libmscclpp.so:$LD_PRELOAD /path/to/mscclpp/build/test/executor_test 1024 allreduce_pairs /path/to/mscclpp/test/execution-files/allreduce_packet.json 1024 10 1 LL8 30 | ``` 31 | 32 | 4) Run the NPKit trace parsing script to generate the trace file. 33 | 34 | ``` 35 | $ python3 /path/to/mscclpp/tools/npkit/npkit_trace_generator.py --npkit_dump_dir=/path/to/npkit_dump --npkit_event_header_path=/path/to/mscclpp/include/mscclpp/npkit/npkit_event.hpp --output_dir=/path/to/npkit_trace 36 | ``` 37 | 38 | 5) The generated trace file `npkit_event_trace.json` is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers. 39 | -------------------------------------------------------------------------------- /nccl_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Important Note 2 | 3 | We highly recommend using [msccl_samples](https://github.com/microsoft/NPKit/tree/main/msccl_samples) to profile NCCL, because [MSCCL](https://github.com/Azure/msccl) includes all NCCL functions, has NPKit already integrated, and is actively maintained by Azure. The patch for NCCL in this folder is not actively maintained. 4 | 5 | ## Introduction 6 | 7 | This folder contains scripts for the NPKit sample workflow for NCCL. The sample workflow first builds NCCL with NPKit enabled, then runs nccl-tests to collect NPKit event dump files, and finally generates the NPKit trace file. 8 | 9 | ## Dependencies 10 | 11 | [NCCL 2.17.1-1](https://github.com/nvidia/nccl/tree/v2.17.1-1) and [nccl-tests](https://github.com/nvidia/nccl-tests). 12 | 13 | ## Usage 14 | 15 | 1) Get NCCL version 2.17.1-1 and apply `npkit-for-nccl-2.17.1-1.diff` to the source repo (a minimal sketch is shown after step 5). 16 | 17 | 2) Make sure the parameters in `npkit_launcher.sh` are valid. Also note that currently NPKit only supports collecting non-overlapped events on the GPU, and `NPKIT_FLAGS` should follow this rule. 18 | 19 | 3) Make sure the `nccl_test` function in `npkit_runner.sh` issues a valid command to run the `nccl-tests` binary. Also note that currently NPKit only supports 1 GPU per process, so `-g 1` mode is required in `nccl-tests` commands. 20 | 21 | 4) Run the command `bash npkit_launcher.sh`. 22 | 23 | 5) The generated trace file `npkit_event_trace.json` (zipped in `npkit_result.tar.gz`) is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers.
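As a concrete illustration of step 1, here is a minimal sketch of fetching NCCL 2.17.1-1, applying the patch, and building with a small event set enabled. The clone location, the `/path/to/...` placeholder, and the particular `NPKIT_FLAGS` selection are examples only (any non-overlapping event set defined by the patch works); when the build is driven through `npkit_launcher.sh` in step 4, `NPKIT_FLAGS` is taken from that script instead.

```
$ git clone -b v2.17.1-1 https://github.com/NVIDIA/nccl.git && cd nccl
$ git apply /path/to/NPKit/nccl_samples/npkit-for-nccl-2.17.1-1.diff
# The patch adds $(NPKIT_FLAGS) to CXXFLAGS/NVCUFLAGS, so profiling events are selected at build time.
$ make -j src.build NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT"
```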
24 | -------------------------------------------------------------------------------- /nccl_samples/npkit-for-nccl-2.17.1-1.diff: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | diff --git a/makefiles/common.mk b/makefiles/common.mk 5 | index 35d1826..d8ac620 100644 6 | --- a/makefiles/common.mk 7 | +++ b/makefiles/common.mk 8 | @@ -57,12 +57,12 @@ $(info NVCC_GENCODE is ${NVCC_GENCODE}) 9 | 10 | CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ 11 | -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ 12 | - -I $(CUDA_INC) \ 13 | + -I $(CUDA_INC) $(NPKIT_FLAGS) \ 14 | $(CXXFLAGS) 15 | # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) 16 | # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 17 | # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 18 | -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all 19 | +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all $(NPKIT_FLAGS) 20 | # Use addprefix so that we can specify more than one path 21 | NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt 22 | 23 | diff --git a/src/Makefile b/src/Makefile 24 | index ca5ddce..b75e961 100644 25 | --- a/src/Makefile 26 | +++ b/src/Makefile 27 | @@ -12,7 +12,7 @@ INCEXPORTS := nccl.h nccl_net.h 28 | LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \ 29 | misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \ 30 | misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ 31 | - misc/ipcsocket.cc \ 32 | + misc/ipcsocket.cc misc/npkit.cc \ 33 | transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \ 34 | collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ 35 | graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc 36 | diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h 37 | index f51eb43..f6c8022 100644 38 | --- a/src/collectives/device/all_reduce.h 39 | +++ b/src/collectives/device/all_reduce.h 40 | @@ -8,6 +8,10 @@ 41 | #include "collectives.h" 42 | #include "primitives.h" 43 | 44 | +#if defined(ENABLE_NPKIT) 45 | +#include "npkit/npkit.h" 46 | +#endif 47 | + 48 | namespace { 49 | template 50 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 51 | @@ -22,6 +26,32 @@ namespace { 52 | const ssize_t loopSize = nChannels*nranks*chunkSize; 53 | const ssize_t size = args->count; 54 | 55 | +#if defined(ENABLE_NPKIT) 56 | + int npKitCtxIdx = bid; 57 | +#endif 58 | + 59 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 60 | + if (tid == 0) { 61 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 62 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 63 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 64 | + } 65 | +#endif 66 | + 67 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 
68 | + if (tid == 0) { 69 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 70 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 71 | + } 72 | +#endif 73 | + 74 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY) 75 | + if (tid == 0) { 76 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, clock64(), 77 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 78 | + } 79 | +#endif 80 | + 81 | int minChunkSize; 82 | if (Proto::Id == NCCL_PROTO_LL) 83 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); 84 | @@ -33,6 +63,12 @@ namespace { 85 | Primitives, 1, Proto, 0> prims 86 | (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); 87 | 88 | +#if defined(ENABLE_NPKIT) 89 | + if (tid == 0) { 90 | + prims.npKitCtxIdx = npKitCtxIdx; 91 | + } 92 | +#endif 93 | + 94 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 95 | ssize_t realChunkSize; 96 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 97 | @@ -61,9 +97,34 @@ namespace { 98 | chunk = modRanks(ringIx + nranks-1); 99 | offset = calcOffset(chunk); 100 | nelem = min(realChunkSize, size-offset); 101 | + 102 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY) 103 | + if (tid == 0) { 104 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, clock64(), 105 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 106 | + prims.npKitDataProcessTotalTime = 0; 107 | + } 108 | +#endif 109 | + 110 | prims.send(offset, nelem); 111 | 112 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT) 113 | + if (tid == 0) { 114 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 115 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 116 | + } 117 | +#endif 118 | + 119 | // k-2 steps: reduce and copy to next GPU 120 | + 121 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY) 122 | + if (tid == 0 && nranks > 2) { 123 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, clock64(), 124 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 125 | + prims.npKitDataProcessTotalTime = 0; 126 | + } 127 | +#endif 128 | + 129 | for (int j=2; j 2) { 138 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 139 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 140 | + } 141 | +#endif 142 | + 143 | // step k-1: reduce this buffer and data, which will produce the final 144 | // result that we store in this data and push to the next GPU 145 | chunk = ringIx + 0; 146 | offset = calcOffset(chunk); 147 | nelem = min(realChunkSize, size-offset); 148 | + 149 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY) 150 | + if (tid == 0) { 151 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, clock64(), 152 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 153 | + prims.npKitDataProcessTotalTime = 0; 154 | + } 155 | +#endif 156 | + 157 | prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true); 158 | 159 | +#if defined(ENABLE_NPKIT) && 
defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT) 160 | + if (tid == 0) { 161 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 162 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 163 | + } 164 | +#endif 165 | + 166 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY) 167 | + if (tid == 0 && nranks > 2) { 168 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, clock64(), 169 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 170 | + prims.npKitDataProcessTotalTime = 0; 171 | + } 172 | +#endif 173 | + 174 | // k-2 steps: copy to next GPU 175 | for (int j=1; j 2) { 183 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 184 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 185 | + } 186 | +#endif 187 | + 188 | // Make final copy from buffer to dest. 189 | chunk = modRanks(ringIx + 1); 190 | offset = calcOffset(chunk); 191 | nelem = min(realChunkSize, size-offset); 192 | + 193 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY) 194 | + if (tid == 0) { 195 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, clock64(), 196 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 197 | + prims.npKitDataProcessTotalTime = 0; 198 | + } 199 | +#endif 200 | + 201 | prims.directRecv(offset, nelem); 202 | + 203 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT) 204 | + if (tid == 0) { 205 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 206 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 207 | + } 208 | +#endif 209 | + 210 | } 211 | + 212 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT) 213 | + if (tid == 0) { 214 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, clock64(), 215 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 216 | + } 217 | +#endif 218 | + 219 | } 220 | 221 | template 222 | @@ -110,12 +234,53 @@ namespace { 223 | const ssize_t loopSize = int(nChannels*chunkSize); 224 | const ssize_t size = args->count; 225 | 226 | +#if defined(ENABLE_NPKIT) 227 | + int npKitCtxIdx = bid; 228 | +#endif 229 | + 230 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 231 | + if (tid == 0) { 232 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 233 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 234 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 235 | + } 236 | +#endif 237 | + 238 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 239 | + if (tid == 0) { 240 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 241 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 242 | + } 243 | +#endif 244 | + 245 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY) 246 | + if (tid == 0) { 247 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, clock64(), 248 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 249 | + } 250 | +#endif 251 | + 252 | if 
(loopSize > size) 253 | chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize); 254 | 255 | { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) 256 | Primitives, /*Direct=*/0, Proto, 0> prims 257 | (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); 258 | + 259 | +#if defined(ENABLE_NPKIT) 260 | + if (tid == 0) { 261 | + prims.npKitCtxIdx = npKitCtxIdx; 262 | + } 263 | +#endif 264 | + 265 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY) 266 | + if (tid == 0) { 267 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, clock64(), 268 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 269 | + prims.npKitDataProcessTotalTime = 0; 270 | + } 271 | +#endif 272 | + 273 | if (tree->up == -1) { 274 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 275 | ssize_t offset = gridOffset + bid*int(chunkSize); 276 | @@ -137,11 +302,34 @@ namespace { 277 | prims.recvReduceSend(offset, nelem); 278 | } 279 | } 280 | + 281 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT) 282 | + if (tid == 0) { 283 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 284 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 285 | + } 286 | +#endif 287 | + 288 | } 289 | 290 | { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) 291 | Primitives, /*Direct=*/1, Proto, 0> prims 292 | (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); 293 | + 294 | +#if defined(ENABLE_NPKIT) 295 | + if (tid == 0) { 296 | + prims.npKitCtxIdx = npKitCtxIdx; 297 | + } 298 | +#endif 299 | + 300 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY) 301 | + if (tid == 0) { 302 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, clock64(), 303 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 304 | + prims.npKitDataProcessTotalTime = 0; 305 | + } 306 | +#endif 307 | + 308 | if (tree->up == -1) { 309 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 310 | ssize_t offset = gridOffset + bid*int(chunkSize); 311 | @@ -163,7 +351,23 @@ namespace { 312 | prims.directRecvCopySend(offset, offset, nelem); 313 | } 314 | } 315 | + 316 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT) 317 | + if (tid == 0) { 318 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 319 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 320 | + } 321 | +#endif 322 | + 323 | + } 324 | + 325 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT) 326 | + if (tid == 0) { 327 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, clock64(), 328 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 329 | } 330 | +#endif 331 | + 332 | } 333 | 334 | template 335 | @@ -193,6 +397,40 @@ namespace { 336 | nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE; 337 | } 338 | 339 | +#if defined(ENABLE_NPKIT) 340 | + bool isNpKitThread = false; 341 | + int npKitCtxIdx = 0; 342 | + if (threadIdx.x == 0) { 343 | + isNpKitThread = true; 344 | + npKitCtxIdx 
= bid * 2; 345 | + } else if (tree->up != -1 && threadIdx.x == nthreadsSplit) { 346 | + isNpKitThread = true; 347 | + npKitCtxIdx = bid * 2 + 1; 348 | + } 349 | +#endif 350 | + 351 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 352 | + if (isNpKitThread) { 353 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 354 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 355 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 356 | + } 357 | +#endif 358 | + 359 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 360 | + if (isNpKitThread) { 361 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 362 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 363 | + } 364 | +#endif 365 | + 366 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY) 367 | + if (isNpKitThread) { 368 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, clock64(), 369 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 370 | + } 371 | +#endif 372 | + 373 | if (loopSize > size) 374 | chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize); 375 | 376 | @@ -200,11 +438,34 @@ namespace { 377 | // Reduce and broadcast. Max number of recv is 3, max number of send is 3 378 | Primitives, /*Direct=*/1, Proto, 0> 379 | prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); 380 | + 381 | +#if defined(ENABLE_NPKIT) 382 | + if (isNpKitThread) { 383 | + prims.npKitCtxIdx = npKitCtxIdx; 384 | + } 385 | +#endif 386 | + 387 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY) 388 | + if (isNpKitThread) { 389 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, clock64(), 390 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 391 | + prims.npKitDataProcessTotalTime = 0; 392 | + } 393 | +#endif 394 | + 395 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 396 | ssize_t offset = gridOffset + bid*int(chunkSize); 397 | int nelem = min(chunkSize, size-offset); 398 | prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true); 399 | } 400 | + 401 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT) 402 | + if (isNpKitThread) { 403 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 404 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 405 | + } 406 | +#endif 407 | + 408 | } 409 | else if (tid < nthreadsSplit) { 410 | /* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local). 
411 | @@ -217,6 +478,21 @@ namespace { 412 | */ 413 | Primitives, /*Direct=*/1, Proto, 0> 414 | prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); 415 | + 416 | +#if defined(ENABLE_NPKIT) 417 | + if (isNpKitThread) { 418 | + prims.npKitCtxIdx = npKitCtxIdx; 419 | + } 420 | +#endif 421 | + 422 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY) 423 | + if (isNpKitThread) { 424 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, clock64(), 425 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 426 | + prims.npKitDataProcessTotalTime = 0; 427 | + } 428 | +#endif 429 | + 430 | if (tree->down[0] == -1) { 431 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 432 | ssize_t offset = gridOffset + bid*int(chunkSize); 433 | @@ -231,11 +507,34 @@ namespace { 434 | prims.recvReduceSend(offset, nelem); 435 | } 436 | } 437 | + 438 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT) 439 | + if (isNpKitThread) { 440 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 441 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 442 | + } 443 | +#endif 444 | + 445 | } 446 | else { 447 | // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) 448 | Primitives, /*Direct=*/1, Proto, 0> 449 | prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth); 450 | + 451 | +#if defined(ENABLE_NPKIT) 452 | + if (isNpKitThread) { 453 | + prims.npKitCtxIdx = npKitCtxIdx; 454 | + } 455 | +#endif 456 | + 457 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY) 458 | + if (isNpKitThread) { 459 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, clock64(), 460 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 461 | + prims.npKitDataProcessTotalTime = 0; 462 | + } 463 | +#endif 464 | + 465 | if (tree->down[0] == -1) { 466 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 467 | ssize_t offset = gridOffset + bid*int(chunkSize); 468 | @@ -250,7 +549,23 @@ namespace { 469 | prims.directRecvCopySend(offset, offset, nelem); 470 | } 471 | } 472 | + 473 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT) 474 | + if (isNpKitThread) { 475 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 476 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 477 | + } 478 | +#endif 479 | + 480 | } 481 | + 482 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT) 483 | + if (isNpKitThread) { 484 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, clock64(), 485 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 486 | + } 487 | +#endif 488 | + 489 | } 490 | } 491 | 492 | diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h 493 | index c43f1a5..869c6b5 100644 494 | --- a/src/collectives/device/prims_ll.h 495 | +++ b/src/collectives/device/prims_ll.h 496 | @@ -4,6 +4,10 @@ 497 | * See LICENSE.txt for license information 498 | 
************************************************************************/ 499 | 500 | +#if defined(ENABLE_NPKIT) 501 | +#include "npkit/npkit.h" 502 | +#endif 503 | + 504 | template 505 | class Primitives: 506 | public PrimitivesWithoutDirect> { 507 | @@ -36,6 +40,22 @@ class Primitives: 508 | union ncclLLFifoLine* recvBuff[MaxRecv]; 509 | union ncclLLFifoLine* sendBuff[MaxSend]; 510 | 511 | +#if defined(ENABLE_NPKIT) 512 | +public: 513 | + int npKitCtxIdx = 0; 514 | + uint64_t npKitDataProcessEntryTime = 0; 515 | + uint64_t npKitDataProcessExitTime = 0; 516 | + uint64_t npKitDataProcessTotalTime = 0; 517 | +private: 518 | +#endif 519 | + 520 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 521 | + uint64_t npKitWaitRecvDataProcessSize = 0; 522 | + uint64_t npKitWaitRecvEntryTime = 0; 523 | + uint64_t npKitWaitRecvExitTime = 0; 524 | + uint64_t npKitWaitRecvTotalTime = 0; 525 | +#endif 526 | + 527 | inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; } 528 | inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; } 529 | inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } 530 | @@ -62,6 +82,12 @@ class Primitives: 531 | } 532 | 533 | inline __device__ void waitSend(int nbytes) { 534 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY) 535 | + if (tid == 0) { 536 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, clock64(), 537 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 538 | + } 539 | +#endif 540 | if (sendConnHeadPtr) { 541 | int spins = 0; 542 | while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { 543 | @@ -75,6 +101,12 @@ class Primitives: 544 | sendConnHead += 1; 545 | } 546 | barrier(); 547 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT) 548 | + if (tid == 0) { 549 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, clock64(), 550 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 551 | + } 552 | +#endif 553 | } 554 | 555 | inline __device__ void incRecv(int i) { 556 | @@ -99,11 +131,30 @@ class Primitives: 557 | uint32_t flag = recvFlag(i); 558 | uint32_t data1, flag1, data2, flag2; 559 | int spins = 0; 560 | + 561 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 562 | + int npkitWaitRecvSpins = 0; 563 | + if (tid == 0) { 564 | + npKitWaitRecvEntryTime = clock64(); 565 | + } 566 | +#endif 567 | + 568 | do { 569 | asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); 570 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 571 | + npkitWaitRecvSpins++; 572 | +#endif 573 | if (checkAbort(spins, 0)) break; 574 | } while ((flag1 != flag) || (flag2 != flag)); 575 | uint64_t val64 = data1 + (((uint64_t)data2) << 32); 576 | + 577 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 
578 | + if (tid == 0) { 579 | + npKitWaitRecvExitTime = clock64(); 580 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 581 | + } 582 | +#endif 583 | + 584 | return val64; 585 | } 586 | 587 | @@ -121,11 +172,30 @@ class Primitives: 588 | union ncclLLFifoLine* src = recvPtr(i) + offset; 589 | uint32_t flag = recvFlag(i); 590 | int spins = 0; 591 | + 592 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 593 | + int npkitWaitRecvSpins = 0; 594 | + if (tid == 0) { 595 | + npKitWaitRecvEntryTime = clock64(); 596 | + } 597 | +#endif 598 | + 599 | while (line[i].flag1 != flag || line[i].flag2 != flag) { 600 | asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); 601 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 602 | + npkitWaitRecvSpins++; 603 | +#endif 604 | if (checkAbort(spins, 0)) break; 605 | } 606 | uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); 607 | + 608 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 609 | + if (tid == 0) { 610 | + npKitWaitRecvExitTime = clock64(); 611 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 612 | + } 613 | +#endif 614 | + 615 | return val64; 616 | } 617 | 618 | @@ -234,6 +304,22 @@ class Primitives: 619 | nelem = nelem < 0 ? 
0 : nelem; 620 | if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine)); 621 | 622 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) 623 | + if (tid == 0) { 624 | + npKitWaitRecvTotalTime = 0; 625 | + npKitWaitRecvDataProcessSize = nelem*sizeof(T); 626 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY, 627 | + npKitWaitRecvDataProcessSize, 0, clock64(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 628 | + } 629 | +#endif 630 | + 631 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 632 | + if (tid == 0) { 633 | + npKitWaitRecvTotalTime = 0; 634 | + npKitDataProcessEntryTime = clock64(); 635 | + } 636 | +#endif 637 | + 638 | nelem -= tid*EltPerLine; 639 | srcElts += tid*EltPerLine; 640 | dstElts += tid*EltPerLine; 641 | @@ -282,6 +368,21 @@ class Primitives: 642 | offset += nthreads; 643 | } 644 | 645 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 646 | + if (tid == 0) { 647 | + npKitDataProcessExitTime = clock64(); 648 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime; 649 | + } 650 | +#endif 651 | + 652 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) 653 | + if (tid == 0) { 654 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT, 655 | + npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, clock64(), 656 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 657 | + } 658 | +#endif 659 | + 660 | if (RECV) { 661 | for (int i=0; i < MaxRecv; i++) incRecv(i); 662 | postRecv(); 663 | @@ -367,27 +468,123 @@ class Primitives: 664 | } 665 | 666 | __device__ void send(intptr_t inpIx, int eltN) { 667 | - return LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 668 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY) 669 | + if (tid == 0) { 670 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 671 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 672 | + } 673 | +#endif 674 | + LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 675 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT) 676 | + if (tid == 0) { 677 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 678 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 679 | + } 680 | +#endif 681 | } 682 | __device__ void sendFromOutput(intptr_t outIx, int eltN) { 683 | - return LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 684 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY) 685 | + if (tid == 0) { 686 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, clock64(), 687 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 688 | + } 689 | +#endif 690 | + LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 691 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT) 692 | + if (tid == 0) { 693 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, clock64(), 694 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 695 | + } 696 | +#endif 697 | } 698 | __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { 699 | - return LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 700 | +#if 
defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY) 701 | + if (tid == 0) { 702 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, clock64(), 703 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 704 | + } 705 | +#endif 706 | + LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 707 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT) 708 | + if (tid == 0) { 709 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, clock64(), 710 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 711 | + } 712 | +#endif 713 | } 714 | __device__ void recvReduceSend(intptr_t inpIx, int eltN) { 715 | - return LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 716 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY) 717 | + if (tid == 0) { 718 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 719 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 720 | + } 721 | +#endif 722 | + LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 723 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT) 724 | + if (tid == 0) { 725 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 726 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 727 | + } 728 | +#endif 729 | } 730 | __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 731 | - return LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 732 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY) 733 | + if (tid == 0) { 734 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, clock64(), 735 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 736 | + } 737 | +#endif 738 | + LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 739 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT) 740 | + if (tid == 0) { 741 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, clock64(), 742 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 743 | + } 744 | +#endif 745 | } 746 | __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 747 | - return LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 748 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY) 749 | + if (tid == 0) { 750 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 751 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 752 | + } 753 | +#endif 754 | + LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 755 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT) 756 | + if (tid == 0) { 757 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 758 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 759 | + } 760 | +#endif 761 | } 762 | __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { 763 | - return LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); 764 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY) 765 | + if (tid == 0) { 766 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 767 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 768 | + } 769 | +#endif 770 | + LLGenericOp<1, 1, -1, Output>(-1, 
outIx, eltN, postOp); 771 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT) 772 | + if (tid == 0) { 773 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 774 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 775 | + } 776 | +#endif 777 | } 778 | __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 779 | - return LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 780 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY) 781 | + if (tid == 0) { 782 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 783 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 784 | + } 785 | +#endif 786 | + LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 787 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT) 788 | + if (tid == 0) { 789 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 790 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 791 | + } 792 | +#endif 793 | } 794 | }; 795 | diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h 796 | index 8a4570a..ac8fccd 100644 797 | --- a/src/collectives/device/prims_ll128.h 798 | +++ b/src/collectives/device/prims_ll128.h 799 | @@ -5,6 +5,9 @@ 800 | ************************************************************************/ 801 | 802 | #include "op128.h" 803 | +#if defined(ENABLE_NPKIT) 804 | +#include "npkit/npkit.h" 805 | +#endif 806 | 807 | #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) 808 | 809 | @@ -42,6 +45,22 @@ class Primitives: 810 | uint64_t* recvBuff[MaxRecv]; 811 | uint64_t* sendBuff[MaxSend]; 812 | 813 | +#if defined(ENABLE_NPKIT) 814 | +public: 815 | + int npKitCtxIdx = 0; 816 | + uint64_t npKitDataProcessEntryTime = 0; 817 | + uint64_t npKitDataProcessExitTime = 0; 818 | + uint64_t npKitDataProcessTotalTime = 0; 819 | +private: 820 | +#endif 821 | + 822 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 823 | + uint64_t npKitWaitRecvDataProcessSize = 0; 824 | + uint64_t npKitWaitRecvEntryTime = 0; 825 | + uint64_t npKitWaitRecvExitTime = 0; 826 | + uint64_t npKitWaitRecvTotalTime = 0; 827 | +#endif 828 | + 829 | inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } 830 | inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } 831 | inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } 832 | @@ -65,6 +84,12 @@ class Primitives: 833 | } 834 | 835 | inline __device__ void waitSend(int nbytes) { 836 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY) 837 | + if (tid == 0) { 838 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY, nbytes, 0, clock64(), 839 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 840 | + } 841 | +#endif 842 | if (sendConnHeadPtr) { 843 | int spins = 0; 844 | while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { 845 | @@ -76,6 +101,12 @@ class Primitives: 846 | } 847 | sendConnHead += 1; 848 | } 849 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT) 850 | + if (tid == 0) { 851 | + 
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT, nbytes, 0, clock64(), 852 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 853 | + } 854 | +#endif 855 | } 856 | 857 | inline __device__ void postRecv() { 858 | @@ -194,6 +225,14 @@ class Primitives: 859 | uint64_t flag = recvFlag(0); 860 | bool needReload; 861 | int spins = 0; 862 | + 863 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 864 | + int npkitWaitRecvSpins = 0; 865 | + if (tid == 0) { 866 | + npKitWaitRecvEntryTime = clock64(); 867 | + } 868 | +#endif 869 | + 870 | do { 871 | needReload = false; 872 | #pragma unroll 873 | @@ -201,9 +240,21 @@ class Primitives: 874 | load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); 875 | needReload |= flagThread && (vr[u+1] != flag); 876 | } 877 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 878 | + npkitWaitRecvSpins++; 879 | +#endif 880 | needReload &= (0 == checkAbort(spins, 0, 0)); 881 | } while (__any_sync(WARP_MASK, needReload)); 882 | 883 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 884 | + if (tid == 0) { 885 | + npKitWaitRecvExitTime = clock64(); 886 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 887 | + npkitWaitRecvSpins = 0; 888 | + } 889 | +#endif 890 | + 891 | + 892 | #pragma unroll 893 | for (int u=0; u: 896 | uint64_t* ptr = recvPtr(i)+ll128Offset; 897 | bool needReload; 898 | int spins = 0; 899 | + 900 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 901 | + int npkitWaitRecvSpins = 0; 902 | + if (tid == 0) { 903 | + npKitWaitRecvEntryTime = clock64(); 904 | + } 905 | +#endif 906 | + 907 | do { 908 | needReload = false; 909 | #pragma unroll 910 | @@ -246,9 +305,20 @@ class Primitives: 911 | load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); 912 | needReload |= flagThread && (vr[u+1] != flag); 913 | } 914 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 915 | + npkitWaitRecvSpins++; 916 | +#endif 917 | needReload &= (0 == checkAbort(spins, i, 0)); 918 | } while (__any_sync(WARP_MASK, needReload)); 919 | 920 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 921 | + if (tid == 0) { 922 | + npKitWaitRecvExitTime = clock64(); 923 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 924 | + npkitWaitRecvSpins = 0; 925 | + } 926 | +#endif 927 | + 928 | #pragma unroll 929 | for (int u=0; u: 932 | 933 | if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t)); 934 | barrier(); 935 | + 936 | +#if defined(ENABLE_NPKIT) && 
defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) 937 | + if (tid == 0) { 938 | + npKitWaitRecvTotalTime = 0; 939 | + npKitWaitRecvDataProcessSize = nelem*sizeof(T); 940 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY, 941 | + npKitWaitRecvDataProcessSize, 0, clock64(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 942 | + } 943 | +#endif 944 | + 945 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 946 | + if (tid == 0) { 947 | + npKitWaitRecvTotalTime = 0; 948 | + npKitDataProcessEntryTime = clock64(); 949 | + } 950 | +#endif 951 | + 952 | nelem -= DataEltPerSlice*warp; 953 | srcPtr += DataEltPerSlice*warp; 954 | dstPtr += DataEltPerSlice*warp; 955 | @@ -322,6 +409,22 @@ class Primitives: 956 | } 957 | 958 | barrier(); 959 | + 960 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 961 | + if (tid == 0) { 962 | + npKitDataProcessExitTime = clock64(); 963 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime; 964 | + } 965 | +#endif 966 | + 967 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) 968 | + if (tid == 0) { 969 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT, 970 | + npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, clock64(), 971 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 972 | + } 973 | +#endif 974 | + 975 | if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1; 976 | if (SEND) postSend(); 977 | if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1; 978 | @@ -408,27 +511,123 @@ public: 979 | } 980 | 981 | __device__ void send(intptr_t inpIx, int eltN) { 982 | - return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 983 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY) 984 | + if (tid == 0) { 985 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 986 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 987 | + } 988 | +#endif 989 | + GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 990 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT) 991 | + if (tid == 0) { 992 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 993 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 994 | + } 995 | +#endif 996 | } 997 | __device__ void sendFromOutput(intptr_t outIx, int eltN) { 998 | - return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 999 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY) 1000 | + if (tid == 0) { 1001 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, clock64(), 1002 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1003 | + } 1004 | +#endif 1005 | + GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 1006 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT) 1007 | + if (tid == 0) { 1008 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, clock64(), 1009 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1010 | + } 1011 | +#endif 1012 | } 1013 | __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { 1014 | - return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 1015 | +#if defined(ENABLE_NPKIT) && 
defined(ENABLE_NPKIT_EVENT_RECV_ENTRY) 1016 | + if (tid == 0) { 1017 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, clock64(), 1018 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1019 | + } 1020 | +#endif 1021 | + GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 1022 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT) 1023 | + if (tid == 0) { 1024 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, clock64(), 1025 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1026 | + } 1027 | +#endif 1028 | } 1029 | __device__ void recvReduceSend(intptr_t inpIx, int eltN) { 1030 | - return GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 1031 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY) 1032 | + if (tid == 0) { 1033 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1034 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1035 | + } 1036 | +#endif 1037 | + GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 1038 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT) 1039 | + if (tid == 0) { 1040 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1041 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1042 | + } 1043 | +#endif 1044 | } 1045 | __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 1046 | - return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 1047 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY) 1048 | + if (tid == 0) { 1049 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, clock64(), 1050 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1051 | + } 1052 | +#endif 1053 | + GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 1054 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT) 1055 | + if (tid == 0) { 1056 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, clock64(), 1057 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1058 | + } 1059 | +#endif 1060 | } 1061 | __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 1062 | - return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1063 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY) 1064 | + if (tid == 0) { 1065 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1066 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1067 | + } 1068 | +#endif 1069 | + GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1070 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT) 1071 | + if (tid == 0) { 1072 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1073 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1074 | + } 1075 | +#endif 1076 | } 1077 | __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { 1078 | - return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); 1079 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY) 1080 | + if (tid == 0) { 1081 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1082 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1083 | + } 1084 | +#endif 1085 | + 
GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); 1086 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT) 1087 | + if (tid == 0) { 1088 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1089 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1090 | + } 1091 | +#endif 1092 | } 1093 | __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 1094 | - return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1095 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY) 1096 | + if (tid == 0) { 1097 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1098 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1099 | + } 1100 | +#endif 1101 | + GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1102 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT) 1103 | + if (tid == 0) { 1104 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1105 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1106 | + } 1107 | +#endif 1108 | } 1109 | }; 1110 | diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h 1111 | index 2cd3797..3c7bd39 100644 1112 | --- a/src/collectives/device/prims_simple.h 1113 | +++ b/src/collectives/device/prims_simple.h 1114 | @@ -4,6 +4,10 @@ 1115 | * See LICENSE.txt for license information 1116 | ************************************************************************/ 1117 | 1118 | +#if defined(ENABLE_NPKIT) 1119 | +#include "npkit/npkit.h" 1120 | +#endif 1121 | + 1122 | template 1124 | class Primitives< 1125 | @@ -46,6 +50,15 @@ class Primitives< 1126 | uint64_t *connStepPtr; 1127 | uint64_t connStepCache; // Cache last seen value of (*connStepPtr) 1128 | 1129 | +#if defined(ENABLE_NPKIT) 1130 | +public: 1131 | + int npKitCtxIdx = 0; 1132 | + uint64_t npKitDataProcessEntryTime = 0; 1133 | + uint64_t npKitDataProcessExitTime = 0; 1134 | + uint64_t npKitDataProcessTotalTime = 0; 1135 | +private: 1136 | +#endif 1137 | + 1138 | // Don't use barrier 0 as it's used by the final sync 1139 | __device__ void barrier() { 1140 | flags |= ThreadsSynced; 1141 | @@ -238,20 +251,92 @@ class Primitives< 1142 | } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { 1143 | // We can only have one direct receive. 
Since srcs[0] == dstPtr+offset, skip one copy 1144 | if (Send) { 1145 | + 1146 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1147 | + if (tid == 0) { 1148 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, clock64(), 1149 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1150 | + } 1151 | +#endif 1152 | + 1153 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1154 | + if (tid == 0) { 1155 | + npKitDataProcessEntryTime = clock64(); 1156 | + } 1157 | +#endif 1158 | + 1159 | ReduceOrCopyMulti 1160 | (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1161 | 1, ncclShmem.groups[group].srcs, 1162 | fan.nsend(), ncclShmem.groups[group].dsts+1, 1163 | workSize); 1164 | + 1165 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1166 | + if (tid == 0) { 1167 | + npKitDataProcessExitTime = clock64(); 1168 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime; 1169 | + } 1170 | +#endif 1171 | + 1172 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1173 | + if (tid == 0) { 1174 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, clock64(), 1175 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1176 | + } 1177 | +#endif 1178 | + 1179 | } 1180 | } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) { 1181 | // For broadcast in CollNet to do empty send 1182 | + 1183 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1184 | + if (tid == 0) { 1185 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, clock64(), 1186 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1187 | + } 1188 | +#endif 1189 | + 1190 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1191 | + if (tid == 0) { 1192 | + npKitDataProcessEntryTime = clock64(); 1193 | + } 1194 | +#endif 1195 | + 1196 | ReduceOrCopyMulti 1197 | (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp, 1198 | Recv, ncclShmem.groups[group].srcs, 1199 | Dst, ncclShmem.groups[group].dsts, 1200 | workSize); 1201 | + 1202 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1203 | + if (tid == 0) { 1204 | + npKitDataProcessExitTime = clock64(); 1205 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime; 1206 | + } 1207 | +#endif 1208 | + 1209 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1210 | + if (tid == 0) { 1211 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, clock64(), 1212 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1213 | + } 1214 | +#endif 1215 | + 1216 | } else { 1217 | + 1218 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1219 | + if (tid == 0) { 1220 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, clock64(), 1221 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1222 | + } 1223 | +#endif 1224 | + 1225 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1226 | + if (tid == 0) { 1227 | + 
npKitDataProcessEntryTime = clock64(); 1228 | + } 1229 | +#endif 1230 | + 1231 | constexpr int PreOpSrcs = SrcBuf != Input ? 0 : 1232 | DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; 1233 | ReduceOrCopyMulti 1234 | @@ -259,6 +344,21 @@ class Primitives< 1235 | Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, 1236 | Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, 1237 | workSize); 1238 | + 1239 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1240 | + if (tid == 0) { 1241 | + npKitDataProcessExitTime = clock64(); 1242 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime; 1243 | + } 1244 | +#endif 1245 | + 1246 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1247 | + if (tid == 0) { 1248 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, clock64(), 1249 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1250 | + } 1251 | +#endif 1252 | + 1253 | } 1254 | barrier(); // This barrier has a counterpart in following loop 1255 | postPeer(0 < sliceSize); 1256 | diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h 1257 | index 41fe0c2..35ee734 100644 1258 | --- a/src/collectives/device/sendrecv.h 1259 | +++ b/src/collectives/device/sendrecv.h 1260 | @@ -7,19 +7,73 @@ 1261 | #include "devcomm.h" 1262 | #include "collectives.h" 1263 | #include "primitives.h" 1264 | +#if defined(ENABLE_NPKIT) 1265 | +#include "npkit/npkit.h" 1266 | +#endif 1267 | 1268 | template 1269 | struct RunWork { 1270 | template 1271 | __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 1272 | + 1273 | +#if defined(ENABLE_NPKIT) 1274 | + bool isNpKitThread = (tid == 0); 1275 | + int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P; 1276 | +#endif 1277 | + 1278 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 1279 | + if (isNpKitThread) { 1280 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 1281 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 1282 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1283 | + } 1284 | +#endif 1285 | + 1286 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 1287 | + if (isNpKitThread) { 1288 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 1289 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1290 | + } 1291 | +#endif 1292 | + 1293 | void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); 1294 | ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); 1295 | if (args->peer == ncclShmem.comm.rank) { 1296 | struct ncclWorkElemP2p* recvArgs = args-1; 1297 | void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); 1298 | if (buff != recvBuff) { 1299 | + 1300 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY) 1301 | + if (isNpKitThread) { 1302 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, clock64(), 1303 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1304 | + } 1305 | +#endif 1306 | + 1307 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1308 | + if (isNpKitThread) { 1309 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, 
count*sizeof(T), 0, clock64(), 1310 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1311 | + } 1312 | +#endif 1313 | + 1314 | ReduceOrCopyMulti 1315 | (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count); 1316 | + 1317 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1318 | + if (isNpKitThread) { 1319 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, clock64(), 1320 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1321 | + } 1322 | +#endif 1323 | + 1324 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT) 1325 | + if (isNpKitThread) { 1326 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, clock64(), 1327 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1328 | + } 1329 | +#endif 1330 | + 1331 | } 1332 | } else { 1333 | int chunkSize = args->chunkSize/sizeof(T); 1334 | @@ -27,17 +81,60 @@ struct RunWork { 1335 | int const peer = args->peer; 1336 | Primitives, 1, Proto, 1> prims 1337 | (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); 1338 | + 1339 | +#if defined(ENABLE_NPKIT) 1340 | + if (isNpKitThread) { 1341 | + prims.npKitCtxIdx = npKitCtxIdx; 1342 | + } 1343 | +#endif 1344 | + 1345 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY) 1346 | + if (isNpKitThread) { 1347 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, clock64(), 1348 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1349 | + prims.npKitDataProcessTotalTime = 0; 1350 | + } 1351 | +#endif 1352 | + 1353 | size_t offset = 0; 1354 | do { 1355 | int nelem = min(size_t(chunkSize), count-offset); 1356 | prims.directSend(offset, offset, nelem); 1357 | offset += nelem; 1358 | } while(offset < count); 1359 | + 1360 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT) 1361 | + if (isNpKitThread) { 1362 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 1363 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1364 | + } 1365 | +#endif 1366 | + 1367 | } 1368 | } 1369 | 1370 | template 1371 | __device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 1372 | +#if defined(ENABLE_NPKIT) 1373 | + bool isNpKitThread = (tid == 0); 1374 | + int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1; 1375 | +#endif 1376 | + 1377 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 1378 | + if (isNpKitThread) { 1379 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 1380 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 1381 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1382 | + } 1383 | +#endif 1384 | + 1385 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 1386 | + if (isNpKitThread) { 1387 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 1388 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1389 | + } 1390 | +#endif 1391 | + 1392 | if (args->peer != ncclShmem.comm.rank) { 1393 | void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); 1394 | ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); 1395 | @@ -46,12 +143,35 @@ struct RunWork { 1396 | int const peer = args->peer; 1397 | Primitives, 1, Proto, 1> 
prims 1398 | (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); 1399 | + 1400 | +#if defined(ENABLE_NPKIT) 1401 | + if (isNpKitThread) { 1402 | + prims.npKitCtxIdx = npKitCtxIdx; 1403 | + } 1404 | +#endif 1405 | + 1406 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY) 1407 | + if (isNpKitThread) { 1408 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, clock64(), 1409 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1410 | + prims.npKitDataProcessTotalTime = 0; 1411 | + } 1412 | +#endif 1413 | + 1414 | size_t offset = 0; 1415 | do { 1416 | int nelem = min(size_t(chunkSize), count-offset); 1417 | prims.directRecv(offset, nelem); 1418 | offset += nelem; 1419 | } while(offset < count); 1420 | + 1421 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT) 1422 | + if (isNpKitThread) { 1423 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 1424 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1425 | + } 1426 | +#endif 1427 | + 1428 | } 1429 | } 1430 | 1431 | diff --git a/src/include/devcomm.h b/src/include/devcomm.h 1432 | index 14ff92e..5d54049 100644 1433 | --- a/src/include/devcomm.h 1434 | +++ b/src/include/devcomm.h 1435 | @@ -9,6 +9,9 @@ 1436 | 1437 | #include "nccl.h" 1438 | #include "align.h" 1439 | +#if defined(ENABLE_NPKIT) 1440 | +#include "npkit/npkit_struct.h" 1441 | +#endif 1442 | #include 1443 | 1444 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 1445 | @@ -293,6 +296,12 @@ struct ncclDevComm { 1446 | 1447 | // Channels, device side 1448 | struct ncclDevChannel* channels/*[MAXCHANNELS]*/; 1449 | + 1450 | +#if defined(ENABLE_NPKIT) 1451 | + NpKitEventCollectContext* npKitEventCollectContexts; 1452 | + uint64_t* cpuTimestamp; 1453 | +#endif 1454 | + 1455 | }; 1456 | 1457 | struct alignas(16) ncclDevCommAndChannels { 1458 | diff --git a/src/include/npkit/npkit.h b/src/include/npkit/npkit.h 1459 | new file mode 100644 1460 | index 0000000..06b0deb 1461 | --- /dev/null 1462 | +++ b/src/include/npkit/npkit.h 1463 | @@ -0,0 +1,65 @@ 1464 | +#ifndef NPKIT_H_ 1465 | +#define NPKIT_H_ 1466 | + 1467 | +#include 1468 | +#include 1469 | + 1470 | +#include 1471 | + 1472 | +#include "npkit/npkit_event.h" 1473 | +#include "npkit/npkit_struct.h" 1474 | + 1475 | +class NpKit { 1476 | + public: 1477 | + static const uint64_t kNumGpuEventBuffers = 512; 1478 | + 1479 | + static const uint64_t kNumCpuEventBuffers = 32; 1480 | + 1481 | + static ncclResult_t Init(int rank); 1482 | + 1483 | + static ncclResult_t Dump(const std::string& dump_dir); 1484 | + 1485 | + static ncclResult_t Shutdown(); 1486 | + 1487 | + static NpKitEventCollectContext* GetGpuEventCollectContexts(); 1488 | + 1489 | + static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, 1490 | + NpKitEventCollectContext* ctx) { 1491 | + uint64_t event_buffer_head = ctx->event_buffer_head; 1492 | + if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { 1493 | + NpKitEvent& event = ctx->event_buffer[event_buffer_head]; 1494 | + event.fields.type = type; 1495 | + event.fields.size = size; 1496 | + event.fields.rsvd = rsvd; 1497 | + event.fields.timestamp = timestamp; 1498 | + ctx->event_buffer_head++; 1499 | + } 1500 | + } 1501 | + 1502 | + static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); 1503 | + 1504 | + static 
uint64_t* GetCpuTimestamp();
1505 | +
1506 | + private:
1507 | + static void CpuTimestampUpdateThread();
1508 | +
1509 | + // 64K * 512 * 16B = 512MB per GPU
1510 | + static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16;
1511 | +
1512 | + // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
1513 | + static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
1514 | +
1515 | + static NpKitEvent** gpu_event_buffers_;
1516 | + static NpKitEvent** cpu_event_buffers_;
1517 | +
1518 | + static NpKitEventCollectContext* gpu_collect_contexts_;
1519 | + static NpKitEventCollectContext* cpu_collect_contexts_;
1520 | + static uint64_t* cpu_timestamp_;
1521 | +
1522 | + static uint64_t rank_;
1523 | +
1524 | + static std::thread* cpu_timestamp_update_thread_;
1525 | + static volatile bool cpu_timestamp_update_thread_should_stop_;
1526 | +};
1527 | +
1528 | +#endif
1529 | diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h
1530 | new file mode 100644
1531 | index 0000000..b328fc9
1532 | --- /dev/null
1533 | +++ b/src/include/npkit/npkit_event.h
1534 | @@ -0,0 +1,98 @@
1535 | +#ifndef NPKIT_EVENT_H_
1536 | +#define NPKIT_EVENT_H_
1537 | +
1538 | +#define NPKIT_EVENT_INVALID 0x0
1539 | +
1540 | +#define NPKIT_EVENT_ALL_REDUCE_RING_ENTRY 0x1
1541 | +#define NPKIT_EVENT_ALL_REDUCE_RING_EXIT 0x2
1542 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY 0x3
1543 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT 0x4
1544 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY 0x5
1545 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT 0x6
1546 | +
1547 | +#define NPKIT_EVENT_COPY_SEND_ENTRY 0x7
1548 | +#define NPKIT_EVENT_COPY_SEND_EXIT 0x8
1549 | +#define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY 0x9
1550 | +#define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT 0xA
1551 | +#define NPKIT_EVENT_DIRECT_RECV_ENTRY 0xB
1552 | +#define NPKIT_EVENT_DIRECT_RECV_EXIT 0xC
1553 | +#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY 0xD
1554 | +#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT 0xE
1555 | +#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0xF
1556 | +#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x10
1557 | +#define NPKIT_EVENT_DIRECT_SEND_ENTRY 0x11
1558 | +#define NPKIT_EVENT_DIRECT_SEND_EXIT 0x12
1559 | +#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY 0x13
1560 | +#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT 0x14
1561 | +#define NPKIT_EVENT_RECV_ENTRY 0x15
1562 | +#define NPKIT_EVENT_RECV_EXIT 0x16
1563 | +#define NPKIT_EVENT_RECV_COPY_SEND_ENTRY 0x17
1564 | +#define NPKIT_EVENT_RECV_COPY_SEND_EXIT 0x18
1565 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 0x19
1566 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT 0x1A
1567 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY 0x1B
1568 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT 0x1C
1569 | +#define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY 0x1D
1570 | +#define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT 0x1E
1571 | +#define NPKIT_EVENT_SEND_ENTRY 0x1F
1572 | +#define NPKIT_EVENT_SEND_EXIT 0x20
1573 | +#define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY 0x21
1574 | +#define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT 0x22
1575 | +
1576 | +#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY 0x23
1577 | +#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT 0x24
1578 | +#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY 0x25
1579 | +#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT 0x26
1580 | +
1581 | +#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY 0x27
1582 | +#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT 0x28
1583 | +#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY 0x29
1584 | +#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT 0x2A
1585 | +
1586 | +#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY 0x2B
1587 | +#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT 0x2C
1588 | +#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY 0x2D
1589 | +#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT 0x2E
1590 | +
1591 | +#define NPKIT_EVENT_NET_SEND_ENTRY 0x2F
1592 | +#define NPKIT_EVENT_NET_SEND_EXIT 0x30
1593 | +
1594 | +#define NPKIT_EVENT_NET_RECV_ENTRY 0x31
1595 | +#define NPKIT_EVENT_NET_RECV_EXIT 0x32
1596 | +
1597 | +#define NPKIT_EVENT_TIME_SYNC_GPU 0x33
1598 | +#define NPKIT_EVENT_TIME_SYNC_CPU 0x34
1599 | +
1600 | +#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35
1601 | +#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36
1602 | +#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37
1603 | +#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38
1604 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39
1605 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A
1606 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
1607 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C
1608 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D
1609 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E
1610 | +
1611 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F
1612 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40
1613 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41
1614 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42
1615 | +
1616 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43
1617 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44
1618 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45
1619 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46
1620 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47
1621 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48
1622 | +
1623 | +#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49
1624 | +#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A
1625 | +#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B
1626 | +#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C
1627 | +#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D
1628 | +#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E
1629 | +
1630 | +#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
1631 | +
1632 | +#endif
1633 | diff --git a/src/include/npkit/npkit_struct.h b/src/include/npkit/npkit_struct.h
1634 | new file mode 100644
1635 | index 0000000..89dadcb
1636 | --- /dev/null
1637 | +++ b/src/include/npkit/npkit_struct.h
1638 | @@ -0,0 +1,25 @@
1639 | +#ifndef NPKIT_STRUCT_H_
1640 | +#define NPKIT_STRUCT_H_
1641 | +
1642 | +#include
1643 | +
1644 | +#pragma pack(push, 1)
1645 | +
1646 | +union NpKitEvent {
1647 | + uint64_t bits[2];
1648 | + struct {
1649 | + uint64_t type : 8;
1650 | + uint64_t size : 32;
1651 | + uint64_t rsvd : 24;
1652 | + uint64_t timestamp;
1653 | + } fields;
1654 | +};
1655 | +
1656 | +struct NpKitEventCollectContext {
1657 | + NpKitEvent* event_buffer;
1658 | + uint64_t event_buffer_head;
1659 | +};
1660 | +
1661 | +#pragma pack(pop)
1662 | +
1663 | +#endif
1664 | diff --git a/src/include/proxy.h b/src/include/proxy.h
1665 | index 5e7f728..bdbe46d 100644
1666 | --- a/src/include/proxy.h
1667 | +++ b/src/include/proxy.h
1668 | @@ -64,6 +64,19 @@ struct ncclProxySubArgs {
1669 | uint64_t end;
1670 | void* requests[NCCL_STEPS];
1671 | void* profilingEvents[NCCL_STEPS];
1672 | +
1673 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
1674 | + int npKitSizesFifo[NCCL_STEPS];
1675 | +#endif
1676 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY)
1677 | + int npKitSizesFifo[NCCL_STEPS];
1678 | + uint64_t npKitStartTime[NCCL_STEPS];
1679 | + uint64_t npKitLastPollTime[NCCL_STEPS];
1680 | + uint64_t npKitLastPollInterval[NCCL_STEPS];
1681 | + uint64_t npKitMaxPollInterval[NCCL_STEPS];
1682 | + uint64_t npKitPollIntervalSum[NCCL_STEPS];
1683 | + uint64_t npKitPollCnt[NCCL_STEPS];
1684 | +#endif
1685 | };
1686 |
1687 | struct ncclProxyArgs {
1688 | diff --git a/src/init.cc b/src/init.cc
1689 | index 40f7872..4d7d5ce 100644
1690 | --- a/src/init.cc
1691 | +++ b/src/init.cc
1692 | @@ -16,6 +16,9 @@
1693 | #include "enqueue.h"
1694 | #include "graph.h"
1695 | #include "argcheck.h"
1696 | +#if defined(ENABLE_NPKIT)
1697 | +#include "npkit/npkit.h"
1698 | +#endif
1699 | #include
1700 | #include
1701 | #include
1702 | @@ -399,7 +402,15 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
1703 | }
1704 | }
1705 |
1706 | +#if defined(ENABLE_NPKIT)
1707 | + // Init NPKit
1708 | + NCCLCHECK(NpKit::Init(comm->rank));
1709 | + tmpCommAndChans.comm.npKitEventCollectContexts = NpKit::GetGpuEventCollectContexts();
1710 | + tmpCommAndChans.comm.cpuTimestamp = NpKit::GetCpuTimestamp();
1711 | +#endif
1712 | +
1713 | NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.cudaStream), ret, fail);
1714 | +
1715 | exit:
1716 | CUDACHECK(cudaStreamSynchronize(comm->deviceStream.cudaStream));
1717 | NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
1718 | @@ -1454,11 +1465,26 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
1719 | int commDevice = comm->cudaDev;
1720 | ncclResult_t ret = ncclSuccess;
1721 |
1722 | +#if defined(ENABLE_NPKIT)
1723 | + const char* npkitDumpDir = nullptr;
1724 | +#endif
1725 | +
1726 | CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail);
1727 | if (savedDevice != commDevice) {
1728 | CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail);
1729 | }
1730 |
1731 | +#if defined(ENABLE_NPKIT)
1732 | + // Dump NPKit events and shutdown
1733 | + npkitDumpDir = getenv("NPKIT_DUMP_DIR");
1734 | + if (npkitDumpDir == nullptr) {
1735 | + WARN("NPKIT_DUMP_DIR is empty");
1736 | + } else {
1737 | + NCCLCHECKGOTO(NpKit::Dump(npkitDumpDir), ret, fail);
1738 | + }
1739 | + NCCLCHECKGOTO(NpKit::Shutdown(), ret, fail);
1740 | +#endif
1741 | +
1742 | TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
1743 |
1744 | if (comm->initState == ncclSuccess) {
1745 | diff --git a/src/misc/npkit.cc b/src/misc/npkit.cc
1746 | new file mode 100644
1747 | index 0000000..af180e4
1748 | --- /dev/null
1749 | +++ b/src/misc/npkit.cc
1750 | @@ -0,0 +1,174 @@
1751 | +#include
1752 | +#include
1753 | +#include
1754 | +
1755 | +#include "alloc.h"
1756 | +#include "npkit/npkit.h"
1757 | +
1758 | +uint64_t NpKit::rank_ = 0;
1759 | +
1760 | +NpKitEvent** NpKit::gpu_event_buffers_ = nullptr;
1761 | +NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
1762 | +
1763 | +NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
1764 | +NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
1765 | +uint64_t* NpKit::cpu_timestamp_ = nullptr;
1766 | +
1767 | +std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
1768 | +volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
1769 | +
1770 | +void NpKit::CpuTimestampUpdateThread() {
1771 | + uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
1772 | + uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
1773 | + uint64_t curr_steady_clock = 0;
1774 | + volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_;
1775 | + while (!cpu_timestamp_update_thread_should_stop_) {
1776 | + curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
1777 | + *volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock);
1778 | + }
1779 | +}
1780 | +
1781 | +ncclResult_t NpKit::Init(int rank) {
1782 | + uint64_t i = 0;
1783 | + NpKitEventCollectContext ctx;
1784 | + ctx.event_buffer_head = 0;
1785 | + rank_ = rank;
1786 | +
1787 | + // Init event data structures
1788 | + NCCLCHECK(ncclCalloc(&gpu_event_buffers_, kNumGpuEventBuffers));
1789 | + NCCLCHECK(ncclCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers));
1790 | + for (i = 0; i < kNumGpuEventBuffers; i++) {
1791 | + NCCLCHECK(ncclCudaCalloc(gpu_event_buffers_ + i, kMaxNumGpuEventsPerBuffer));
1792 | + ctx.event_buffer = gpu_event_buffers_[i];
1793 | + NCCLCHECK(ncclCudaMemcpy(gpu_collect_contexts_ + i, &ctx, 1));
1794 | + }
1795 | +
1796 | + NCCLCHECK(ncclCalloc(&cpu_event_buffers_, kNumCpuEventBuffers));
1797 | + NCCLCHECK(ncclCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers));
1798 | + for (i = 0; i < kNumCpuEventBuffers; i++) {
1799 | + NCCLCHECK(ncclCalloc(cpu_event_buffers_ + i, kMaxNumCpuEventsPerBuffer));
1800 | + ctx.event_buffer = cpu_event_buffers_[i];
1801 | + cpu_collect_contexts_[i] = ctx;
1802 | + }
1803 | +
1804 | + // Init timestamp
1805 | + NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_, 1));
1806 | + volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_;
1807 | + *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
1808 | + cpu_timestamp_update_thread_should_stop_ = false;
1809 | + cpu_timestamp_update_thread_ = new std::thread(CpuTimestampUpdateThread);
1810 | +
1811 | + return ncclSuccess;
1812 | +}
1813 | +
1814 | +ncclResult_t NpKit::Dump(const std::string& dump_dir) {
1815 | + uint64_t i = 0;
1816 | + std::string dump_file_path;
1817 | +
1818 | + // Dump CPU events
1819 | + for (i = 0; i < kNumCpuEventBuffers; i++) {
1820 | + dump_file_path = dump_dir;
1821 | + dump_file_path += "/cpu_events_rank_";
1822 | + dump_file_path += std::to_string(rank_);
1823 | + dump_file_path += "_channel_";
1824 | + dump_file_path += std::to_string(i);
1825 | + auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
1826 | + cpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[i]),
1827 | + cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
1828 | + cpu_trace_file.close();
1829 | + }
1830 | +
1831 | + // Dump CPU clock info
1832 | + dump_file_path = dump_dir;
1833 | + dump_file_path += "/cpu_clock_period_num_rank_";
1834 | + dump_file_path += std::to_string(rank_);
1835 | + std::string clock_period_num_str = std::to_string(std::chrono::steady_clock::duration::period::num);
1836 | + auto clock_period_num_file = std::fstream(dump_file_path, std::ios::out);
1837 | + clock_period_num_file.write(clock_period_num_str.c_str(), clock_period_num_str.length());
1838 | + clock_period_num_file.close();
1839 | +
1840 | + dump_file_path = dump_dir;
1841 | + dump_file_path += "/cpu_clock_period_den_rank_";
1842 | + dump_file_path += std::to_string(rank_);
1843 | + std::string clock_period_den_str = std::to_string(std::chrono::steady_clock::duration::period::den);
1844 | + auto clock_period_den_file = std::fstream(dump_file_path, std::ios::out);
1845 | + clock_period_den_file.write(clock_period_den_str.c_str(), clock_period_den_str.length());
1846 | + clock_period_den_file.close();
1847 | +
1848 | + // Dump GPU events, reuse CPU struct
1849 | + for (i = 0; i < kNumGpuEventBuffers; i++) {
1850 | + dump_file_path = dump_dir;
1851 | + dump_file_path += "/gpu_events_rank_";
1852 | + dump_file_path += std::to_string(rank_);
1853 | + dump_file_path += "_buf_";
1854 | + dump_file_path += std::to_string(i);
1855 | + NCCLCHECK(ncclCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer));
1856 | + NCCLCHECK(ncclCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
1857 | + auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
1858 | + gpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[0]),
1859 | + cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
1860 | + gpu_trace_file.close();
1861 | + }
1862 | +
1863 | + // Dump GPU clockRate
1864 | + dump_file_path = dump_dir;
1865 | + dump_file_path += "/gpu_clock_rate_rank_";
1866 | + dump_file_path += std::to_string(rank_);
1867 | + cudaDeviceProp dev_prop;
1868 | + int dev;
1869 | + CUDACHECK(cudaGetDevice(&dev));
1870 | + CUDACHECK(cudaGetDeviceProperties(&dev_prop, dev));
1871 | + std::string clock_rate_str = std::to_string(dev_prop.clockRate);
1872 | + auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
1873 | + gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
1874 | + gpu_clock_rate_file.close();
1875 | +
1876 | + return ncclSuccess;
1877 | +}
1878 | +
1879 | +ncclResult_t NpKit::Shutdown() {
1880 | + uint64_t i = 0;
1881 | +
1882 | + // Stop CPU timestamp updating thread
1883 | + cpu_timestamp_update_thread_should_stop_ = true;
1884 | + cpu_timestamp_update_thread_->join();
1885 | +
1886 | + // Free CPU event data structures
1887 | + for (i = 0; i < kNumCpuEventBuffers; i++) {
1888 | + free(cpu_event_buffers_[i]);
1889 | + }
1890 | + free(cpu_event_buffers_);
1891 | + free(cpu_collect_contexts_);
1892 | +
1893 | + // Free GPU event data structures
1894 | + for (i = 0; i < kNumGpuEventBuffers; i++) {
1895 | + CUDACHECK(cudaFree(gpu_event_buffers_[i]));
1896 | + }
1897 | + free(gpu_event_buffers_);
1898 | + CUDACHECK(cudaFree(gpu_collect_contexts_));
1899 | +
1900 | + // Free timestamp
1901 | + NCCLCHECK(ncclCudaHostFree(cpu_timestamp_));
1902 | +
1903 | + return ncclSuccess;
1904 | +}
1905 | +
1906 | +NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() {
1907 | + return gpu_collect_contexts_;
1908 | +}
1909 | +
1910 | +void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
1911 | + uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
1912 | + if (event_buffer_head < kMaxNumCpuEventsPerBuffer) {
1913 | + NpKitEvent& event = cpu_collect_contexts_[channel_id].event_buffer[event_buffer_head];
1914 | + event.fields.type = type;
1915 | + event.fields.size = size;
1916 | + event.fields.rsvd = rsvd;
1917 | + event.fields.timestamp = timestamp;
1918 | + cpu_collect_contexts_[channel_id].event_buffer_head++;
1919 | + }
1920 | +}
1921 | +
1922 |
+uint64_t* NpKit::GetCpuTimestamp() { 1923 | + return cpu_timestamp_; 1924 | +} 1925 | diff --git a/src/transport/net.cc b/src/transport/net.cc 1926 | index fe98a4c..6e0b801 100644 1927 | --- a/src/transport/net.cc 1928 | +++ b/src/transport/net.cc 1929 | @@ -12,6 +12,36 @@ 1930 | #include "gdrwrap.h" 1931 | #include "shm.h" 1932 | #include "profiler.h" 1933 | +#if defined(ENABLE_NPKIT) 1934 | +#include "npkit/npkit.h" 1935 | +#endif 1936 | + 1937 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 1938 | +#include 1939 | +static uint64_t g_npkit_net_check_latency_threshold_us = 100; 1940 | +static uint64_t g_npkit_time_den = 1000000000; 1941 | +static uint64_t g_npkit_time_num = 1; 1942 | +static uint64_t g_npkit_num_warmup_ops = 10000; 1943 | +static inline uint64_t npKitGetTsInUs() { 1944 | + return std::chrono::steady_clock::now().time_since_epoch().count() * 1000000 * g_npkit_time_num / g_npkit_time_den; 1945 | +} 1946 | +static void npKitInitCheckLatencyEnv() { 1947 | + const char* param_threshold_str = "NPKIT_NET_CHECK_LATENCY_THRESHOLD"; 1948 | + const char* param_warmup_str = "NPKIT_NUM_WARMUP_OPS"; 1949 | + static bool initialized = false; 1950 | + if (!initialized) { 1951 | + g_npkit_time_den = std::chrono::steady_clock::duration::period::den; 1952 | + g_npkit_time_num = std::chrono::steady_clock::duration::period::num; 1953 | + if (getenv(param_threshold_str) != nullptr) { 1954 | + g_npkit_net_check_latency_threshold_us = strtoull(getenv(param_threshold_str), nullptr, 10); 1955 | + } 1956 | + if (getenv(param_warmup_str) != nullptr) { 1957 | + g_npkit_num_warmup_ops = strtoull(getenv(param_warmup_str), nullptr, 10); 1958 | + } 1959 | + initialized = true; 1960 | + } 1961 | +} 1962 | +#endif 1963 | 1964 | static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); 1965 | 1966 | @@ -188,6 +218,11 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph 1967 | proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); 1968 | } 1969 | *((int*)connectInfo) = proxyRank; 1970 | + 1971 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 1972 | + npKitInitCheckLatencyEnv(); 1973 | +#endif 1974 | + 1975 | return ncclSuccess; 1976 | } 1977 | 1978 | @@ -221,6 +256,11 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph 1979 | NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); 1980 | INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, 1981 | req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); 1982 | + 1983 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 1984 | + npKitInitCheckLatencyEnv(); 1985 | +#endif 1986 | + 1987 | return ncclSuccess; 1988 | } 1989 | 1990 | @@ -863,7 +903,16 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct 1991 | 1992 | static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); 1993 | 1994 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 1995 | +static int g_npkit_net_poll_cnt = 0; 1996 | +#endif 1997 | + 1998 | static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { 1999 | + 2000 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2001 | + g_npkit_net_poll_cnt++; 2002 | +#endif 2003 | + 2004 | if (args->state == ncclProxyOpReady) { 2005 | for (int s=0; snsubs; s++) { 2006 | struct ncclProxySubArgs* sub = args->subs+s; 2007 | @@ -916,6 +965,14 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2008 | if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { 2009 | // We have something to receive, let's check if it's completely ready. 2010 | int size = sizesFifo[buffSlot]; 2011 | + 2012 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) 2013 | + sub->npKitSizesFifo[buffSlot] = size; 2014 | +#endif 2015 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2016 | + sub->npKitSizesFifo[buffSlot] = size; 2017 | +#endif 2018 | + 2019 | bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; 2020 | char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; 2021 | int ready = 1; 2022 | @@ -946,6 +1003,27 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2023 | // Data is ready, try to send. 2024 | NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); 2025 | if (sub->requests[buffSlot] != NULL) { 2026 | + 2027 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) 2028 | + NpKit::CollectCpuEvent( 2029 | + NPKIT_EVENT_NET_SEND_ENTRY, 2030 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2031 | + g_npkit_net_poll_cnt, 2032 | +#else 2033 | + size, 2034 | +#endif 2035 | + uint64_t(sub->requests+buffSlot)/sizeof(void*), 2036 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2037 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2038 | + g_npkit_net_poll_cnt = 0; 2039 | +#endif 2040 | +#endif 2041 | + 2042 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2043 | + sub->npKitStartTime[buffSlot] = sub->npKitLastPollTime[buffSlot] = npKitGetTsInUs(); 2044 | + sub->npKitMaxPollInterval[buffSlot] = sub->npKitPollIntervalSum[buffSlot] = sub->npKitPollCnt[buffSlot] = 0; 2045 | +#endif 2046 | + 2047 | TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); 2048 | sizesFifo[buffSlot] = -1; 2049 | // Make sure size is reset to zero before we update the head. 
2050 | @@ -963,7 +1041,48 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2051 | int done; 2052 | int buffSlot = (sub->base+sub->done)%NCCL_STEPS; 2053 | NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL)); 2054 | + 2055 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2056 | + uint64_t npKitPollTime = npKitGetTsInUs(); 2057 | + sub->npKitLastPollInterval[buffSlot] = npKitPollTime - sub->npKitLastPollTime[buffSlot]; 2058 | + sub->npKitPollIntervalSum[buffSlot] += sub->npKitLastPollInterval[buffSlot]; 2059 | + if (sub->npKitLastPollInterval[buffSlot] > sub->npKitMaxPollInterval[buffSlot]) { 2060 | + sub->npKitMaxPollInterval[buffSlot] = sub->npKitLastPollInterval[buffSlot]; 2061 | + } 2062 | + sub->npKitLastPollTime[buffSlot] = npKitPollTime; 2063 | + sub->npKitPollCnt[buffSlot]++; 2064 | +#endif 2065 | + 2066 | if (done) { 2067 | + 2068 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) 2069 | + NpKit::CollectCpuEvent( 2070 | + NPKIT_EVENT_NET_SEND_EXIT, 2071 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2072 | + g_npkit_net_poll_cnt, 2073 | +#else 2074 | + sub->npKitSizesFifo[buffSlot], 2075 | +#endif 2076 | + uint64_t(sub->requests+buffSlot)/sizeof(void*), 2077 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2078 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2079 | + g_npkit_net_poll_cnt = 0; 2080 | +#endif 2081 | +#endif 2082 | + 2083 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2084 | + uint64_t npKitSendDuration = sub->npKitLastPollTime[buffSlot] - sub->npKitStartTime[buffSlot]; 2085 | + if (g_npkit_num_warmup_ops > 0) { 2086 | + g_npkit_num_warmup_ops--; 2087 | + } 2088 | + if (g_npkit_num_warmup_ops == 0 && npKitSendDuration > g_npkit_net_check_latency_threshold_us) { 2089 | + fprintf(stdout, "NPKIT LONG SEND (R:%d,P:%d,C:%d,S:%d): %d took %lu us, last/max/sum poll interval %lu/%lu/%lu us, cnt: %lu, ts: %lu/%lu\n", 2090 | + comm->rank, sub->peer, sub->channelId, buffSlot, sub->npKitSizesFifo[buffSlot], npKitSendDuration, sub->npKitLastPollInterval[buffSlot], sub->npKitMaxPollInterval[buffSlot], sub->npKitPollIntervalSum[buffSlot], sub->npKitPollCnt[buffSlot], sub->npKitStartTime[buffSlot], sub->npKitLastPollTime[buffSlot]); 2091 | + sub->npKitStartTime[buffSlot] = sub->npKitLastPollTime[buffSlot] = npKitGetTsInUs(); 2092 | + sub->npKitMaxPollInterval[buffSlot] = sub->npKitPollIntervalSum[buffSlot] = sub->npKitPollCnt[buffSlot] = 0; 2093 | + } 2094 | +#endif 2095 | + 2096 | TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); 2097 | sub->done += args->sliceSteps; 2098 | for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); 2099 | @@ -989,6 +1108,11 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2100 | } 2101 | 2102 | static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { 2103 | + 2104 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2105 | + g_npkit_net_poll_cnt++; 2106 | +#endif 2107 | + 2108 | if (args->state == ncclProxyOpReady) { 2109 | // Initialize subs and group them by same recvComm. 
2110 | void* recvComm; 2111 | @@ -1070,6 +1194,27 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2112 | if (*requestPtr) { 2113 | for (int i=0; igroupSize; i++) { 2114 | struct ncclProxySubArgs* sub = subGroup+i; 2115 | + 2116 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT) 2117 | + NpKit::CollectCpuEvent( 2118 | + NPKIT_EVENT_NET_RECV_ENTRY, 2119 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2120 | + g_npkit_net_poll_cnt, 2121 | +#else 2122 | + sizes[i], 2123 | +#endif 2124 | + uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*), 2125 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2126 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2127 | + g_npkit_net_poll_cnt = 0; 2128 | +#endif 2129 | +#endif 2130 | + 2131 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2132 | + sub->npKitStartTime[step%NCCL_STEPS] = sub->npKitLastPollTime[step%NCCL_STEPS] = npKitGetTsInUs(); 2133 | + sub->npKitMaxPollInterval[step%NCCL_STEPS] = sub->npKitPollIntervalSum[step%NCCL_STEPS] = sub->npKitPollCnt[step%NCCL_STEPS] = 0; 2134 | +#endif 2135 | + 2136 | sub->posted += args->sliceSteps; 2137 | for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); 2138 | } 2139 | @@ -1089,12 +1234,56 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2140 | void* mhandles[NCCL_PROXY_MAX_SUBS]; 2141 | for (int i=0; irequests[step%NCCL_STEPS], &done, sizes)); 2143 | + 2144 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2145 | + uint64_t npKitPollTime = npKitGetTsInUs(); 2146 | + for (int i=0; igroupSize; i++) { 2147 | + struct ncclProxySubArgs* sub = subGroup + i; 2148 | + sub->npKitLastPollInterval[step%NCCL_STEPS] = npKitPollTime - sub->npKitLastPollTime[step%NCCL_STEPS]; 2149 | + sub->npKitPollIntervalSum[step%NCCL_STEPS] += sub->npKitLastPollInterval[step%NCCL_STEPS]; 2150 | + if (sub->npKitLastPollInterval[step%NCCL_STEPS] > sub->npKitMaxPollInterval[step%NCCL_STEPS]) { 2151 | + sub->npKitMaxPollInterval[step%NCCL_STEPS] = sub->npKitLastPollInterval[step%NCCL_STEPS]; 2152 | + } 2153 | + sub->npKitLastPollTime[step%NCCL_STEPS] = npKitPollTime; 2154 | + sub->npKitPollCnt[step%NCCL_STEPS]++; 2155 | + } 2156 | +#endif 2157 | + 2158 | if (done) { 2159 | int needFlush = 0; 2160 | int totalSize = 0; 2161 | for (int i=0; igroupSize; i++) { 2163 | struct ncclProxySubArgs* sub = subGroup + i; 2164 | + 2165 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT) 2166 | + NpKit::CollectCpuEvent( 2167 | + NPKIT_EVENT_NET_RECV_EXIT, 2168 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2169 | + g_npkit_net_poll_cnt, 2170 | +#else 2171 | + sizes[i], 2172 | +#endif 2173 | + uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*), 2174 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2175 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2176 | + g_npkit_net_poll_cnt = 0; 2177 | +#endif 2178 | +#endif 2179 | + 2180 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2181 | + if (g_npkit_num_warmup_ops > 0) { 2182 | + g_npkit_num_warmup_ops--; 2183 | + } 2184 | + uint64_t npKitRecvDuration = sub->npKitLastPollTime[step%NCCL_STEPS] - sub->npKitStartTime[step%NCCL_STEPS]; 2185 | + if (g_npkit_num_warmup_ops == 0 && npKitRecvDuration > g_npkit_net_check_latency_threshold_us) { 2186 | + fprintf(stdout, "NPKIT LONG RECV 
(R:%d,P:%d,C:%d,S:%lu): %d took %lu us, last/max/sum poll interval %lu/%lu/%lu us, cnt: %lu, ts: %lu/%lu\n", 2187 | + comm->rank, sub->peer, sub->channelId, step%NCCL_STEPS, sizes[i], npKitRecvDuration, sub->npKitLastPollInterval[step%NCCL_STEPS], sub->npKitMaxPollInterval[step%NCCL_STEPS], sub->npKitPollIntervalSum[step%NCCL_STEPS], sub->npKitPollCnt[step%NCCL_STEPS], sub->npKitStartTime[step%NCCL_STEPS], sub->npKitLastPollTime[step%NCCL_STEPS]); 2188 | + sub->npKitStartTime[step%NCCL_STEPS] = sub->npKitLastPollTime[step%NCCL_STEPS] = npKitGetTsInUs(); 2189 | + sub->npKitMaxPollInterval[step%NCCL_STEPS] = sub->npKitPollIntervalSum[step%NCCL_STEPS] = sub->npKitPollCnt[step%NCCL_STEPS] = 0; 2190 | + } 2191 | +#endif 2192 | + 2193 | sub->received += args->sliceSteps; 2194 | for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); 2195 | if (step < sub->nsteps) { 2196 | -------------------------------------------------------------------------------- /nccl_samples/npkit_launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # NCCL source directory. 7 | export NCCL_SRC_DIR="/mnt/nccl" 8 | 9 | # NPKit source directory. 10 | export NPKIT_SRC_DIR="/mnt/npkit" 11 | 12 | # Path to nccl-tests binary being profiled. 13 | export NCCL_TEST_BIN="/mnt/nccl-tests/build/all_reduce_perf" 14 | # export NCCL_TEST_BIN="/mnt/nccl-tests/build/alltoall_perf" 15 | 16 | # NPKit runtime directory, used to store logs and results. 17 | export NPKIT_RUN_DIR="/mnt/npkit_run" 18 | 19 | # Message size of NCCL operation. 20 | export NCCL_MSG_SIZE="16M" 21 | 22 | # NCCL communication algorithm. 23 | export NCCL_ALGO="Ring" 24 | # export NCCL_ALGO="Tree" 25 | 26 | # NCCL communication protocol. Simple and LL are supported. 27 | export NCCL_PROTO="Simple" 28 | # export NCCL_PROTO="LL" 29 | # export NCCL_PROTO="LL128" 30 | 31 | # Number of nccl-tests warmups. 32 | export NCCL_NUM_WARMUPS="0" 33 | 34 | # Number of nccl-tests iterations. 
35 | export NCCL_NUM_ITERS="10" 36 | 37 | NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" 38 | NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" 39 | 40 | export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" 41 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" 42 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" 43 | 44 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" 45 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" 46 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" 47 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" 48 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" 49 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" 50 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" 51 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" 52 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" 53 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" 54 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" 55 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" 56 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" 57 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" 58 | 59 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" 60 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" 61 | 62 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" 63 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" 64 | 65 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" 66 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" 67 | 68 | # export 
NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" 69 | 70 | bash npkit_runner.sh 71 | -------------------------------------------------------------------------------- /nccl_samples/npkit_runner.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # Function that runs nccl-tests and collect NPKit traces. 7 | # nccl_test 8 | # 9 | # 10 | function nccl_test() { 11 | mpirun --allow-run-as-root \ 12 | -map-by ppr:8:node --bind-to numa \ 13 | -x LD_PRELOAD=$2/build/lib/libnccl.so:$LD_PRELOAD \ 14 | -x NCCL_DEBUG=WARN \ 15 | -x NCCL_ALGO=$4 \ 16 | -x NCCL_PROTO=$5 \ 17 | -x NPKIT_DUMP_DIR=$8 \ 18 | $1 -b $3 -e $3 -f 2 -g 1 -c 1 -w $6 -n $7 | tee $9/log.txt 19 | } 20 | 21 | # Tag of this NPKit run. 22 | npkit_run_tag=`basename ${NCCL_TEST_BIN}`"/${msg_size}/${NCCL_ALGO}/${NCCL_PROTO}" 23 | 24 | # Path to NPKit dump directory. 25 | npkit_dump_dir="${NPKIT_RUN_DIR}/npkit_dump/${npkit_run_tag}" 26 | 27 | # Path to NPKit post-process directory. 28 | npkit_trace_dir="${NPKIT_RUN_DIR}/npkit_trace/${npkit_run_tag}" 29 | 30 | # Path to NPKit result directory. 31 | npkit_result_dir="${NPKIT_RUN_DIR}/npkit_result/${npkit_run_tag}" 32 | 33 | # Build NCCL with NPKit 34 | cd ${NCCL_SRC_DIR} 35 | make clean 36 | make -j src.build NPKIT_FLAGS="${NPKIT_FLAGS}" 37 | 38 | # Clean existing results 39 | rm -rf ${NPKIT_RUN_DIR} 40 | mkdir -p ${npkit_dump_dir} 41 | mkdir -p ${npkit_trace_dir} 42 | mkdir -p ${npkit_result_dir} 43 | 44 | # Run NPKit on all nodes. 45 | nccl_test ${NCCL_TEST_BIN} ${NCCL_SRC_DIR} ${NCCL_MSG_SIZE} ${NCCL_ALGO} ${NCCL_PROTO} ${NCCL_NUM_WARMUPS} ${NCCL_NUM_ITERS} ${npkit_dump_dir} ${npkit_result_dir} 46 | 47 | # Generate trace file 48 | cd ${NPKIT_SRC_DIR}/nccl_samples 49 | python3 npkit_trace_generator.py --npkit_dump_dir=${npkit_dump_dir} --npkit_event_header_path=${NCCL_SRC_DIR}/src/include/npkit/npkit_event.h --output_dir=${npkit_trace_dir} 50 | cd ${npkit_trace_dir} 51 | tar cvzf npkit_result.tar.gz npkit_event_trace.json 52 | mv npkit_result.tar.gz ${npkit_result_dir} 53 | -------------------------------------------------------------------------------- /nccl_samples/npkit_trace_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
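# Converts a directory of raw NPKit dump files into a Chrome trace ("traceEvents" JSON):
#   * npkit_event.h is parsed for "#define NPKIT_EVENT_*" lines to map event IDs to names.
#   * gpu_clock_rate_rank_* (kHz) and cpu_clock_period_num/den_rank_* files are turned into
#     ticks-per-microsecond scales for GPU and CPU timestamps.
#   * Event files are sequences of 16-byte little-endian records: 1-byte id, 4-byte size,
#     3-byte rsvd/slot, 8-byte timestamp.
#   * NPKIT_EVENT_TIME_SYNC_CPU/GPU records are used to shift GPU timestamps onto the CPU
#     time base before all events are merged, sorted by timestamp, and written to
#     npkit_event_trace.json in the output directory.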
3 | 4 | import argparse 5 | import os 6 | import json 7 | 8 | from queue import Queue 9 | 10 | def parse_npkit_event_header(npkit_event_header_path): 11 | npkit_event_def = {'id_to_type': {}, 'type_to_id': {}} 12 | with open(npkit_event_header_path, 'r') as f: 13 | lines = [x.strip() for x in f.readlines() if len(x.strip()) != 0] 14 | line_idx = 0 15 | while line_idx < len(lines): 16 | if lines[line_idx].startswith('#define NPKIT_EVENT_'): 17 | fields = lines[line_idx].split() 18 | if len(fields) == 3: 19 | event_type = fields[1] 20 | event_id = int(fields[2], 0) 21 | npkit_event_def['type_to_id'][event_type] = event_id 22 | npkit_event_def['id_to_type'][event_id] = event_type 23 | line_idx += 1 24 | return npkit_event_def 25 | 26 | def parse_gpu_clock_scale(gpu_clock_file_path): 27 | with open(gpu_clock_file_path, 'r') as f: 28 | freq_in_khz = f.read() 29 | return float(freq_in_khz) * 1e3 / 1e6 30 | 31 | def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): 32 | with open(cpu_clock_num_file_path, 'r') as f: 33 | num = float(f.read()) 34 | with open(cpu_clock_den_file_path, 'r') as f: 35 | den = float(f.read()) 36 | return den / num / 1e6 37 | 38 | def parse_gpu_event(event_bytes): 39 | return { 40 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 41 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 42 | 'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 43 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 44 | } 45 | 46 | def parse_cpu_event(event_bytes): 47 | return { 48 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 49 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 50 | 'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 51 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 52 | } 53 | 54 | def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): 55 | gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) 56 | raw_event_size = 16 57 | curr_cpu_base_time = None 58 | curr_gpu_base_time = None 59 | gpu_events = [] 60 | event_type_to_seq = {} 61 | with open(gpu_event_file_path, 'rb') as f: 62 | raw_content = f.read() 63 | raw_content_size = len(raw_content) 64 | raw_content_idx = 0 65 | while raw_content_idx < raw_content_size: 66 | parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 67 | if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU': 68 | curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale 69 | curr_gpu_base_time = None 70 | elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': 71 | if curr_gpu_base_time is None: 72 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 73 | else: 74 | if curr_gpu_base_time is None: 75 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 76 | event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] 77 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 78 | gpu_events.append({ 79 | 'ph': phase, 80 | 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, 81 | 'pid': rank, 82 | 'tid': buf_idx + 1 83 | }) 84 | if phase == 'B': 85 | if event_type not in event_type_to_seq: 86 | 
event_type_to_seq[event_type] = 0 87 | gpu_events[-1].update({ 88 | 'name': event_type, 89 | 'cat': 'GPU', 90 | 'args': { 91 | 'rank': rank, 92 | 'buf_idx': buf_idx, 93 | 'seq': event_type_to_seq[event_type], 94 | 'rsvd_0': parsed_gpu_event['rsvd'], 95 | 'size_0': parsed_gpu_event['size'] 96 | } 97 | }) 98 | event_type_to_seq[event_type] += 1 99 | else: 100 | gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} 101 | delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] 102 | gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3 103 | raw_content_idx += raw_event_size 104 | return gpu_events 105 | 106 | def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): 107 | cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) 108 | raw_event_size = 16 109 | cpu_events = [] 110 | event_type_to_seq = {} 111 | 112 | fiber_is_usable = [] 113 | fiber_open_ts = [] 114 | slot_to_fiber_id = {} 115 | channel_shift = 1000 116 | 117 | with open(cpu_event_file_path, 'rb') as f: 118 | raw_content = f.read() 119 | raw_content_size = len(raw_content) 120 | raw_content_idx = 0 121 | while raw_content_idx < raw_content_size: 122 | parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 123 | event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']] 124 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 125 | cpu_events.append({ 126 | 'ph': phase, 127 | 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 128 | 'pid': rank 129 | }) 130 | slot = parsed_cpu_event['slot'] 131 | if phase == 'B': 132 | # Open fiber event 133 | fiber_id = 0 134 | while fiber_id < len(fiber_is_usable): 135 | if fiber_is_usable[fiber_id]: 136 | break 137 | fiber_id += 1 138 | if fiber_id == len(fiber_is_usable): 139 | fiber_is_usable.append(True) 140 | fiber_open_ts.append(0.0) 141 | slot_to_fiber_id[slot] = fiber_id 142 | fiber_open_ts[fiber_id] = cpu_events[-1]['ts'] 143 | fiber_is_usable[fiber_id] = False 144 | 145 | if event_type not in event_type_to_seq: 146 | event_type_to_seq[event_type] = 0 147 | cpu_events[-1].update({ 148 | 'name': event_type, 149 | 'cat': 'CPU', 150 | 'args': { 151 | 'rank': rank, 152 | 'channel': channel, 153 | 'slot': parsed_cpu_event['slot'], 154 | 'seq': event_type_to_seq[event_type], 155 | 'size_0': parsed_cpu_event['size'] 156 | } 157 | }) 158 | event_type_to_seq[event_type] += 1 159 | else: 160 | # Close fiber event 161 | fiber_id = slot_to_fiber_id[slot] 162 | slot_to_fiber_id.pop(slot) 163 | last_ts = fiber_open_ts[fiber_id] 164 | fiber_is_usable[fiber_id] = True 165 | 166 | delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts) 167 | cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']} 168 | cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. 
else cpu_events[-1]['args']['size'] / delta_time / 1e3 169 | 170 | cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift 171 | 172 | raw_content_idx += raw_event_size 173 | return cpu_events 174 | 175 | def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): 176 | files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] 177 | gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] 178 | cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')] 179 | 180 | ranks = list(set([int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files])) 181 | buf_indices = list(set([int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files])) 182 | channels = list(set([int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files])) 183 | 184 | trace = {'traceEvents': []} 185 | 186 | for rank in ranks: 187 | cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank) 188 | cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank) 189 | cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path) 190 | 191 | gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) 192 | gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) 193 | 194 | for buf_idx in buf_indices: 195 | gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) 196 | trace['traceEvents'].extend(gpu_events) 197 | 198 | for channel in channels: 199 | cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) 200 | trace['traceEvents'].extend(cpu_events) 201 | 202 | trace['traceEvents'].sort(key=lambda x : x['ts']) 203 | trace['displayTimeUnit'] = 'ns' 204 | 205 | os.makedirs(output_dir, exist_ok=True) 206 | with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f: 207 | json.dump(trace, f) 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser() 211 | parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.') 212 | parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.') 213 | parser.add_argument('--output_dir', type=str, required=True, help='Path to output directory.') 214 | args = parser.parse_args() 215 | 216 | npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path) 217 | convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def) 218 | -------------------------------------------------------------------------------- /npkit_result_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/NPKit/4cbb26e3c145d2f9b19892ee250a17e8a4e4e680/npkit_result_example.png -------------------------------------------------------------------------------- /rccl_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This folder contains scripts for NPKit sample workflow for RCCL. The sample workflow first builds RCCL with NPKit enabled, then runs rccl-test to collect NPKit event dump files, and finally generates NPKit trace file. 4 | 5 | ## Dependencies 6 | 7 | [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) (with NPKit integrated) and [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests). 
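In addition, a Python 3 interpreter is needed for `npkit_trace_generator.py`. Once a run completes, the generated `npkit_event_trace.json` can be sanity-checked without a trace viewer; a minimal sketch (assuming the default file name produced by `npkit_trace_generator.py`, path adjusted to your `npkit_trace` directory) is:

```python
# Quick summary of an NPKit trace file.
import json
from collections import Counter

with open("npkit_event_trace.json") as f:
    trace = json.load(f)

events = trace["traceEvents"]
print("total events:", len(events))
print("ranks:", sorted({e["pid"] for e in events}))
print("top event types:", Counter(e["name"] for e in events if "name" in e).most_common(5))
```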
8 | 9 | ## Usage 10 | 11 | 1) Make sure parameters in `npkit_launcher.sh` are valid. Also note that currently NPKit only supports collecting non-overlapped events in GPU, and `NPKIT_FLAGS` should follow this rule. 12 | 13 | 2) Make sure `rccl_test` function in `npkit_runner.sh` is a valid command to run `rccl-tests` binary. Also note that currently NPKit only supports 1 GPU per process, so `-g 1` mode is required in `rccl-tests` commands. 14 | 15 | 3) Run command `bash npkit_launcher.sh`. 16 | 17 | 4) The generated trace file `npkit_event_trace.json` (zipped in `npkit_result.tar.gz`) is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers. 18 | -------------------------------------------------------------------------------- /rccl_samples/npkit_launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # RCCL source directory. 7 | export RCCL_SRC_DIR="/mnt/rccl" 8 | 9 | # NPKit source directory. 10 | export NPKIT_SRC_DIR="/mnt/npkit" 11 | 12 | # Path to rccl-tests binary being profiled. 13 | export RCCL_TEST_BIN="/mnt/rccl-tests/build/all_reduce_perf" 14 | # export RCCL_TEST_BIN="/mnt/rccl-tests/build/alltoall_perf" 15 | 16 | # NPKit runtime directory, used to store logs and results. 17 | export NPKIT_RUN_DIR="/mnt/npkit_run" 18 | 19 | # Message size of RCCL operation. 20 | export RCCL_MSG_SIZE="16M" 21 | 22 | # RCCL communication algorithm. 23 | export RCCL_ALGO="Ring" 24 | # export RCCL_ALGO="Tree" 25 | 26 | # RCCL communication protocol. Simple and LL are supported. 27 | export RCCL_PROTO="Simple" 28 | # export RCCL_PROTO="LL" 29 | 30 | # Number of rccl-tests warmups. 31 | export RCCL_NUM_WARMUPS="0" 32 | 33 | # Number of rccl-tests iterations. 
34 | export RCCL_NUM_ITERS="10" 35 | 36 | NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" 37 | NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" 38 | 39 | export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" 40 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" 41 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" 42 | 43 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" 44 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" 45 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" 46 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" 47 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" 48 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" 49 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" 50 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" 51 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" 52 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" 53 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" 54 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" 55 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" 56 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" 57 | 58 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" 59 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" 60 | 61 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" 62 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" 63 | 64 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" 65 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" 66 | 67 | # export 
NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" 68 | 69 | bash npkit_runner.sh 70 | -------------------------------------------------------------------------------- /rccl_samples/npkit_runner.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # Function that runs rccl-tests and collect NPKit traces. 7 | # rccl_test 8 | # 9 | # 10 | function rccl_test() { 11 | mpirun --allow-run-as-root \ 12 | -map-by ppr:16:node --bind-to numa \ 13 | -x LD_PRELOAD=$2/build/librccl.so:$LD_LIBRARY_PATH \ 14 | -x NCCL_DEBUG=WARN \ 15 | -x NCCL_ALGO=$4 \ 16 | -x NCCL_PROTO=$5 \ 17 | -x NPKIT_DUMP_DIR=$8 \ 18 | $1 -b $3 -e $3 -f 2 -g 1 -c 1 -w $6 -n $7 | tee $9/log.txt 19 | } 20 | 21 | # Tag of this NPKit run. 22 | npkit_run_tag=`basename ${RCCL_TEST_BIN}`"/${msg_size}/${RCCL_ALGO}/${RCCL_PROTO}" 23 | 24 | # Path to NPKit dump directory. 25 | npkit_dump_dir="${NPKIT_RUN_DIR}/npkit_dump/${npkit_run_tag}" 26 | 27 | # Path to NPKit post-process directory. 28 | npkit_trace_dir="${NPKIT_RUN_DIR}/npkit_trace/${npkit_run_tag}" 29 | 30 | # Path to NPKit result directory. 31 | npkit_result_dir="${NPKIT_RUN_DIR}/npkit_result/${npkit_run_tag}" 32 | 33 | # Build RCCL with NPKit 34 | cd ${RCCL_SRC_DIR} 35 | rm -rf build 36 | mkdir -p build 37 | cd build 38 | CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DNPKIT_FLAGS="${NPKIT_FLAGS}" .. 39 | make -j 40 | 41 | # Clean existing results 42 | rm -rf ${NPKIT_RUN_DIR} 43 | mkdir -p ${npkit_dump_dir} 44 | mkdir -p ${npkit_trace_dir} 45 | mkdir -p ${npkit_result_dir} 46 | 47 | # Run NPKit on all nodes. 48 | rccl_test ${RCCL_TEST_BIN} ${RCCL_SRC_DIR} ${RCCL_MSG_SIZE} ${RCCL_ALGO} ${RCCL_PROTO} ${RCCL_NUM_WARMUPS} ${RCCL_NUM_ITERS} ${npkit_dump_dir} ${npkit_result_dir} 49 | 50 | # Generate trace file 51 | cd ${NPKIT_SRC_DIR}/rccl_samples 52 | python3 npkit_trace_generator.py --npkit_dump_dir=${npkit_dump_dir} --npkit_event_header_path=${RCCL_SRC_DIR}/src/include/npkit/npkit_event.h --output_dir=${npkit_trace_dir} 53 | cd ${npkit_trace_dir} 54 | tar cvzf npkit_result.tar.gz npkit_event_trace.json 55 | mv npkit_result.tar.gz ${npkit_result_dir} 56 | -------------------------------------------------------------------------------- /rccl_samples/npkit_trace_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
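# Same post-processor as nccl_samples/npkit_trace_generator.py: it decodes the fixed 16-byte
# little-endian event records (1-byte id, 4-byte size, 3-byte rsvd/slot, 8-byte timestamp),
# scales CPU/GPU timestamps to microseconds, and emits npkit_event_trace.json.
# CPU (proxy) events are grouped per channel; concurrent slots within a channel are assigned
# to "fiber" lanes so overlapping NET_SEND/NET_RECV intervals render on separate tid rows
# (tid = fiber_id + (channel + 1) * 1000).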
3 | 4 | import argparse 5 | import os 6 | import json 7 | 8 | from queue import Queue 9 | 10 | def parse_npkit_event_header(npkit_event_header_path): 11 | npkit_event_def = {'id_to_type': {}, 'type_to_id': {}} 12 | with open(npkit_event_header_path, 'r') as f: 13 | lines = [x.strip() for x in f.readlines() if len(x.strip()) != 0] 14 | line_idx = 0 15 | while line_idx < len(lines): 16 | if lines[line_idx].startswith('#define NPKIT_EVENT_'): 17 | fields = lines[line_idx].split() 18 | if len(fields) == 3: 19 | event_type = fields[1] 20 | event_id = int(fields[2], 0) 21 | npkit_event_def['type_to_id'][event_type] = event_id 22 | npkit_event_def['id_to_type'][event_id] = event_type 23 | line_idx += 1 24 | return npkit_event_def 25 | 26 | def parse_gpu_clock_scale(gpu_clock_file_path): 27 | with open(gpu_clock_file_path, 'r') as f: 28 | freq_in_khz = f.read() 29 | return float(freq_in_khz) * 1e3 / 1e6 30 | 31 | def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): 32 | with open(cpu_clock_num_file_path, 'r') as f: 33 | num = float(f.read()) 34 | with open(cpu_clock_den_file_path, 'r') as f: 35 | den = float(f.read()) 36 | return den / num / 1e6 37 | 38 | def parse_gpu_event(event_bytes): 39 | return { 40 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 41 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 42 | 'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 43 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 44 | } 45 | 46 | def parse_cpu_event(event_bytes): 47 | return { 48 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 49 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 50 | 'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 51 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 52 | } 53 | 54 | def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): 55 | gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) 56 | raw_event_size = 16 57 | curr_cpu_base_time = None 58 | curr_gpu_base_time = None 59 | gpu_events = [] 60 | event_type_to_seq = {} 61 | with open(gpu_event_file_path, 'rb') as f: 62 | raw_content = f.read() 63 | raw_content_size = len(raw_content) 64 | raw_content_idx = 0 65 | while raw_content_idx < raw_content_size: 66 | parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 67 | if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU': 68 | curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale 69 | curr_gpu_base_time = None 70 | elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': 71 | if curr_gpu_base_time is None: 72 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 73 | else: 74 | if curr_gpu_base_time is None: 75 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 76 | event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] 77 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 78 | gpu_events.append({ 79 | 'ph': phase, 80 | 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, 81 | 'pid': rank, 82 | 'tid': buf_idx + 1 83 | }) 84 | if phase == 'B': 85 | if event_type not in event_type_to_seq: 86 | 
event_type_to_seq[event_type] = 0 87 | gpu_events[-1].update({ 88 | 'name': event_type, 89 | 'cat': 'GPU', 90 | 'args': { 91 | 'rank': rank, 92 | 'buf_idx': buf_idx, 93 | 'seq': event_type_to_seq[event_type], 94 | 'rsvd_0': parsed_gpu_event['rsvd'], 95 | 'size_0': parsed_gpu_event['size'] 96 | } 97 | }) 98 | event_type_to_seq[event_type] += 1 99 | else: 100 | gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} 101 | delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] 102 | gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3 103 | raw_content_idx += raw_event_size 104 | return gpu_events 105 | 106 | def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): 107 | cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) 108 | raw_event_size = 16 109 | cpu_events = [] 110 | event_type_to_seq = {} 111 | 112 | fiber_is_usable = [] 113 | fiber_open_ts = [] 114 | slot_to_fiber_id = {} 115 | channel_shift = 1000 116 | 117 | with open(cpu_event_file_path, 'rb') as f: 118 | raw_content = f.read() 119 | raw_content_size = len(raw_content) 120 | raw_content_idx = 0 121 | while raw_content_idx < raw_content_size: 122 | parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 123 | event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']] 124 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 125 | cpu_events.append({ 126 | 'ph': phase, 127 | 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 128 | 'pid': rank 129 | }) 130 | slot = parsed_cpu_event['slot'] 131 | if phase == 'B': 132 | # Open fiber event 133 | fiber_id = 0 134 | while fiber_id < len(fiber_is_usable): 135 | if fiber_is_usable[fiber_id]: 136 | break 137 | fiber_id += 1 138 | if fiber_id == len(fiber_is_usable): 139 | fiber_is_usable.append(True) 140 | fiber_open_ts.append(0.0) 141 | slot_to_fiber_id[slot] = fiber_id 142 | fiber_open_ts[fiber_id] = cpu_events[-1]['ts'] 143 | fiber_is_usable[fiber_id] = False 144 | 145 | if event_type not in event_type_to_seq: 146 | event_type_to_seq[event_type] = 0 147 | cpu_events[-1].update({ 148 | 'name': event_type, 149 | 'cat': 'CPU', 150 | 'args': { 151 | 'rank': rank, 152 | 'channel': channel, 153 | 'slot': parsed_cpu_event['slot'], 154 | 'seq': event_type_to_seq[event_type], 155 | 'size_0': parsed_cpu_event['size'] 156 | } 157 | }) 158 | event_type_to_seq[event_type] += 1 159 | else: 160 | # Close fiber event 161 | fiber_id = slot_to_fiber_id[slot] 162 | slot_to_fiber_id.pop(slot) 163 | last_ts = fiber_open_ts[fiber_id] 164 | fiber_is_usable[fiber_id] = True 165 | 166 | delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts) 167 | cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']} 168 | cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. 
else cpu_events[-1]['args']['size'] / delta_time / 1e3 169 | 170 | cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift 171 | 172 | raw_content_idx += raw_event_size 173 | return cpu_events 174 | 175 | def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): 176 | files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] 177 | gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] 178 | cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')] 179 | 180 | ranks = list(set([int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files])) 181 | buf_indices = list(set([int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files])) 182 | channels = list(set([int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files])) 183 | 184 | trace = {'traceEvents': []} 185 | 186 | for rank in ranks: 187 | cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank) 188 | cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank) 189 | cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path) 190 | 191 | gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) 192 | gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) 193 | 194 | for buf_idx in buf_indices: 195 | gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) 196 | trace['traceEvents'].extend(gpu_events) 197 | 198 | for channel in channels: 199 | cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) 200 | trace['traceEvents'].extend(cpu_events) 201 | 202 | trace['traceEvents'].sort(key=lambda x : x['ts']) 203 | trace['displayTimeUnit'] = 'ns' 204 | 205 | os.makedirs(output_dir, exist_ok=True) 206 | with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f: 207 | json.dump(trace, f) 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser() 211 | parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.') 212 | parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.') 213 | parser.add_argument('--output_dir', type=str, required=True, help='Path to output directory.') 214 | args = parser.parse_args() 215 | 216 | npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path) 217 | convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def) 218 | --------------------------------------------------------------------------------