├── CODE_OF_CONDUCT.md ├── CONTRIBUTE.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── msccl_samples ├── README.md ├── npkit_launcher.sh ├── npkit_runner.sh └── npkit_trace_generator.py ├── mscclpp_samples └── README.md ├── nccl_samples ├── README.md ├── npkit-for-nccl-2.17.1-1.diff ├── npkit_launcher.sh ├── npkit_runner.sh └── npkit_trace_generator.py ├── npkit_result_example.png └── rccl_samples ├── README.md ├── npkit_launcher.sh ├── npkit_runner.sh └── npkit_trace_generator.py /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. Microsoft makes certain 5 | open source code available at https://3rdpartysource.microsoft.com, or you may 6 | send a check or money order for US $5.00, including the product name, the open 7 | source component name, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the 16 | extent required to debug changes to any libraries licensed under the GNU Lesser 17 | General Public License. 18 | 19 | NCCL 20 | 21 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 22 | 23 | Redistribution and use in source and binary forms, with or without 24 | modification, are permitted provided that the following conditions 25 | are met: 26 | * Redistributions of source code must retain the above copyright 27 | notice, this list of conditions and the following disclaimer. 28 | * Redistributions in binary form must reproduce the above copyright 29 | notice, this list of conditions and the following disclaimer in the 30 | documentation and/or other materials provided with the distribution. 31 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 32 | Laboratory, the U.S. Department of Energy, nor the names of their 33 | contributors may be used to endorse or promote products derived 34 | from this software without specific prior written permission. 35 | 36 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 37 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 38 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 39 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 40 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 41 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 42 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 43 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 44 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 45 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 46 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 47 | 48 | The U.S. Department of Energy funded the development of this software 49 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 50 | 51 | 52 | This code also includes files from the NVIDIA Tools Extension SDK project. 53 | 54 | See: 55 | 56 | https://github.com/NVIDIA/NVTX 57 | 58 | for more information and license details. 59 | 60 | RCCL 61 | 62 | Attributions 63 | 64 | Contains contributions from NVIDIA. 65 | 66 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 67 | copyright = 'Copyright 2019-2021 Advanced Micro Devices'. 68 | 2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2021 Advanced Micro Devices. 
69 | 70 | Redistribution and use in source and binary forms, with or without 71 | modification, are permitted provided that the following conditions 72 | are met: 73 | * Redistributions of source code must retain the above copyright 74 | notice, this list of conditions and the following disclaimer. 75 | * Redistributions in binary form must reproduce the above copyright 76 | notice, this list of conditions and the following disclaimer in the 77 | documentation and/or other materials provided with the distribution. 78 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 79 | Laboratory, the U.S. Department of Energy, nor the names of their 80 | contributors may be used to endorse or promote products derived 81 | from this software without specific prior written permission. 82 | 83 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 84 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 85 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 86 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 87 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 88 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 89 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 90 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 91 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 92 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 93 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 94 | 95 | The U.S. Department of Energy funded the development of this software 96 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 97 | 98 | 99 | This code also includes files from the NVIDIA Tools Extension SDK project. 100 | 101 | See: 102 | 103 | https://github.com/NVIDIA/NVTX 104 | 105 | for more information and license details. 106 | 107 | Notices and Licenses file 108 | _______________________________________________________________ 109 | 110 | Dependencies on nvidia-nccl v2.3.7-1 (BSD3) 111 | Copyright (c) 2015-2018, NVIDIA CORPORATION. 112 | Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. 113 | 114 | Redistribution and use in source and binary forms, with or without 115 | modification, are permitted provided that the following conditions 116 | are met: 117 | * Redistributions of source code must retain the above copyright 118 | notice, this list of conditions and the following disclaimer. 119 | * Redistributions in binary form must reproduce the above copyright 120 | notice, this list of conditions and the following disclaimer in the 121 | documentation and/or other materials provided with the distribution. 122 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 123 | Laboratory, the U.S. Department of Energy, nor the names of their 124 | contributors may be used to endorse or promote products derived 125 | from this software without specific prior written permission. 126 | 127 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 128 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 129 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 130 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 131 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 132 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 133 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 134 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 135 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 136 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 137 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 138 | 139 | The U.S. Department of Energy funded the development of this software 140 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 141 | 142 | 143 | nvidia-nccl v2.3.7-1 (BSD2) 144 | Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 145 | 146 | Redistribution and use in source and binary forms, with or without 147 | modification, are permitted provided that the following conditions 148 | are met: 149 | * Redistributions of source code must retain the above copyright 150 | notice, this list of conditions and the following disclaimer. 151 | * Redistributions in binary form must reproduce the above copyright 152 | notice, this list of conditions and the following disclaimer in the 153 | documentation and/or other materials provided with the distribution. 154 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 155 | Laboratory, the U.S. Department of Energy, nor the names of their 156 | contributors may be used to endorse or promote products derived 157 | from this software without specific prior written permission. 158 | 159 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 160 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 161 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 162 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 163 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 164 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 165 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 166 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 167 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 168 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 169 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 170 | 171 | The U.S. Department of Energy funded the development of this software 172 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 173 | 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | NPKit (Networking Profiling Kit) is a profiling framework designed for popular collective communication libraries (CCLs), including [Microsoft MSCCL](https://github.com/Azure/msccl/), [Microsoft MSCCL++](https://github.com/microsoft/mscclpp/), [NVIDIA NCCL](https://github.com/NVIDIA/nccl) and [AMD RCCL](https://github.com/ROCmSoftwarePlatform/rccl/). It enables users to insert customized profiling events into different CCL components, especially into giant GPU kernels. 
These events are then automatically placed onto a unified timeline in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview), which users can load into a trace viewer to understand CCLs' workflow and performance. 4 | 5 | NPKit is easy to use. It works with any workload that uses CCLs. Users only need to dynamically link their workload binary against a CCL built with NPKit enabled, and the unified timeline with profiling events is generated automatically. 6 | 7 | NPKit is lightweight. During each run, users can choose to enable only the profiling events they care about to minimize the overhead caused by NPKit. 8 | 9 | Below is an example of an NPKit timeline result. Green blocks are LL128 data transfer times on the GPU, and each line represents an independent data flow (typically mapped to a channel or thread block). Red/purple blocks are net send/recv times on the CPU. Each block contains other attributes, including data size, channel ID, etc. 10 | 11 | ![NPKit Result Example](./npkit_result_example.png) 12 | 13 | ## Quick Start 14 | 15 | Please check `msccl_samples` for the MSCCL quick start, `mscclpp_samples` for the MSCCL++ quick start, `nccl_samples` for the NCCL quick start and `rccl_samples` for the RCCL quick start. 16 | 17 | ## Trademarks 18 | 19 | This project may contain trademarks or logos for projects, products, or services. 20 | Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 21 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 22 | Any use of third-party trademarks or logos is subject to those third parties' policies. 23 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message.
Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses [GitHub Issues] to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new issue. 8 | 9 | For help and questions about using this project, please create a new post in [GitHub Discussions]. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | 15 | [GitHub Issues]: https://github.com/microsoft/npkit/issues 16 | [GitHub Discussions]: https://github.com/microsoft/npkit/discussions 17 | -------------------------------------------------------------------------------- /msccl_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This folder contains scripts for the NPKit sample workflow for [MSCCL](https://github.com/Azure/msccl). The sample workflow first builds MSCCL with NPKit enabled, then runs msccl-tests-nccl to collect NPKit event dump files, and finally generates the NPKit trace file. 4 | 5 | ## Dependencies 6 | 7 | [MSCCL executor](https://github.com/Azure/msccl-executor-nccl) (with NPKit integrated) and [MSCCL tests](https://github.com/Azure/msccl-tests-nccl). 8 | 9 | ## Usage 10 | 11 | 1) Make sure the parameters in `npkit_launcher.sh` are valid. Also note that currently NPKit only supports collecting non-overlapped events on the GPU, and `NPKIT_FLAGS` should follow this rule. 12 | 13 | 2) Make sure the `msccl_test` function in `npkit_runner.sh` issues a valid command to run the `msccl-tests-nccl` binary. Also note that currently NPKit only supports 1 GPU per process, so `-g 1` mode is required in `msccl-tests-nccl` commands. 14 | 15 | 3) Run the command `bash npkit_launcher.sh`. After the run finishes, the dump, trace and result directories appear under the `NPKIT_RUN_DIR` set in `npkit_launcher.sh`, as sketched below.
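The following is a minimal sketch of how to inspect the outputs of step 3. The directory names come from `npkit_runner.sh` and the dump file patterns from `npkit_trace_generator.py`; the `NPKIT_RUN_DIR` value and the exact files listed are examples only, so adjust them to your run.

```
# npkit_runner.sh creates three subdirectories under NPKIT_RUN_DIR, keyed by a per-run tag:
#   npkit_dump/   - raw per-rank dumps (gpu_events_rank_*_buf_*, cpu_events_rank_*_channel_*, clock files)
#   npkit_trace/  - the generated npkit_event_trace.json
#   npkit_result/ - the msccl-tests-nccl log.txt and npkit_result.tar.gz
$ export NPKIT_RUN_DIR="/mnt/npkit_run"   # same value as in npkit_launcher.sh
$ find "${NPKIT_RUN_DIR}/npkit_dump" -type f | head
$ find "${NPKIT_RUN_DIR}/npkit_result" -name npkit_result.tar.gz
```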
16 | 17 | 4) The generated trace file `npkit_event_trace.json` (zipped in `npkit_result.tar.gz`) is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers. 18 | -------------------------------------------------------------------------------- /msccl_samples/npkit_launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # MSCCL source directory. 7 | export MSCCL_SRC_DIR="/mnt/msccl-executor-nccl" 8 | 9 | # NPKit source directory. 10 | export NPKIT_SRC_DIR="/mnt/npkit" 11 | 12 | # Path to msccl-tests-nccl binary being profiled. 13 | export NCCL_TEST_BIN="/mnt/msccl-tests-nccl/build/all_reduce_perf" 14 | # export NCCL_TEST_BIN="/mnt/msccl-tests-nccl/build/alltoall_perf" 15 | 16 | # NPKit runtime directory, used to store logs and results. 17 | export NPKIT_RUN_DIR="/mnt/npkit_run" 18 | 19 | # Message size of MSCCL operation. 20 | export MSCCL_MSG_SIZE="16K" 21 | 22 | # MSCCL communication algorithm. 23 | export MSCCL_ALGO="Ring" 24 | # export MSCCL_ALGO="Tree,MSCCL" 25 | 26 | # MSCCL communication protocol. Simple and LL are supported. 27 | # export MSCCL_PROTO="Simple" 28 | export MSCCL_PROTO="LL" 29 | # export MSCCL_PROTO="LL128" 30 | 31 | # Number of msccl-tests-nccl warmups. 32 | export MSCCL_NUM_WARMUPS="0" 33 | 34 | # Number of msccl-tests-nccl iterations. 35 | export MSCCL_NUM_ITERS="10" 36 | 37 | NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" 38 | NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" 39 | 40 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" 41 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" 42 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" 43 | 44 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" 45 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" 46 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" 47 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" 48 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" 49 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" 50 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" 51 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" 52 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" 53 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 
-DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" 54 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" 55 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" 56 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" 57 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" 58 | 59 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" 60 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" 61 | 62 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" 63 | export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" 64 | 65 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" 66 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" 67 | 68 | # export NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" 69 | 70 | bash npkit_runner.sh 71 | -------------------------------------------------------------------------------- /msccl_samples/npkit_runner.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # Function that runs msccl-tests-nccl and collect NPKit traces. 7 | # msccl_test 8 | # 9 | # 10 | function msccl_test() { 11 | mpirun --allow-run-as-root \ 12 | -map-by ppr:8:node --bind-to numa \ 13 | -x LD_PRELOAD=$2/build/lib/libnccl.so:$LD_PRELOAD \ 14 | -x NCCL_DEBUG=WARN \ 15 | -x NCCL_ALGO=$4 \ 16 | -x NCCL_PROTO=$5 \ 17 | -x NPKIT_DUMP_DIR=$8 \ 18 | $1 -b $3 -e $3 -f 2 -g 1 -c 1 -w $6 -n $7 | tee $9/log.txt 19 | } 20 | 21 | # Tag of this NPKit run. 22 | npkit_run_tag=`basename ${NCCL_TEST_BIN}`"/${msg_size}/${MSCCL_ALGO}/${MSCCL_PROTO}" 23 | 24 | # Path to NPKit dump directory. 25 | npkit_dump_dir="${NPKIT_RUN_DIR}/npkit_dump/${npkit_run_tag}" 26 | 27 | # Path to NPKit post-process directory. 28 | npkit_trace_dir="${NPKIT_RUN_DIR}/npkit_trace/${npkit_run_tag}" 29 | 30 | # Path to NPKit result directory. 31 | npkit_result_dir="${NPKIT_RUN_DIR}/npkit_result/${npkit_run_tag}" 32 | 33 | # Build MSCCL with NPKit 34 | cd ${MSCCL_SRC_DIR} 35 | make clean 36 | make -j src.build NPKIT_FLAGS="${NPKIT_FLAGS}" 37 | 38 | # Clean existing results 39 | rm -rf ${NPKIT_RUN_DIR} 40 | mkdir -p ${npkit_dump_dir} 41 | mkdir -p ${npkit_trace_dir} 42 | mkdir -p ${npkit_result_dir} 43 | 44 | # Run NPKit on all nodes. 
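# Positional arguments of msccl_test, in the order passed on the next line:
#   $1 = nccl-tests binary (NCCL_TEST_BIN)      $2 = MSCCL source dir (provides build/lib/libnccl.so for LD_PRELOAD)
#   $3 = message size (-b/-e)                   $4 = NCCL_ALGO                  $5 = NCCL_PROTO
#   $6 = number of warmups (-w)                 $7 = number of iterations (-n)
#   $8 = NPKIT_DUMP_DIR for raw event dumps     $9 = result dir for log.txt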
45 | msccl_test ${NCCL_TEST_BIN} ${MSCCL_SRC_DIR} ${MSCCL_MSG_SIZE} ${MSCCL_ALGO} ${MSCCL_PROTO} ${MSCCL_NUM_WARMUPS} ${MSCCL_NUM_ITERS} ${npkit_dump_dir} ${npkit_result_dir} 46 | 47 | # Generate trace file 48 | cd ${NPKIT_SRC_DIR}/msccl_samples 49 | python3 npkit_trace_generator.py --npkit_dump_dir=${npkit_dump_dir} --npkit_event_header_path=${MSCCL_SRC_DIR}/src/include/npkit/npkit_event.h --output_dir=${npkit_trace_dir} 50 | cd ${npkit_trace_dir} 51 | tar cvzf npkit_result.tar.gz npkit_event_trace.json 52 | mv npkit_result.tar.gz ${npkit_result_dir} 53 | -------------------------------------------------------------------------------- /msccl_samples/npkit_trace_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | import json 7 | 8 | from queue import Queue 9 | 10 | def parse_npkit_event_header(npkit_event_header_path): 11 | npkit_event_def = {'id_to_type': {}, 'type_to_id': {}} 12 | with open(npkit_event_header_path, 'r') as f: 13 | lines = [x.strip() for x in f.readlines() if len(x.strip()) != 0] 14 | line_idx = 0 15 | while line_idx < len(lines): 16 | if lines[line_idx].startswith('#define NPKIT_EVENT_'): 17 | fields = lines[line_idx].split() 18 | if len(fields) == 3: 19 | event_type = fields[1] 20 | event_id = int(fields[2], 0) 21 | npkit_event_def['type_to_id'][event_type] = event_id 22 | npkit_event_def['id_to_type'][event_id] = event_type 23 | line_idx += 1 24 | return npkit_event_def 25 | 26 | def parse_gpu_clock_scale(gpu_clock_file_path): 27 | with open(gpu_clock_file_path, 'r') as f: 28 | freq_in_khz = f.read() 29 | return float(freq_in_khz) * 1e3 / 1e6 30 | 31 | def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): 32 | with open(cpu_clock_num_file_path, 'r') as f: 33 | num = float(f.read()) 34 | with open(cpu_clock_den_file_path, 'r') as f: 35 | den = float(f.read()) 36 | return den / num / 1e6 37 | 38 | def parse_gpu_event(event_bytes): 39 | return { 40 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 41 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 42 | 'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 43 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 44 | } 45 | 46 | def parse_cpu_event(event_bytes): 47 | return { 48 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 49 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 50 | 'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 51 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 52 | } 53 | 54 | def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): 55 | gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) 56 | raw_event_size = 16 57 | curr_cpu_base_time = None 58 | curr_gpu_base_time = None 59 | gpu_events = [] 60 | event_type_to_seq = {} 61 | with open(gpu_event_file_path, 'rb') as f: 62 | raw_content = f.read() 63 | raw_content_size = len(raw_content) 64 | raw_content_idx = 0 65 | while raw_content_idx < raw_content_size: 66 | parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 67 | if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 
'NPKIT_EVENT_TIME_SYNC_CPU': 68 | curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale 69 | curr_gpu_base_time = None 70 | elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': 71 | if curr_gpu_base_time is None: 72 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 73 | else: 74 | if curr_gpu_base_time is None: 75 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 76 | event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] 77 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 78 | gpu_events.append({ 79 | 'ph': phase, 80 | 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, 81 | 'pid': rank, 82 | 'tid': buf_idx + 1 83 | }) 84 | if phase == 'B': 85 | if event_type not in event_type_to_seq: 86 | event_type_to_seq[event_type] = 0 87 | gpu_events[-1].update({ 88 | 'name': event_type, 89 | 'cat': 'GPU', 90 | 'args': { 91 | 'rank': rank, 92 | 'buf_idx': buf_idx, 93 | 'seq': event_type_to_seq[event_type], 94 | 'rsvd_0': parsed_gpu_event['rsvd'], 95 | 'size_0': parsed_gpu_event['size'] 96 | } 97 | }) 98 | event_type_to_seq[event_type] += 1 99 | else: 100 | gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} 101 | delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] 102 | gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3 103 | raw_content_idx += raw_event_size 104 | return gpu_events 105 | 106 | def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): 107 | cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) 108 | raw_event_size = 16 109 | cpu_events = [] 110 | event_type_to_seq = {} 111 | 112 | fiber_is_usable = [] 113 | fiber_open_ts = [] 114 | slot_to_fiber_id = {} 115 | channel_shift = 1000 116 | 117 | with open(cpu_event_file_path, 'rb') as f: 118 | raw_content = f.read() 119 | raw_content_size = len(raw_content) 120 | raw_content_idx = 0 121 | while raw_content_idx < raw_content_size: 122 | parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 123 | event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']] 124 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 125 | cpu_events.append({ 126 | 'ph': phase, 127 | 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 128 | 'pid': rank 129 | }) 130 | slot = parsed_cpu_event['slot'] 131 | if phase == 'B': 132 | # Open fiber event 133 | fiber_id = 0 134 | while fiber_id < len(fiber_is_usable): 135 | if fiber_is_usable[fiber_id]: 136 | break 137 | fiber_id += 1 138 | if fiber_id == len(fiber_is_usable): 139 | fiber_is_usable.append(True) 140 | fiber_open_ts.append(0.0) 141 | slot_to_fiber_id[slot] = fiber_id 142 | fiber_open_ts[fiber_id] = cpu_events[-1]['ts'] 143 | fiber_is_usable[fiber_id] = False 144 | 145 | if event_type not in event_type_to_seq: 146 | event_type_to_seq[event_type] = 0 147 | cpu_events[-1].update({ 148 | 'name': event_type, 149 | 'cat': 'CPU', 150 | 'args': { 151 | 'rank': rank, 152 | 'channel': channel, 153 | 'slot': parsed_cpu_event['slot'], 154 | 'seq': event_type_to_seq[event_type], 155 | 'size_0': parsed_cpu_event['size'] 156 | } 157 | }) 158 | event_type_to_seq[event_type] += 1 159 | else: 160 | # Close fiber event 161 | fiber_id = slot_to_fiber_id[slot] 162 | slot_to_fiber_id.pop(slot) 163 | last_ts = 
fiber_open_ts[fiber_id] 164 | fiber_is_usable[fiber_id] = True 165 | 166 | delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts) 167 | cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']} 168 | cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else cpu_events[-1]['args']['size'] / delta_time / 1e3 169 | 170 | cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift 171 | 172 | raw_content_idx += raw_event_size 173 | return cpu_events 174 | 175 | def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): 176 | files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] 177 | gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] 178 | cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')] 179 | 180 | ranks = list(set([int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files])) 181 | buf_indices = list(set([int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files])) 182 | channels = list(set([int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files])) 183 | 184 | trace = {'traceEvents': []} 185 | 186 | for rank in ranks: 187 | cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank) 188 | cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank) 189 | cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path) 190 | 191 | gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) 192 | gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) 193 | 194 | for buf_idx in buf_indices: 195 | gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) 196 | trace['traceEvents'].extend(gpu_events) 197 | 198 | for channel in channels: 199 | cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) 200 | trace['traceEvents'].extend(cpu_events) 201 | 202 | trace['traceEvents'].sort(key=lambda x : x['ts']) 203 | trace['displayTimeUnit'] = 'ns' 204 | 205 | os.makedirs(output_dir, exist_ok=True) 206 | with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f: 207 | json.dump(trace, f) 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser() 211 | parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.') 212 | parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.') 213 | parser.add_argument('--output_dir', type=str, required=True, help='Path to output directory.') 214 | args = parser.parse_args() 215 | 216 | npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path) 217 | convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def) 218 | -------------------------------------------------------------------------------- /mscclpp_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This file describes for NPKit sample workflow for [MSCCL++](https://github.com/microsoft/mscclpp). The sample workflow first builds MSCCL++ with NPKit enabled, then runs MSCCL++ executor test to collect NPKit event dump files, and finally generates NPKit trace file. 4 | 5 | ## Dependencies 6 | 7 | [MSCCL++](https://github.com/microsoft/mscclpp) (with NPKit integrated). 8 | 9 | ## Usage 10 | 11 | 1) Build MSCCL++ with NPKit enabled. 
12 | 13 | ``` 14 | $ git clone https://github.com/microsoft/mscclpp && cd mscclpp 15 | $ mkdir build && cd build 16 | $ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_LOCAL_GPU_TARGET_ONLY=ON -DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT" .. && make -j 17 | ``` 18 | 19 | 2) Create a directory to store NPKit dump files and trace files. 20 | 21 | ``` 22 | $ mkdir /path/to/npkit_dump 23 | $ mkdir /path/to/npkit_trace 24 | ``` 25 | 26 | 3) Run the MSCCL++ executor test with `NPKIT_DUMP_DIR` specified. 27 | 28 | ``` 29 | $ mpirun -tag-output -np 2 -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NPKIT_DUMP_DIR=/path/to/npkit_dump -x LD_PRELOAD=/path/to/mscclpp/build/libmscclpp.so:$LD_PRELOAD /path/to/mscclpp/build/test/executor_test 1024 allreduce_pairs /path/to/mscclpp/test/execution-files/allreduce_packet.json 1024 10 1 LL8 30 | ``` 31 | 32 | 4) Run the NPKit trace parsing script to generate the trace file. 33 | 34 | ``` 35 | $ python3 /path/to/mscclpp/tools/npkit/npkit_trace_generator.py --npkit_dump_dir=/path/to/npkit_dump --npkit_event_header_path=/path/to/mscclpp/include/mscclpp/npkit/npkit_event.hpp --output_dir=/path/to/npkit_trace 36 | ``` 37 | 38 | 5) The generated trace file `npkit_event_trace.json` is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers. 39 | -------------------------------------------------------------------------------- /nccl_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Important Note 2 | 3 | We highly recommend using [msccl_samples](https://github.com/microsoft/NPKit/tree/main/msccl_samples) to profile NCCL, because [MSCCL](https://github.com/Azure/msccl) includes all NCCL functions, has NPKit already integrated, and is actively maintained by Azure. The patch for NCCL in this folder is not actively maintained. 4 | 5 | ## Introduction 6 | 7 | This folder contains scripts for the NPKit sample workflow for NCCL. The sample workflow first builds NCCL with NPKit enabled, then runs nccl-tests to collect NPKit event dump files, and finally generates the NPKit trace file. 8 | 9 | ## Dependencies 10 | 11 | [NCCL 2.17.1-1](https://github.com/nvidia/nccl/tree/v2.17.1-1) and [nccl-tests](https://github.com/nvidia/nccl-tests). 12 | 13 | ## Usage 14 | 15 | 1) Get NCCL version 2.17.1-1 and apply `npkit-for-nccl-2.17.1-1.diff` to the source repo (a minimal sketch is shown after step 5). 16 | 17 | 2) Make sure the parameters in `npkit_launcher.sh` are valid. Also note that currently NPKit only supports collecting non-overlapped events on the GPU, and `NPKIT_FLAGS` should follow this rule. 18 | 19 | 3) Make sure the `nccl_test` function in `npkit_runner.sh` issues a valid command to run the `nccl-tests` binary. Also note that currently NPKit only supports 1 GPU per process, so `-g 1` mode is required in `nccl-tests` commands. 20 | 21 | 4) Run the command `bash npkit_launcher.sh`. 22 | 23 | 5) The generated trace file `npkit_event_trace.json` (zipped in `npkit_result.tar.gz`) is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers.
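As a concrete illustration of step 1, here is a minimal sketch of fetching NCCL 2.17.1-1, applying the patch, and building with a small event set enabled. The clone location, the `/path/to/...` placeholder, and the particular `NPKIT_FLAGS` selection are examples only (any non-overlapping event set defined by the patch works); when the build is driven through `npkit_launcher.sh` in step 4, `NPKIT_FLAGS` is taken from that script instead.

```
$ git clone -b v2.17.1-1 https://github.com/NVIDIA/nccl.git && cd nccl
$ git apply /path/to/NPKit/nccl_samples/npkit-for-nccl-2.17.1-1.diff
# The patch adds $(NPKIT_FLAGS) to CXXFLAGS/NVCUFLAGS, so profiling events are selected at build time.
$ make -j src.build NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT"
```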
24 | -------------------------------------------------------------------------------- /nccl_samples/npkit-for-nccl-2.17.1-1.diff: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | diff --git a/makefiles/common.mk b/makefiles/common.mk 5 | index 35d1826..d8ac620 100644 6 | --- a/makefiles/common.mk 7 | +++ b/makefiles/common.mk 8 | @@ -57,12 +57,12 @@ $(info NVCC_GENCODE is ${NVCC_GENCODE}) 9 | 10 | CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ 11 | -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ 12 | - -I $(CUDA_INC) \ 13 | + -I $(CUDA_INC) $(NPKIT_FLAGS) \ 14 | $(CXXFLAGS) 15 | # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) 16 | # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 17 | # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 18 | -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all 19 | +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all $(NPKIT_FLAGS) 20 | # Use addprefix so that we can specify more than one path 21 | NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt 22 | 23 | diff --git a/src/Makefile b/src/Makefile 24 | index ca5ddce..b75e961 100644 25 | --- a/src/Makefile 26 | +++ b/src/Makefile 27 | @@ -12,7 +12,7 @@ INCEXPORTS := nccl.h nccl_net.h 28 | LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \ 29 | misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \ 30 | misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ 31 | - misc/ipcsocket.cc \ 32 | + misc/ipcsocket.cc misc/npkit.cc \ 33 | transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \ 34 | collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ 35 | graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc 36 | diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h 37 | index f51eb43..f6c8022 100644 38 | --- a/src/collectives/device/all_reduce.h 39 | +++ b/src/collectives/device/all_reduce.h 40 | @@ -8,6 +8,10 @@ 41 | #include "collectives.h" 42 | #include "primitives.h" 43 | 44 | +#if defined(ENABLE_NPKIT) 45 | +#include "npkit/npkit.h" 46 | +#endif 47 | + 48 | namespace { 49 | template 50 | __device__ __forceinline__ void runRing(ncclWorkElem *args) { 51 | @@ -22,6 +26,32 @@ namespace { 52 | const ssize_t loopSize = nChannels*nranks*chunkSize; 53 | const ssize_t size = args->count; 54 | 55 | +#if defined(ENABLE_NPKIT) 56 | + int npKitCtxIdx = bid; 57 | +#endif 58 | + 59 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 60 | + if (tid == 0) { 61 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 62 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 63 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 64 | + } 65 | +#endif 66 | + 67 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 
68 | + if (tid == 0) { 69 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 70 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 71 | + } 72 | +#endif 73 | + 74 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY) 75 | + if (tid == 0) { 76 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, clock64(), 77 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 78 | + } 79 | +#endif 80 | + 81 | int minChunkSize; 82 | if (Proto::Id == NCCL_PROTO_LL) 83 | minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); 84 | @@ -33,6 +63,12 @@ namespace { 85 | Primitives, 1, Proto, 0> prims 86 | (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); 87 | 88 | +#if defined(ENABLE_NPKIT) 89 | + if (tid == 0) { 90 | + prims.npKitCtxIdx = npKitCtxIdx; 91 | + } 92 | +#endif 93 | + 94 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 95 | ssize_t realChunkSize; 96 | if (Proto::Id == NCCL_PROTO_SIMPLE) { 97 | @@ -61,9 +97,34 @@ namespace { 98 | chunk = modRanks(ringIx + nranks-1); 99 | offset = calcOffset(chunk); 100 | nelem = min(realChunkSize, size-offset); 101 | + 102 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY) 103 | + if (tid == 0) { 104 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, clock64(), 105 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 106 | + prims.npKitDataProcessTotalTime = 0; 107 | + } 108 | +#endif 109 | + 110 | prims.send(offset, nelem); 111 | 112 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT) 113 | + if (tid == 0) { 114 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 115 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 116 | + } 117 | +#endif 118 | + 119 | // k-2 steps: reduce and copy to next GPU 120 | + 121 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY) 122 | + if (tid == 0 && nranks > 2) { 123 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, clock64(), 124 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 125 | + prims.npKitDataProcessTotalTime = 0; 126 | + } 127 | +#endif 128 | + 129 | for (int j=2; j 2) { 138 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 139 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 140 | + } 141 | +#endif 142 | + 143 | // step k-1: reduce this buffer and data, which will produce the final 144 | // result that we store in this data and push to the next GPU 145 | chunk = ringIx + 0; 146 | offset = calcOffset(chunk); 147 | nelem = min(realChunkSize, size-offset); 148 | + 149 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY) 150 | + if (tid == 0) { 151 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, clock64(), 152 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 153 | + prims.npKitDataProcessTotalTime = 0; 154 | + } 155 | +#endif 156 | + 157 | prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true); 158 | 159 | +#if defined(ENABLE_NPKIT) && 
defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT) 160 | + if (tid == 0) { 161 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 162 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 163 | + } 164 | +#endif 165 | + 166 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY) 167 | + if (tid == 0 && nranks > 2) { 168 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, clock64(), 169 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 170 | + prims.npKitDataProcessTotalTime = 0; 171 | + } 172 | +#endif 173 | + 174 | // k-2 steps: copy to next GPU 175 | for (int j=1; j 2) { 183 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 184 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 185 | + } 186 | +#endif 187 | + 188 | // Make final copy from buffer to dest. 189 | chunk = modRanks(ringIx + 1); 190 | offset = calcOffset(chunk); 191 | nelem = min(realChunkSize, size-offset); 192 | + 193 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY) 194 | + if (tid == 0) { 195 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, clock64(), 196 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 197 | + prims.npKitDataProcessTotalTime = 0; 198 | + } 199 | +#endif 200 | + 201 | prims.directRecv(offset, nelem); 202 | + 203 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT) 204 | + if (tid == 0) { 205 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 206 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 207 | + } 208 | +#endif 209 | + 210 | } 211 | + 212 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT) 213 | + if (tid == 0) { 214 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, clock64(), 215 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 216 | + } 217 | +#endif 218 | + 219 | } 220 | 221 | template 222 | @@ -110,12 +234,53 @@ namespace { 223 | const ssize_t loopSize = int(nChannels*chunkSize); 224 | const ssize_t size = args->count; 225 | 226 | +#if defined(ENABLE_NPKIT) 227 | + int npKitCtxIdx = bid; 228 | +#endif 229 | + 230 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 231 | + if (tid == 0) { 232 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 233 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 234 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 235 | + } 236 | +#endif 237 | + 238 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 239 | + if (tid == 0) { 240 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 241 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 242 | + } 243 | +#endif 244 | + 245 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY) 246 | + if (tid == 0) { 247 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, clock64(), 248 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 249 | + } 250 | +#endif 251 | + 252 | if 
(loopSize > size) 253 | chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize); 254 | 255 | { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) 256 | Primitives, /*Direct=*/0, Proto, 0> prims 257 | (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); 258 | + 259 | +#if defined(ENABLE_NPKIT) 260 | + if (tid == 0) { 261 | + prims.npKitCtxIdx = npKitCtxIdx; 262 | + } 263 | +#endif 264 | + 265 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY) 266 | + if (tid == 0) { 267 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, clock64(), 268 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 269 | + prims.npKitDataProcessTotalTime = 0; 270 | + } 271 | +#endif 272 | + 273 | if (tree->up == -1) { 274 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 275 | ssize_t offset = gridOffset + bid*int(chunkSize); 276 | @@ -137,11 +302,34 @@ namespace { 277 | prims.recvReduceSend(offset, nelem); 278 | } 279 | } 280 | + 281 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT) 282 | + if (tid == 0) { 283 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 284 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 285 | + } 286 | +#endif 287 | + 288 | } 289 | 290 | { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) 291 | Primitives, /*Direct=*/1, Proto, 0> prims 292 | (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); 293 | + 294 | +#if defined(ENABLE_NPKIT) 295 | + if (tid == 0) { 296 | + prims.npKitCtxIdx = npKitCtxIdx; 297 | + } 298 | +#endif 299 | + 300 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY) 301 | + if (tid == 0) { 302 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, clock64(), 303 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 304 | + prims.npKitDataProcessTotalTime = 0; 305 | + } 306 | +#endif 307 | + 308 | if (tree->up == -1) { 309 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 310 | ssize_t offset = gridOffset + bid*int(chunkSize); 311 | @@ -163,7 +351,23 @@ namespace { 312 | prims.directRecvCopySend(offset, offset, nelem); 313 | } 314 | } 315 | + 316 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT) 317 | + if (tid == 0) { 318 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 319 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 320 | + } 321 | +#endif 322 | + 323 | + } 324 | + 325 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT) 326 | + if (tid == 0) { 327 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, clock64(), 328 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 329 | } 330 | +#endif 331 | + 332 | } 333 | 334 | template 335 | @@ -193,6 +397,40 @@ namespace { 336 | nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE; 337 | } 338 | 339 | +#if defined(ENABLE_NPKIT) 340 | + bool isNpKitThread = false; 341 | + int npKitCtxIdx = 0; 342 | + if (threadIdx.x == 0) { 343 | + isNpKitThread = true; 344 | + npKitCtxIdx 
= bid * 2; 345 | + } else if (tree->up != -1 && threadIdx.x == nthreadsSplit) { 346 | + isNpKitThread = true; 347 | + npKitCtxIdx = bid * 2 + 1; 348 | + } 349 | +#endif 350 | + 351 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 352 | + if (isNpKitThread) { 353 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 354 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 355 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 356 | + } 357 | +#endif 358 | + 359 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 360 | + if (isNpKitThread) { 361 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 362 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 363 | + } 364 | +#endif 365 | + 366 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY) 367 | + if (isNpKitThread) { 368 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, clock64(), 369 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 370 | + } 371 | +#endif 372 | + 373 | if (loopSize > size) 374 | chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize); 375 | 376 | @@ -200,11 +438,34 @@ namespace { 377 | // Reduce and broadcast. Max number of recv is 3, max number of send is 3 378 | Primitives, /*Direct=*/1, Proto, 0> 379 | prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); 380 | + 381 | +#if defined(ENABLE_NPKIT) 382 | + if (isNpKitThread) { 383 | + prims.npKitCtxIdx = npKitCtxIdx; 384 | + } 385 | +#endif 386 | + 387 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY) 388 | + if (isNpKitThread) { 389 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, clock64(), 390 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 391 | + prims.npKitDataProcessTotalTime = 0; 392 | + } 393 | +#endif 394 | + 395 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 396 | ssize_t offset = gridOffset + bid*int(chunkSize); 397 | int nelem = min(chunkSize, size-offset); 398 | prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true); 399 | } 400 | + 401 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT) 402 | + if (isNpKitThread) { 403 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 404 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 405 | + } 406 | +#endif 407 | + 408 | } 409 | else if (tid < nthreadsSplit) { 410 | /* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local). 
411 | @@ -217,6 +478,21 @@ namespace { 412 | */ 413 | Primitives, /*Direct=*/1, Proto, 0> 414 | prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); 415 | + 416 | +#if defined(ENABLE_NPKIT) 417 | + if (isNpKitThread) { 418 | + prims.npKitCtxIdx = npKitCtxIdx; 419 | + } 420 | +#endif 421 | + 422 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY) 423 | + if (isNpKitThread) { 424 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, clock64(), 425 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 426 | + prims.npKitDataProcessTotalTime = 0; 427 | + } 428 | +#endif 429 | + 430 | if (tree->down[0] == -1) { 431 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 432 | ssize_t offset = gridOffset + bid*int(chunkSize); 433 | @@ -231,11 +507,34 @@ namespace { 434 | prims.recvReduceSend(offset, nelem); 435 | } 436 | } 437 | + 438 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT) 439 | + if (isNpKitThread) { 440 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 441 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 442 | + } 443 | +#endif 444 | + 445 | } 446 | else { 447 | // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) 448 | Primitives, /*Direct=*/1, Proto, 0> 449 | prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth); 450 | + 451 | +#if defined(ENABLE_NPKIT) 452 | + if (isNpKitThread) { 453 | + prims.npKitCtxIdx = npKitCtxIdx; 454 | + } 455 | +#endif 456 | + 457 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY) 458 | + if (isNpKitThread) { 459 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, clock64(), 460 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 461 | + prims.npKitDataProcessTotalTime = 0; 462 | + } 463 | +#endif 464 | + 465 | if (tree->down[0] == -1) { 466 | for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { 467 | ssize_t offset = gridOffset + bid*int(chunkSize); 468 | @@ -250,7 +549,23 @@ namespace { 469 | prims.directRecvCopySend(offset, offset, nelem); 470 | } 471 | } 472 | + 473 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT) 474 | + if (isNpKitThread) { 475 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 476 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 477 | + } 478 | +#endif 479 | + 480 | } 481 | + 482 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT) 483 | + if (isNpKitThread) { 484 | + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, clock64(), 485 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 486 | + } 487 | +#endif 488 | + 489 | } 490 | } 491 | 492 | diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h 493 | index c43f1a5..869c6b5 100644 494 | --- a/src/collectives/device/prims_ll.h 495 | +++ b/src/collectives/device/prims_ll.h 496 | @@ -4,6 +4,10 @@ 497 | * See LICENSE.txt for license information 498 | 
************************************************************************/ 499 | 500 | +#if defined(ENABLE_NPKIT) 501 | +#include "npkit/npkit.h" 502 | +#endif 503 | + 504 | template 505 | class Primitives: 506 | public PrimitivesWithoutDirect> { 507 | @@ -36,6 +40,22 @@ class Primitives: 508 | union ncclLLFifoLine* recvBuff[MaxRecv]; 509 | union ncclLLFifoLine* sendBuff[MaxSend]; 510 | 511 | +#if defined(ENABLE_NPKIT) 512 | +public: 513 | + int npKitCtxIdx = 0; 514 | + uint64_t npKitDataProcessEntryTime = 0; 515 | + uint64_t npKitDataProcessExitTime = 0; 516 | + uint64_t npKitDataProcessTotalTime = 0; 517 | +private: 518 | +#endif 519 | + 520 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 521 | + uint64_t npKitWaitRecvDataProcessSize = 0; 522 | + uint64_t npKitWaitRecvEntryTime = 0; 523 | + uint64_t npKitWaitRecvExitTime = 0; 524 | + uint64_t npKitWaitRecvTotalTime = 0; 525 | +#endif 526 | + 527 | inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; } 528 | inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; } 529 | inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } 530 | @@ -62,6 +82,12 @@ class Primitives: 531 | } 532 | 533 | inline __device__ void waitSend(int nbytes) { 534 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY) 535 | + if (tid == 0) { 536 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, clock64(), 537 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 538 | + } 539 | +#endif 540 | if (sendConnHeadPtr) { 541 | int spins = 0; 542 | while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { 543 | @@ -75,6 +101,12 @@ class Primitives: 544 | sendConnHead += 1; 545 | } 546 | barrier(); 547 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT) 548 | + if (tid == 0) { 549 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, clock64(), 550 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 551 | + } 552 | +#endif 553 | } 554 | 555 | inline __device__ void incRecv(int i) { 556 | @@ -99,11 +131,30 @@ class Primitives: 557 | uint32_t flag = recvFlag(i); 558 | uint32_t data1, flag1, data2, flag2; 559 | int spins = 0; 560 | + 561 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 562 | + int npkitWaitRecvSpins = 0; 563 | + if (tid == 0) { 564 | + npKitWaitRecvEntryTime = clock64(); 565 | + } 566 | +#endif 567 | + 568 | do { 569 | asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); 570 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 571 | + npkitWaitRecvSpins++; 572 | +#endif 573 | if (checkAbort(spins, 0)) break; 574 | } while ((flag1 != flag) || (flag2 != flag)); 575 | uint64_t val64 = data1 + (((uint64_t)data2) << 32); 576 | + 577 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 
578 | + if (tid == 0) { 579 | + npKitWaitRecvExitTime = clock64(); 580 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 581 | + } 582 | +#endif 583 | + 584 | return val64; 585 | } 586 | 587 | @@ -121,11 +172,30 @@ class Primitives: 588 | union ncclLLFifoLine* src = recvPtr(i) + offset; 589 | uint32_t flag = recvFlag(i); 590 | int spins = 0; 591 | + 592 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 593 | + int npkitWaitRecvSpins = 0; 594 | + if (tid == 0) { 595 | + npKitWaitRecvEntryTime = clock64(); 596 | + } 597 | +#endif 598 | + 599 | while (line[i].flag1 != flag || line[i].flag2 != flag) { 600 | asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); 601 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 602 | + npkitWaitRecvSpins++; 603 | +#endif 604 | if (checkAbort(spins, 0)) break; 605 | } 606 | uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); 607 | + 608 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 609 | + if (tid == 0) { 610 | + npKitWaitRecvExitTime = clock64(); 611 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 612 | + } 613 | +#endif 614 | + 615 | return val64; 616 | } 617 | 618 | @@ -234,6 +304,22 @@ class Primitives: 619 | nelem = nelem < 0 ? 
0 : nelem; 620 | if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine)); 621 | 622 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) 623 | + if (tid == 0) { 624 | + npKitWaitRecvTotalTime = 0; 625 | + npKitWaitRecvDataProcessSize = nelem*sizeof(T); 626 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY, 627 | + npKitWaitRecvDataProcessSize, 0, clock64(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 628 | + } 629 | +#endif 630 | + 631 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 632 | + if (tid == 0) { 633 | + npKitWaitRecvTotalTime = 0; 634 | + npKitDataProcessEntryTime = clock64(); 635 | + } 636 | +#endif 637 | + 638 | nelem -= tid*EltPerLine; 639 | srcElts += tid*EltPerLine; 640 | dstElts += tid*EltPerLine; 641 | @@ -282,6 +368,21 @@ class Primitives: 642 | offset += nthreads; 643 | } 644 | 645 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 646 | + if (tid == 0) { 647 | + npKitDataProcessExitTime = clock64(); 648 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime; 649 | + } 650 | +#endif 651 | + 652 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) 653 | + if (tid == 0) { 654 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT, 655 | + npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, clock64(), 656 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 657 | + } 658 | +#endif 659 | + 660 | if (RECV) { 661 | for (int i=0; i < MaxRecv; i++) incRecv(i); 662 | postRecv(); 663 | @@ -367,27 +468,123 @@ class Primitives: 664 | } 665 | 666 | __device__ void send(intptr_t inpIx, int eltN) { 667 | - return LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 668 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY) 669 | + if (tid == 0) { 670 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 671 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 672 | + } 673 | +#endif 674 | + LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 675 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT) 676 | + if (tid == 0) { 677 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 678 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 679 | + } 680 | +#endif 681 | } 682 | __device__ void sendFromOutput(intptr_t outIx, int eltN) { 683 | - return LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 684 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY) 685 | + if (tid == 0) { 686 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, clock64(), 687 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 688 | + } 689 | +#endif 690 | + LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 691 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT) 692 | + if (tid == 0) { 693 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, clock64(), 694 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 695 | + } 696 | +#endif 697 | } 698 | __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { 699 | - return LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 700 | +#if 
defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY) 701 | + if (tid == 0) { 702 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, clock64(), 703 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 704 | + } 705 | +#endif 706 | + LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 707 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT) 708 | + if (tid == 0) { 709 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, clock64(), 710 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 711 | + } 712 | +#endif 713 | } 714 | __device__ void recvReduceSend(intptr_t inpIx, int eltN) { 715 | - return LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 716 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY) 717 | + if (tid == 0) { 718 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 719 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 720 | + } 721 | +#endif 722 | + LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 723 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT) 724 | + if (tid == 0) { 725 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 726 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 727 | + } 728 | +#endif 729 | } 730 | __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 731 | - return LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 732 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY) 733 | + if (tid == 0) { 734 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, clock64(), 735 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 736 | + } 737 | +#endif 738 | + LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 739 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT) 740 | + if (tid == 0) { 741 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, clock64(), 742 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 743 | + } 744 | +#endif 745 | } 746 | __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 747 | - return LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 748 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY) 749 | + if (tid == 0) { 750 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 751 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 752 | + } 753 | +#endif 754 | + LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 755 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT) 756 | + if (tid == 0) { 757 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 758 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 759 | + } 760 | +#endif 761 | } 762 | __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { 763 | - return LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); 764 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY) 765 | + if (tid == 0) { 766 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 767 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 768 | + } 769 | +#endif 770 | + LLGenericOp<1, 1, -1, Output>(-1, 
outIx, eltN, postOp); 771 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT) 772 | + if (tid == 0) { 773 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 774 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 775 | + } 776 | +#endif 777 | } 778 | __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 779 | - return LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 780 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY) 781 | + if (tid == 0) { 782 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 783 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 784 | + } 785 | +#endif 786 | + LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 787 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT) 788 | + if (tid == 0) { 789 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 790 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 791 | + } 792 | +#endif 793 | } 794 | }; 795 | diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h 796 | index 8a4570a..ac8fccd 100644 797 | --- a/src/collectives/device/prims_ll128.h 798 | +++ b/src/collectives/device/prims_ll128.h 799 | @@ -5,6 +5,9 @@ 800 | ************************************************************************/ 801 | 802 | #include "op128.h" 803 | +#if defined(ENABLE_NPKIT) 804 | +#include "npkit/npkit.h" 805 | +#endif 806 | 807 | #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) 808 | 809 | @@ -42,6 +45,22 @@ class Primitives: 810 | uint64_t* recvBuff[MaxRecv]; 811 | uint64_t* sendBuff[MaxSend]; 812 | 813 | +#if defined(ENABLE_NPKIT) 814 | +public: 815 | + int npKitCtxIdx = 0; 816 | + uint64_t npKitDataProcessEntryTime = 0; 817 | + uint64_t npKitDataProcessExitTime = 0; 818 | + uint64_t npKitDataProcessTotalTime = 0; 819 | +private: 820 | +#endif 821 | + 822 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 823 | + uint64_t npKitWaitRecvDataProcessSize = 0; 824 | + uint64_t npKitWaitRecvEntryTime = 0; 825 | + uint64_t npKitWaitRecvExitTime = 0; 826 | + uint64_t npKitWaitRecvTotalTime = 0; 827 | +#endif 828 | + 829 | inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } 830 | inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } 831 | inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } 832 | @@ -65,6 +84,12 @@ class Primitives: 833 | } 834 | 835 | inline __device__ void waitSend(int nbytes) { 836 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY) 837 | + if (tid == 0) { 838 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY, nbytes, 0, clock64(), 839 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 840 | + } 841 | +#endif 842 | if (sendConnHeadPtr) { 843 | int spins = 0; 844 | while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { 845 | @@ -76,6 +101,12 @@ class Primitives: 846 | } 847 | sendConnHead += 1; 848 | } 849 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT) 850 | + if (tid == 0) { 851 | + 
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT, nbytes, 0, clock64(), 852 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 853 | + } 854 | +#endif 855 | } 856 | 857 | inline __device__ void postRecv() { 858 | @@ -194,6 +225,14 @@ class Primitives: 859 | uint64_t flag = recvFlag(0); 860 | bool needReload; 861 | int spins = 0; 862 | + 863 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 864 | + int npkitWaitRecvSpins = 0; 865 | + if (tid == 0) { 866 | + npKitWaitRecvEntryTime = clock64(); 867 | + } 868 | +#endif 869 | + 870 | do { 871 | needReload = false; 872 | #pragma unroll 873 | @@ -201,9 +240,21 @@ class Primitives: 874 | load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); 875 | needReload |= flagThread && (vr[u+1] != flag); 876 | } 877 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 878 | + npkitWaitRecvSpins++; 879 | +#endif 880 | needReload &= (0 == checkAbort(spins, 0, 0)); 881 | } while (__any_sync(WARP_MASK, needReload)); 882 | 883 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 884 | + if (tid == 0) { 885 | + npKitWaitRecvExitTime = clock64(); 886 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 887 | + npkitWaitRecvSpins = 0; 888 | + } 889 | +#endif 890 | + 891 | + 892 | #pragma unroll 893 | for (int u=0; u: 896 | uint64_t* ptr = recvPtr(i)+ll128Offset; 897 | bool needReload; 898 | int spins = 0; 899 | + 900 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 901 | + int npkitWaitRecvSpins = 0; 902 | + if (tid == 0) { 903 | + npKitWaitRecvEntryTime = clock64(); 904 | + } 905 | +#endif 906 | + 907 | do { 908 | needReload = false; 909 | #pragma unroll 910 | @@ -246,9 +305,20 @@ class Primitives: 911 | load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); 912 | needReload |= flagThread && (vr[u+1] != flag); 913 | } 914 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 915 | + npkitWaitRecvSpins++; 916 | +#endif 917 | needReload &= (0 == checkAbort(spins, i, 0)); 918 | } while (__any_sync(WARP_MASK, needReload)); 919 | 920 | +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) 921 | + if (tid == 0) { 922 | + npKitWaitRecvExitTime = clock64(); 923 | + npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins; 924 | + npkitWaitRecvSpins = 0; 925 | + } 926 | +#endif 927 | + 928 | #pragma unroll 929 | for (int u=0; u: 932 | 933 | if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t)); 934 | barrier(); 935 | + 936 | +#if defined(ENABLE_NPKIT) && 
defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) 937 | + if (tid == 0) { 938 | + npKitWaitRecvTotalTime = 0; 939 | + npKitWaitRecvDataProcessSize = nelem*sizeof(T); 940 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY, 941 | + npKitWaitRecvDataProcessSize, 0, clock64(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 942 | + } 943 | +#endif 944 | + 945 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 946 | + if (tid == 0) { 947 | + npKitWaitRecvTotalTime = 0; 948 | + npKitDataProcessEntryTime = clock64(); 949 | + } 950 | +#endif 951 | + 952 | nelem -= DataEltPerSlice*warp; 953 | srcPtr += DataEltPerSlice*warp; 954 | dstPtr += DataEltPerSlice*warp; 955 | @@ -322,6 +409,22 @@ class Primitives: 956 | } 957 | 958 | barrier(); 959 | + 960 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 961 | + if (tid == 0) { 962 | + npKitDataProcessExitTime = clock64(); 963 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime; 964 | + } 965 | +#endif 966 | + 967 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) 968 | + if (tid == 0) { 969 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT, 970 | + npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, clock64(), 971 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 972 | + } 973 | +#endif 974 | + 975 | if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1; 976 | if (SEND) postSend(); 977 | if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1; 978 | @@ -408,27 +511,123 @@ public: 979 | } 980 | 981 | __device__ void send(intptr_t inpIx, int eltN) { 982 | - return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 983 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY) 984 | + if (tid == 0) { 985 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 986 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 987 | + } 988 | +#endif 989 | + GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); 990 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT) 991 | + if (tid == 0) { 992 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 993 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 994 | + } 995 | +#endif 996 | } 997 | __device__ void sendFromOutput(intptr_t outIx, int eltN) { 998 | - return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 999 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY) 1000 | + if (tid == 0) { 1001 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, clock64(), 1002 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1003 | + } 1004 | +#endif 1005 | + GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); 1006 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT) 1007 | + if (tid == 0) { 1008 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, clock64(), 1009 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1010 | + } 1011 | +#endif 1012 | } 1013 | __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { 1014 | - return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 1015 | +#if defined(ENABLE_NPKIT) && 
defined(ENABLE_NPKIT_EVENT_RECV_ENTRY) 1016 | + if (tid == 0) { 1017 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, clock64(), 1018 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1019 | + } 1020 | +#endif 1021 | + GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); 1022 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT) 1023 | + if (tid == 0) { 1024 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, clock64(), 1025 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1026 | + } 1027 | +#endif 1028 | } 1029 | __device__ void recvReduceSend(intptr_t inpIx, int eltN) { 1030 | - return GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 1031 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY) 1032 | + if (tid == 0) { 1033 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1034 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1035 | + } 1036 | +#endif 1037 | + GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); 1038 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT) 1039 | + if (tid == 0) { 1040 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1041 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1042 | + } 1043 | +#endif 1044 | } 1045 | __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 1046 | - return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 1047 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY) 1048 | + if (tid == 0) { 1049 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, clock64(), 1050 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1051 | + } 1052 | +#endif 1053 | + GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); 1054 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT) 1055 | + if (tid == 0) { 1056 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, clock64(), 1057 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1058 | + } 1059 | +#endif 1060 | } 1061 | __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 1062 | - return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1063 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY) 1064 | + if (tid == 0) { 1065 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1066 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1067 | + } 1068 | +#endif 1069 | + GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1070 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT) 1071 | + if (tid == 0) { 1072 | + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1073 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1074 | + } 1075 | +#endif 1076 | } 1077 | __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { 1078 | - return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); 1079 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY) 1080 | + if (tid == 0) { 1081 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1082 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1083 | + } 1084 | +#endif 1085 | + 
GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); 1086 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT) 1087 | + if (tid == 0) { 1088 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1089 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1090 | + } 1091 | +#endif 1092 | } 1093 | __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { 1094 | - return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1095 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY) 1096 | + if (tid == 0) { 1097 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, clock64(), 1098 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1099 | + } 1100 | +#endif 1101 | + GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); 1102 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT) 1103 | + if (tid == 0) { 1104 | + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, clock64(), 1105 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1106 | + } 1107 | +#endif 1108 | } 1109 | }; 1110 | diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h 1111 | index 2cd3797..3c7bd39 100644 1112 | --- a/src/collectives/device/prims_simple.h 1113 | +++ b/src/collectives/device/prims_simple.h 1114 | @@ -4,6 +4,10 @@ 1115 | * See LICENSE.txt for license information 1116 | ************************************************************************/ 1117 | 1118 | +#if defined(ENABLE_NPKIT) 1119 | +#include "npkit/npkit.h" 1120 | +#endif 1121 | + 1122 | template 1124 | class Primitives< 1125 | @@ -46,6 +50,15 @@ class Primitives< 1126 | uint64_t *connStepPtr; 1127 | uint64_t connStepCache; // Cache last seen value of (*connStepPtr) 1128 | 1129 | +#if defined(ENABLE_NPKIT) 1130 | +public: 1131 | + int npKitCtxIdx = 0; 1132 | + uint64_t npKitDataProcessEntryTime = 0; 1133 | + uint64_t npKitDataProcessExitTime = 0; 1134 | + uint64_t npKitDataProcessTotalTime = 0; 1135 | +private: 1136 | +#endif 1137 | + 1138 | // Don't use barrier 0 as it's used by the final sync 1139 | __device__ void barrier() { 1140 | flags |= ThreadsSynced; 1141 | @@ -238,20 +251,92 @@ class Primitives< 1142 | } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { 1143 | // We can only have one direct receive. 
Since srcs[0] == dstPtr+offset, skip one copy 1144 | if (Send) { 1145 | + 1146 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1147 | + if (tid == 0) { 1148 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, clock64(), 1149 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1150 | + } 1151 | +#endif 1152 | + 1153 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1154 | + if (tid == 0) { 1155 | + npKitDataProcessEntryTime = clock64(); 1156 | + } 1157 | +#endif 1158 | + 1159 | ReduceOrCopyMulti 1160 | (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1161 | 1, ncclShmem.groups[group].srcs, 1162 | fan.nsend(), ncclShmem.groups[group].dsts+1, 1163 | workSize); 1164 | + 1165 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1166 | + if (tid == 0) { 1167 | + npKitDataProcessExitTime = clock64(); 1168 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime; 1169 | + } 1170 | +#endif 1171 | + 1172 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1173 | + if (tid == 0) { 1174 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, clock64(), 1175 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1176 | + } 1177 | +#endif 1178 | + 1179 | } 1180 | } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) { 1181 | // For broadcast in CollNet to do empty send 1182 | + 1183 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1184 | + if (tid == 0) { 1185 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, clock64(), 1186 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1187 | + } 1188 | +#endif 1189 | + 1190 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1191 | + if (tid == 0) { 1192 | + npKitDataProcessEntryTime = clock64(); 1193 | + } 1194 | +#endif 1195 | + 1196 | ReduceOrCopyMulti 1197 | (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp, 1198 | Recv, ncclShmem.groups[group].srcs, 1199 | Dst, ncclShmem.groups[group].dsts, 1200 | workSize); 1201 | + 1202 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1203 | + if (tid == 0) { 1204 | + npKitDataProcessExitTime = clock64(); 1205 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime; 1206 | + } 1207 | +#endif 1208 | + 1209 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1210 | + if (tid == 0) { 1211 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, clock64(), 1212 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1213 | + } 1214 | +#endif 1215 | + 1216 | } else { 1217 | + 1218 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1219 | + if (tid == 0) { 1220 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, clock64(), 1221 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1222 | + } 1223 | +#endif 1224 | + 1225 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1226 | + if (tid == 0) { 1227 | + 
npKitDataProcessEntryTime = clock64(); 1228 | + } 1229 | +#endif 1230 | + 1231 | constexpr int PreOpSrcs = SrcBuf != Input ? 0 : 1232 | DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; 1233 | ReduceOrCopyMulti 1234 | @@ -259,6 +344,21 @@ class Primitives< 1235 | Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, 1236 | Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, 1237 | workSize); 1238 | + 1239 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) 1240 | + if (tid == 0) { 1241 | + npKitDataProcessExitTime = clock64(); 1242 | + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime; 1243 | + } 1244 | +#endif 1245 | + 1246 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1247 | + if (tid == 0) { 1248 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, clock64(), 1249 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1250 | + } 1251 | +#endif 1252 | + 1253 | } 1254 | barrier(); // This barrier has a counterpart in following loop 1255 | postPeer(0 < sliceSize); 1256 | diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h 1257 | index 41fe0c2..35ee734 100644 1258 | --- a/src/collectives/device/sendrecv.h 1259 | +++ b/src/collectives/device/sendrecv.h 1260 | @@ -7,19 +7,73 @@ 1261 | #include "devcomm.h" 1262 | #include "collectives.h" 1263 | #include "primitives.h" 1264 | +#if defined(ENABLE_NPKIT) 1265 | +#include "npkit/npkit.h" 1266 | +#endif 1267 | 1268 | template 1269 | struct RunWork { 1270 | template 1271 | __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 1272 | + 1273 | +#if defined(ENABLE_NPKIT) 1274 | + bool isNpKitThread = (tid == 0); 1275 | + int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P; 1276 | +#endif 1277 | + 1278 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 1279 | + if (isNpKitThread) { 1280 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 1281 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 1282 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1283 | + } 1284 | +#endif 1285 | + 1286 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 1287 | + if (isNpKitThread) { 1288 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 1289 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1290 | + } 1291 | +#endif 1292 | + 1293 | void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); 1294 | ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); 1295 | if (args->peer == ncclShmem.comm.rank) { 1296 | struct ncclWorkElemP2p* recvArgs = args-1; 1297 | void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); 1298 | if (buff != recvBuff) { 1299 | + 1300 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY) 1301 | + if (isNpKitThread) { 1302 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, clock64(), 1303 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1304 | + } 1305 | +#endif 1306 | + 1307 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) 1308 | + if (isNpKitThread) { 1309 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, 
count*sizeof(T), 0, clock64(), 1310 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1311 | + } 1312 | +#endif 1313 | + 1314 | ReduceOrCopyMulti 1315 | (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count); 1316 | + 1317 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) 1318 | + if (isNpKitThread) { 1319 | + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, clock64(), 1320 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1321 | + } 1322 | +#endif 1323 | + 1324 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT) 1325 | + if (isNpKitThread) { 1326 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, clock64(), 1327 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1328 | + } 1329 | +#endif 1330 | + 1331 | } 1332 | } else { 1333 | int chunkSize = args->chunkSize/sizeof(T); 1334 | @@ -27,17 +81,60 @@ struct RunWork { 1335 | int const peer = args->peer; 1336 | Primitives, 1, Proto, 1> prims 1337 | (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); 1338 | + 1339 | +#if defined(ENABLE_NPKIT) 1340 | + if (isNpKitThread) { 1341 | + prims.npKitCtxIdx = npKitCtxIdx; 1342 | + } 1343 | +#endif 1344 | + 1345 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY) 1346 | + if (isNpKitThread) { 1347 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, clock64(), 1348 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1349 | + prims.npKitDataProcessTotalTime = 0; 1350 | + } 1351 | +#endif 1352 | + 1353 | size_t offset = 0; 1354 | do { 1355 | int nelem = min(size_t(chunkSize), count-offset); 1356 | prims.directSend(offset, offset, nelem); 1357 | offset += nelem; 1358 | } while(offset < count); 1359 | + 1360 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT) 1361 | + if (isNpKitThread) { 1362 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 1363 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1364 | + } 1365 | +#endif 1366 | + 1367 | } 1368 | } 1369 | 1370 | template 1371 | __device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { 1372 | +#if defined(ENABLE_NPKIT) 1373 | + bool isNpKitThread = (tid == 0); 1374 | + int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1; 1375 | +#endif 1376 | + 1377 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) 1378 | + if (isNpKitThread) { 1379 | + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; 1380 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, 1381 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1382 | + } 1383 | +#endif 1384 | + 1385 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) 1386 | + if (isNpKitThread) { 1387 | + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, clock64(), 1388 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1389 | + } 1390 | +#endif 1391 | + 1392 | if (args->peer != ncclShmem.comm.rank) { 1393 | void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); 1394 | ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); 1395 | @@ -46,12 +143,35 @@ struct RunWork { 1396 | int const peer = args->peer; 1397 | Primitives, 1, Proto, 1> 
prims 1398 | (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); 1399 | + 1400 | +#if defined(ENABLE_NPKIT) 1401 | + if (isNpKitThread) { 1402 | + prims.npKitCtxIdx = npKitCtxIdx; 1403 | + } 1404 | +#endif 1405 | + 1406 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY) 1407 | + if (isNpKitThread) { 1408 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, clock64(), 1409 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1410 | + prims.npKitDataProcessTotalTime = 0; 1411 | + } 1412 | +#endif 1413 | + 1414 | size_t offset = 0; 1415 | do { 1416 | int nelem = min(size_t(chunkSize), count-offset); 1417 | prims.directRecv(offset, nelem); 1418 | offset += nelem; 1419 | } while(offset < count); 1420 | + 1421 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT) 1422 | + if (isNpKitThread) { 1423 | + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, clock64(), 1424 | + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); 1425 | + } 1426 | +#endif 1427 | + 1428 | } 1429 | } 1430 | 1431 | diff --git a/src/include/devcomm.h b/src/include/devcomm.h 1432 | index 14ff92e..5d54049 100644 1433 | --- a/src/include/devcomm.h 1434 | +++ b/src/include/devcomm.h 1435 | @@ -9,6 +9,9 @@ 1436 | 1437 | #include "nccl.h" 1438 | #include "align.h" 1439 | +#if defined(ENABLE_NPKIT) 1440 | +#include "npkit/npkit_struct.h" 1441 | +#endif 1442 | #include 1443 | 1444 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 1445 | @@ -293,6 +296,12 @@ struct ncclDevComm { 1446 | 1447 | // Channels, device side 1448 | struct ncclDevChannel* channels/*[MAXCHANNELS]*/; 1449 | + 1450 | +#if defined(ENABLE_NPKIT) 1451 | + NpKitEventCollectContext* npKitEventCollectContexts; 1452 | + uint64_t* cpuTimestamp; 1453 | +#endif 1454 | + 1455 | }; 1456 | 1457 | struct alignas(16) ncclDevCommAndChannels { 1458 | diff --git a/src/include/npkit/npkit.h b/src/include/npkit/npkit.h 1459 | new file mode 100644 1460 | index 0000000..06b0deb 1461 | --- /dev/null 1462 | +++ b/src/include/npkit/npkit.h 1463 | @@ -0,0 +1,65 @@ 1464 | +#ifndef NPKIT_H_ 1465 | +#define NPKIT_H_ 1466 | + 1467 | +#include 1468 | +#include 1469 | + 1470 | +#include 1471 | + 1472 | +#include "npkit/npkit_event.h" 1473 | +#include "npkit/npkit_struct.h" 1474 | + 1475 | +class NpKit { 1476 | + public: 1477 | + static const uint64_t kNumGpuEventBuffers = 512; 1478 | + 1479 | + static const uint64_t kNumCpuEventBuffers = 32; 1480 | + 1481 | + static ncclResult_t Init(int rank); 1482 | + 1483 | + static ncclResult_t Dump(const std::string& dump_dir); 1484 | + 1485 | + static ncclResult_t Shutdown(); 1486 | + 1487 | + static NpKitEventCollectContext* GetGpuEventCollectContexts(); 1488 | + 1489 | + static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, 1490 | + NpKitEventCollectContext* ctx) { 1491 | + uint64_t event_buffer_head = ctx->event_buffer_head; 1492 | + if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { 1493 | + NpKitEvent& event = ctx->event_buffer[event_buffer_head]; 1494 | + event.fields.type = type; 1495 | + event.fields.size = size; 1496 | + event.fields.rsvd = rsvd; 1497 | + event.fields.timestamp = timestamp; 1498 | + ctx->event_buffer_head++; 1499 | + } 1500 | + } 1501 | + 1502 | + static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); 1503 | + 1504 | + static 
uint64_t* GetCpuTimestamp();
1505 | +
1506 | + private:
1507 | + static void CpuTimestampUpdateThread();
1508 | +
1509 | + // 64K * 512 * 16B = 512MB per GPU
1510 | + static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16;
1511 | +
1512 | + // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
1513 | + static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
1514 | +
1515 | + static NpKitEvent** gpu_event_buffers_;
1516 | + static NpKitEvent** cpu_event_buffers_;
1517 | +
1518 | + static NpKitEventCollectContext* gpu_collect_contexts_;
1519 | + static NpKitEventCollectContext* cpu_collect_contexts_;
1520 | + static uint64_t* cpu_timestamp_;
1521 | +
1522 | + static uint64_t rank_;
1523 | +
1524 | + static std::thread* cpu_timestamp_update_thread_;
1525 | + static volatile bool cpu_timestamp_update_thread_should_stop_;
1526 | +};
1527 | +
1528 | +#endif
1529 | diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h
1530 | new file mode 100644
1531 | index 0000000..b328fc9
1532 | --- /dev/null
1533 | +++ b/src/include/npkit/npkit_event.h
1534 | @@ -0,0 +1,98 @@
1535 | +#ifndef NPKIT_EVENT_H_
1536 | +#define NPKIT_EVENT_H_
1537 | +
1538 | +#define NPKIT_EVENT_INVALID 0x0
1539 | +
1540 | +#define NPKIT_EVENT_ALL_REDUCE_RING_ENTRY 0x1
1541 | +#define NPKIT_EVENT_ALL_REDUCE_RING_EXIT 0x2
1542 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY 0x3
1543 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT 0x4
1544 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY 0x5
1545 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT 0x6
1546 | +
1547 | +#define NPKIT_EVENT_COPY_SEND_ENTRY 0x7
1548 | +#define NPKIT_EVENT_COPY_SEND_EXIT 0x8
1549 | +#define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY 0x9
1550 | +#define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT 0xA
1551 | +#define NPKIT_EVENT_DIRECT_RECV_ENTRY 0xB
1552 | +#define NPKIT_EVENT_DIRECT_RECV_EXIT 0xC
1553 | +#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY 0xD
1554 | +#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT 0xE
1555 | +#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0xF
1556 | +#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x10
1557 | +#define NPKIT_EVENT_DIRECT_SEND_ENTRY 0x11
1558 | +#define NPKIT_EVENT_DIRECT_SEND_EXIT 0x12
1559 | +#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY 0x13
1560 | +#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT 0x14
1561 | +#define NPKIT_EVENT_RECV_ENTRY 0x15
1562 | +#define NPKIT_EVENT_RECV_EXIT 0x16
1563 | +#define NPKIT_EVENT_RECV_COPY_SEND_ENTRY 0x17
1564 | +#define NPKIT_EVENT_RECV_COPY_SEND_EXIT 0x18
1565 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 0x19
1566 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT 0x1A
1567 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY 0x1B
1568 | +#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT 0x1C
1569 | +#define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY 0x1D
1570 | +#define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT 0x1E
1571 | +#define NPKIT_EVENT_SEND_ENTRY 0x1F
1572 | +#define NPKIT_EVENT_SEND_EXIT 0x20
1573 | +#define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY 0x21
1574 | +#define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT 0x22
1575 | +
1576 | +#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY 0x23
1577 | +#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT 0x24
1578 | +#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY 0x25
1579 | +#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT 0x26
1580 | +
1581 | +#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY 0x27
1582 | +#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT 0x28
1583 | +#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY 0x29
1584 | +#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT 0x2A
1585 | +
1586 | +#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY 0x2B
1587 | +#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT 0x2C
1588 | +#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY 0x2D
1589 | +#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT 0x2E
1590 | +
1591 | +#define NPKIT_EVENT_NET_SEND_ENTRY 0x2F
1592 | +#define NPKIT_EVENT_NET_SEND_EXIT 0x30
1593 | +
1594 | +#define NPKIT_EVENT_NET_RECV_ENTRY 0x31
1595 | +#define NPKIT_EVENT_NET_RECV_EXIT 0x32
1596 | +
1597 | +#define NPKIT_EVENT_TIME_SYNC_GPU 0x33
1598 | +#define NPKIT_EVENT_TIME_SYNC_CPU 0x34
1599 | +
1600 | +#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35
1601 | +#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36
1602 | +#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37
1603 | +#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38
1604 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39
1605 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A
1606 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
1607 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C
1608 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D
1609 | +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E
1610 | +
1611 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F
1612 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40
1613 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41
1614 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42
1615 | +
1616 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43
1617 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44
1618 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45
1619 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46
1620 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47
1621 | +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48
1622 | +
1623 | +#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49
1624 | +#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A
1625 | +#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B
1626 | +#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C
1627 | +#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D
1628 | +#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E
1629 | +
1630 | +#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
1631 | +
1632 | +#endif
1633 | diff --git a/src/include/npkit/npkit_struct.h b/src/include/npkit/npkit_struct.h
1634 | new file mode 100644
1635 | index 0000000..89dadcb
1636 | --- /dev/null
1637 | +++ b/src/include/npkit/npkit_struct.h
1638 | @@ -0,0 +1,25 @@
1639 | +#ifndef NPKIT_STRUCT_H_
1640 | +#define NPKIT_STRUCT_H_
1641 | +
1642 | +#include
1643 | +
1644 | +#pragma pack(push, 1)
1645 | +
1646 | +union NpKitEvent {
1647 | + uint64_t bits[2];
1648 | + struct {
1649 | + uint64_t type : 8;
1650 | + uint64_t size : 32;
1651 | + uint64_t rsvd : 24;
1652 | + uint64_t timestamp;
1653 | + } fields;
1654 | +};
1655 | +
1656 | +struct NpKitEventCollectContext {
1657 | + NpKitEvent* event_buffer;
1658 | + uint64_t event_buffer_head;
1659 | +};
1660 | +
1661 | +#pragma pack(pop)
1662 | +
1663 | +#endif
1664 | diff --git a/src/include/proxy.h b/src/include/proxy.h
1665 | index 5e7f728..bdbe46d 100644
1666 | --- a/src/include/proxy.h
1667 | +++ b/src/include/proxy.h
1668 | @@ -64,6 +64,19 @@ struct ncclProxySubArgs {
1669 | uint64_t end;
1670 | void* requests[NCCL_STEPS];
1671 | void* profilingEvents[NCCL_STEPS];
1672 | +
1673 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
1674 | + int npKitSizesFifo[NCCL_STEPS];
1675 | +#endif
1676 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY)
1677 | + int npKitSizesFifo[NCCL_STEPS];
1678 | + uint64_t npKitStartTime[NCCL_STEPS];
1679 | + uint64_t npKitLastPollTime[NCCL_STEPS];
1680 | + uint64_t npKitLastPollInterval[NCCL_STEPS];
1681 | + uint64_t npKitMaxPollInterval[NCCL_STEPS];
1682 | + uint64_t npKitPollIntervalSum[NCCL_STEPS];
1683 | + uint64_t npKitPollCnt[NCCL_STEPS];
1684 | +#endif
1685 | };
1686 |
1687 | struct ncclProxyArgs {
1688 | diff --git a/src/init.cc b/src/init.cc
1689 | index 40f7872..4d7d5ce 100644
1690 | --- a/src/init.cc
1691 | +++ b/src/init.cc
1692 | @@ -16,6 +16,9 @@
1693 | #include "enqueue.h"
1694 | #include "graph.h"
1695 | #include "argcheck.h"
1696 | +#if defined(ENABLE_NPKIT)
1697 | +#include "npkit/npkit.h"
1698 | +#endif
1699 | #include
1700 | #include
1701 | #include
1702 | @@ -399,7 +402,15 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
1703 | }
1704 | }
1705 |
1706 | +#if defined(ENABLE_NPKIT)
1707 | + // Init NPKit
1708 | + NCCLCHECK(NpKit::Init(comm->rank));
1709 | + tmpCommAndChans.comm.npKitEventCollectContexts = NpKit::GetGpuEventCollectContexts();
1710 | + tmpCommAndChans.comm.cpuTimestamp = NpKit::GetCpuTimestamp();
1711 | +#endif
1712 | +
1713 | NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.cudaStream), ret, fail);
1714 | +
1715 | exit:
1716 | CUDACHECK(cudaStreamSynchronize(comm->deviceStream.cudaStream));
1717 | NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
1718 | @@ -1454,11 +1465,26 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
1719 | int commDevice = comm->cudaDev;
1720 | ncclResult_t ret = ncclSuccess;
1721 |
1722 | +#if defined(ENABLE_NPKIT)
1723 | + const char* npkitDumpDir = nullptr;
1724 | +#endif
1725 | +
1726 | CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail);
1727 | if (savedDevice != commDevice) {
1728 | CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail);
1729 | }
1730 |
1731 | +#if defined(ENABLE_NPKIT)
1732 | + // Dump NPKit events and shutdown
1733 | + npkitDumpDir = getenv("NPKIT_DUMP_DIR");
1734 | + if (npkitDumpDir == nullptr) {
1735 | + WARN("NPKIT_DUMP_DIR is empty");
1736 | + } else {
1737 | + NCCLCHECKGOTO(NpKit::Dump(npkitDumpDir), ret, fail);
1738 | + }
1739 | + NCCLCHECKGOTO(NpKit::Shutdown(), ret, fail);
1740 | +#endif
1741 | +
1742 | TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
1743 |
1744 | if (comm->initState == ncclSuccess) {
1745 | diff --git a/src/misc/npkit.cc b/src/misc/npkit.cc
1746 | new file mode 100644
1747 | index 0000000..af180e4
1748 | --- /dev/null
1749 | +++ b/src/misc/npkit.cc
1750 | @@ -0,0 +1,174 @@
1751 | +#include
1752 | +#include
1753 | +#include
1754 | +
1755 | +#include "alloc.h"
1756 | +#include "npkit/npkit.h"
1757 | +
1758 | +uint64_t NpKit::rank_ = 0;
1759 | +
1760 | +NpKitEvent** NpKit::gpu_event_buffers_ = nullptr;
1761 | +NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
1762 | +
1763 | +NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
1764 | +NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
1765 | +uint64_t* NpKit::cpu_timestamp_ = nullptr;
1766 | +
1767 | +std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
1768 | +volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
1769 | +
1770 | +void NpKit::CpuTimestampUpdateThread() {
1771 | + uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
1772 | + uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
1773 | + uint64_t curr_steady_clock = 0;
1774 | + volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_;
1775 | + while (!cpu_timestamp_update_thread_should_stop_) {
1776 | + curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
1777 | + *volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock);
1778 | + }
1779 | +}
1780 | +
1781 | +ncclResult_t NpKit::Init(int rank) {
1782 | + uint64_t i = 0;
1783 | + NpKitEventCollectContext ctx;
1784 | + ctx.event_buffer_head = 0;
1785 | + rank_ = rank;
1786 | +
1787 | + // Init event data structures
1788 | + NCCLCHECK(ncclCalloc(&gpu_event_buffers_, kNumGpuEventBuffers));
1789 | + NCCLCHECK(ncclCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers));
1790 | + for (i = 0; i < kNumGpuEventBuffers; i++) {
1791 | + NCCLCHECK(ncclCudaCalloc(gpu_event_buffers_ + i, kMaxNumGpuEventsPerBuffer));
1792 | + ctx.event_buffer = gpu_event_buffers_[i];
1793 | + NCCLCHECK(ncclCudaMemcpy(gpu_collect_contexts_ + i, &ctx, 1));
1794 | + }
1795 | +
1796 | + NCCLCHECK(ncclCalloc(&cpu_event_buffers_, kNumCpuEventBuffers));
1797 | + NCCLCHECK(ncclCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers));
1798 | + for (i = 0; i < kNumCpuEventBuffers; i++) {
1799 | + NCCLCHECK(ncclCalloc(cpu_event_buffers_ + i, kMaxNumCpuEventsPerBuffer));
1800 | + ctx.event_buffer = cpu_event_buffers_[i];
1801 | + cpu_collect_contexts_[i] = ctx;
1802 | + }
1803 | +
1804 | + // Init timestamp
1805 | + NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_, 1));
1806 | + volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_;
1807 | + *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
1808 | + cpu_timestamp_update_thread_should_stop_ = false;
1809 | + cpu_timestamp_update_thread_ = new std::thread(CpuTimestampUpdateThread);
1810 | +
1811 | + return ncclSuccess;
1812 | +}
1813 | +
1814 | +ncclResult_t NpKit::Dump(const std::string& dump_dir) {
1815 | + uint64_t i = 0;
1816 | + std::string dump_file_path;
1817 | +
1818 | + // Dump CPU events
1819 | + for (i = 0; i < kNumCpuEventBuffers; i++) {
1820 | + dump_file_path = dump_dir;
1821 | + dump_file_path += "/cpu_events_rank_";
1822 | + dump_file_path += std::to_string(rank_);
1823 | + dump_file_path += "_channel_";
1824 | + dump_file_path += std::to_string(i);
1825 | + auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
1826 | + cpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[i]),
1827 | + cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
1828 | + cpu_trace_file.close();
1829 | + }
1830 | +
1831 | + // Dump CPU clock info
1832 | + dump_file_path = dump_dir;
1833 | + dump_file_path += "/cpu_clock_period_num_rank_";
1834 | + dump_file_path += std::to_string(rank_);
1835 | + std::string clock_period_num_str = std::to_string(std::chrono::steady_clock::duration::period::num);
1836 | + auto clock_period_num_file = std::fstream(dump_file_path, std::ios::out);
1837 | + clock_period_num_file.write(clock_period_num_str.c_str(), clock_period_num_str.length());
1838 | + clock_period_num_file.close();
1839 | +
1840 | + dump_file_path = dump_dir;
1841 | + dump_file_path += "/cpu_clock_period_den_rank_";
1842 | + dump_file_path += std::to_string(rank_);
1843 | + std::string clock_period_den_str = std::to_string(std::chrono::steady_clock::duration::period::den);
1844 | + auto clock_period_den_file = std::fstream(dump_file_path, std::ios::out);
1845 | + clock_period_den_file.write(clock_period_den_str.c_str(), clock_period_den_str.length());
1846 | + clock_period_den_file.close();
1847 | +
1848 | + // Dump GPU events, reuse CPU struct
1849 | + for (i = 0; i < kNumGpuEventBuffers; i++) {
1850 | + dump_file_path = dump_dir;
1851 | + dump_file_path += "/gpu_events_rank_";
1852 | + dump_file_path += std::to_string(rank_);
1853 | + dump_file_path += "_buf_";
1854 | + dump_file_path += std::to_string(i);
1855 | + NCCLCHECK(ncclCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer));
1856 | + NCCLCHECK(ncclCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
1857 | + auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
1858 | + gpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[0]),
1859 | + cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
1860 | + gpu_trace_file.close();
1861 | + }
1862 | +
1863 | + // Dump GPU clockRate
1864 | + dump_file_path = dump_dir;
1865 | + dump_file_path += "/gpu_clock_rate_rank_";
1866 | + dump_file_path += std::to_string(rank_);
1867 | + cudaDeviceProp dev_prop;
1868 | + int dev;
1869 | + CUDACHECK(cudaGetDevice(&dev));
1870 | + CUDACHECK(cudaGetDeviceProperties(&dev_prop, dev));
1871 | + std::string clock_rate_str = std::to_string(dev_prop.clockRate);
1872 | + auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
1873 | + gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
1874 | + gpu_clock_rate_file.close();
1875 | +
1876 | + return ncclSuccess;
1877 | +}
1878 | +
1879 | +ncclResult_t NpKit::Shutdown() {
1880 | + uint64_t i = 0;
1881 | +
1882 | + // Stop CPU timestamp updating thread
1883 | + cpu_timestamp_update_thread_should_stop_ = true;
1884 | + cpu_timestamp_update_thread_->join();
1885 | +
1886 | + // Free CPU event data structures
1887 | + for (i = 0; i < kNumCpuEventBuffers; i++) {
1888 | + free(cpu_event_buffers_[i]);
1889 | + }
1890 | + free(cpu_event_buffers_);
1891 | + free(cpu_collect_contexts_);
1892 | +
1893 | + // Free GPU event data structures
1894 | + for (i = 0; i < kNumGpuEventBuffers; i++) {
1895 | + CUDACHECK(cudaFree(gpu_event_buffers_[i]));
1896 | + }
1897 | + free(gpu_event_buffers_);
1898 | + CUDACHECK(cudaFree(gpu_collect_contexts_));
1899 | +
1900 | + // Free timestamp
1901 | + NCCLCHECK(ncclCudaHostFree(cpu_timestamp_));
1902 | +
1903 | + return ncclSuccess;
1904 | +}
1905 | +
1906 | +NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() {
1907 | + return gpu_collect_contexts_;
1908 | +}
1909 | +
1910 | +void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
1911 | + uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
1912 | + if (event_buffer_head < kMaxNumCpuEventsPerBuffer) {
1913 | + NpKitEvent& event = cpu_collect_contexts_[channel_id].event_buffer[event_buffer_head];
1914 | + event.fields.type = type;
1915 | + event.fields.size = size;
1916 | + event.fields.rsvd = rsvd;
1917 | + event.fields.timestamp = timestamp;
1918 | + cpu_collect_contexts_[channel_id].event_buffer_head++;
1919 | + }
1920 | +}
1921 | +
1922 |
+uint64_t* NpKit::GetCpuTimestamp() { 1923 | + return cpu_timestamp_; 1924 | +} 1925 | diff --git a/src/transport/net.cc b/src/transport/net.cc 1926 | index fe98a4c..6e0b801 100644 1927 | --- a/src/transport/net.cc 1928 | +++ b/src/transport/net.cc 1929 | @@ -12,6 +12,36 @@ 1930 | #include "gdrwrap.h" 1931 | #include "shm.h" 1932 | #include "profiler.h" 1933 | +#if defined(ENABLE_NPKIT) 1934 | +#include "npkit/npkit.h" 1935 | +#endif 1936 | + 1937 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 1938 | +#include 1939 | +static uint64_t g_npkit_net_check_latency_threshold_us = 100; 1940 | +static uint64_t g_npkit_time_den = 1000000000; 1941 | +static uint64_t g_npkit_time_num = 1; 1942 | +static uint64_t g_npkit_num_warmup_ops = 10000; 1943 | +static inline uint64_t npKitGetTsInUs() { 1944 | + return std::chrono::steady_clock::now().time_since_epoch().count() * 1000000 * g_npkit_time_num / g_npkit_time_den; 1945 | +} 1946 | +static void npKitInitCheckLatencyEnv() { 1947 | + const char* param_threshold_str = "NPKIT_NET_CHECK_LATENCY_THRESHOLD"; 1948 | + const char* param_warmup_str = "NPKIT_NUM_WARMUP_OPS"; 1949 | + static bool initialized = false; 1950 | + if (!initialized) { 1951 | + g_npkit_time_den = std::chrono::steady_clock::duration::period::den; 1952 | + g_npkit_time_num = std::chrono::steady_clock::duration::period::num; 1953 | + if (getenv(param_threshold_str) != nullptr) { 1954 | + g_npkit_net_check_latency_threshold_us = strtoull(getenv(param_threshold_str), nullptr, 10); 1955 | + } 1956 | + if (getenv(param_warmup_str) != nullptr) { 1957 | + g_npkit_num_warmup_ops = strtoull(getenv(param_warmup_str), nullptr, 10); 1958 | + } 1959 | + initialized = true; 1960 | + } 1961 | +} 1962 | +#endif 1963 | 1964 | static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); 1965 | 1966 | @@ -188,6 +218,11 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph 1967 | proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); 1968 | } 1969 | *((int*)connectInfo) = proxyRank; 1970 | + 1971 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 1972 | + npKitInitCheckLatencyEnv(); 1973 | +#endif 1974 | + 1975 | return ncclSuccess; 1976 | } 1977 | 1978 | @@ -221,6 +256,11 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph 1979 | NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); 1980 | INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, 1981 | req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); 1982 | + 1983 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 1984 | + npKitInitCheckLatencyEnv(); 1985 | +#endif 1986 | + 1987 | return ncclSuccess; 1988 | } 1989 | 1990 | @@ -863,7 +903,16 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct 1991 | 1992 | static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); 1993 | 1994 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 1995 | +static int g_npkit_net_poll_cnt = 0; 1996 | +#endif 1997 | + 1998 | static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { 1999 | + 2000 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2001 | + g_npkit_net_poll_cnt++; 2002 | +#endif 2003 | + 2004 | if (args->state == ncclProxyOpReady) { 2005 | for (int s=0; snsubs; s++) { 2006 | struct ncclProxySubArgs* sub = args->subs+s; 2007 | @@ -916,6 +965,14 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2008 | if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { 2009 | // We have something to receive, let's check if it's completely ready. 2010 | int size = sizesFifo[buffSlot]; 2011 | + 2012 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) 2013 | + sub->npKitSizesFifo[buffSlot] = size; 2014 | +#endif 2015 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2016 | + sub->npKitSizesFifo[buffSlot] = size; 2017 | +#endif 2018 | + 2019 | bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; 2020 | char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; 2021 | int ready = 1; 2022 | @@ -946,6 +1003,27 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2023 | // Data is ready, try to send. 2024 | NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); 2025 | if (sub->requests[buffSlot] != NULL) { 2026 | + 2027 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) 2028 | + NpKit::CollectCpuEvent( 2029 | + NPKIT_EVENT_NET_SEND_ENTRY, 2030 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2031 | + g_npkit_net_poll_cnt, 2032 | +#else 2033 | + size, 2034 | +#endif 2035 | + uint64_t(sub->requests+buffSlot)/sizeof(void*), 2036 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2037 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2038 | + g_npkit_net_poll_cnt = 0; 2039 | +#endif 2040 | +#endif 2041 | + 2042 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2043 | + sub->npKitStartTime[buffSlot] = sub->npKitLastPollTime[buffSlot] = npKitGetTsInUs(); 2044 | + sub->npKitMaxPollInterval[buffSlot] = sub->npKitPollIntervalSum[buffSlot] = sub->npKitPollCnt[buffSlot] = 0; 2045 | +#endif 2046 | + 2047 | TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); 2048 | sizesFifo[buffSlot] = -1; 2049 | // Make sure size is reset to zero before we update the head. 
2050 | @@ -963,7 +1041,48 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2051 | int done; 2052 | int buffSlot = (sub->base+sub->done)%NCCL_STEPS; 2053 | NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL)); 2054 | + 2055 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2056 | + uint64_t npKitPollTime = npKitGetTsInUs(); 2057 | + sub->npKitLastPollInterval[buffSlot] = npKitPollTime - sub->npKitLastPollTime[buffSlot]; 2058 | + sub->npKitPollIntervalSum[buffSlot] += sub->npKitLastPollInterval[buffSlot]; 2059 | + if (sub->npKitLastPollInterval[buffSlot] > sub->npKitMaxPollInterval[buffSlot]) { 2060 | + sub->npKitMaxPollInterval[buffSlot] = sub->npKitLastPollInterval[buffSlot]; 2061 | + } 2062 | + sub->npKitLastPollTime[buffSlot] = npKitPollTime; 2063 | + sub->npKitPollCnt[buffSlot]++; 2064 | +#endif 2065 | + 2066 | if (done) { 2067 | + 2068 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) 2069 | + NpKit::CollectCpuEvent( 2070 | + NPKIT_EVENT_NET_SEND_EXIT, 2071 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2072 | + g_npkit_net_poll_cnt, 2073 | +#else 2074 | + sub->npKitSizesFifo[buffSlot], 2075 | +#endif 2076 | + uint64_t(sub->requests+buffSlot)/sizeof(void*), 2077 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2078 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2079 | + g_npkit_net_poll_cnt = 0; 2080 | +#endif 2081 | +#endif 2082 | + 2083 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2084 | + uint64_t npKitSendDuration = sub->npKitLastPollTime[buffSlot] - sub->npKitStartTime[buffSlot]; 2085 | + if (g_npkit_num_warmup_ops > 0) { 2086 | + g_npkit_num_warmup_ops--; 2087 | + } 2088 | + if (g_npkit_num_warmup_ops == 0 && npKitSendDuration > g_npkit_net_check_latency_threshold_us) { 2089 | + fprintf(stdout, "NPKIT LONG SEND (R:%d,P:%d,C:%d,S:%d): %d took %lu us, last/max/sum poll interval %lu/%lu/%lu us, cnt: %lu, ts: %lu/%lu\n", 2090 | + comm->rank, sub->peer, sub->channelId, buffSlot, sub->npKitSizesFifo[buffSlot], npKitSendDuration, sub->npKitLastPollInterval[buffSlot], sub->npKitMaxPollInterval[buffSlot], sub->npKitPollIntervalSum[buffSlot], sub->npKitPollCnt[buffSlot], sub->npKitStartTime[buffSlot], sub->npKitLastPollTime[buffSlot]); 2091 | + sub->npKitStartTime[buffSlot] = sub->npKitLastPollTime[buffSlot] = npKitGetTsInUs(); 2092 | + sub->npKitMaxPollInterval[buffSlot] = sub->npKitPollIntervalSum[buffSlot] = sub->npKitPollCnt[buffSlot] = 0; 2093 | + } 2094 | +#endif 2095 | + 2096 | TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); 2097 | sub->done += args->sliceSteps; 2098 | for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); 2099 | @@ -989,6 +1108,11 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2100 | } 2101 | 2102 | static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { 2103 | + 2104 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2105 | + g_npkit_net_poll_cnt++; 2106 | +#endif 2107 | + 2108 | if (args->state == ncclProxyOpReady) { 2109 | // Initialize subs and group them by same recvComm. 
2110 | void* recvComm; 2111 | @@ -1070,6 +1194,27 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2112 | if (*requestPtr) { 2113 | for (int i=0; igroupSize; i++) { 2114 | struct ncclProxySubArgs* sub = subGroup+i; 2115 | + 2116 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT) 2117 | + NpKit::CollectCpuEvent( 2118 | + NPKIT_EVENT_NET_RECV_ENTRY, 2119 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2120 | + g_npkit_net_poll_cnt, 2121 | +#else 2122 | + sizes[i], 2123 | +#endif 2124 | + uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*), 2125 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2126 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2127 | + g_npkit_net_poll_cnt = 0; 2128 | +#endif 2129 | +#endif 2130 | + 2131 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2132 | + sub->npKitStartTime[step%NCCL_STEPS] = sub->npKitLastPollTime[step%NCCL_STEPS] = npKitGetTsInUs(); 2133 | + sub->npKitMaxPollInterval[step%NCCL_STEPS] = sub->npKitPollIntervalSum[step%NCCL_STEPS] = sub->npKitPollCnt[step%NCCL_STEPS] = 0; 2134 | +#endif 2135 | + 2136 | sub->posted += args->sliceSteps; 2137 | for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); 2138 | } 2139 | @@ -1089,12 +1234,56 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg 2140 | void* mhandles[NCCL_PROXY_MAX_SUBS]; 2141 | for (int i=0; irequests[step%NCCL_STEPS], &done, sizes)); 2143 | + 2144 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2145 | + uint64_t npKitPollTime = npKitGetTsInUs(); 2146 | + for (int i=0; igroupSize; i++) { 2147 | + struct ncclProxySubArgs* sub = subGroup + i; 2148 | + sub->npKitLastPollInterval[step%NCCL_STEPS] = npKitPollTime - sub->npKitLastPollTime[step%NCCL_STEPS]; 2149 | + sub->npKitPollIntervalSum[step%NCCL_STEPS] += sub->npKitLastPollInterval[step%NCCL_STEPS]; 2150 | + if (sub->npKitLastPollInterval[step%NCCL_STEPS] > sub->npKitMaxPollInterval[step%NCCL_STEPS]) { 2151 | + sub->npKitMaxPollInterval[step%NCCL_STEPS] = sub->npKitLastPollInterval[step%NCCL_STEPS]; 2152 | + } 2153 | + sub->npKitLastPollTime[step%NCCL_STEPS] = npKitPollTime; 2154 | + sub->npKitPollCnt[step%NCCL_STEPS]++; 2155 | + } 2156 | +#endif 2157 | + 2158 | if (done) { 2159 | int needFlush = 0; 2160 | int totalSize = 0; 2161 | for (int i=0; igroupSize; i++) { 2163 | struct ncclProxySubArgs* sub = subGroup + i; 2164 | + 2165 | +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT) 2166 | + NpKit::CollectCpuEvent( 2167 | + NPKIT_EVENT_NET_RECV_EXIT, 2168 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2169 | + g_npkit_net_poll_cnt, 2170 | +#else 2171 | + sizes[i], 2172 | +#endif 2173 | + uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*), 2174 | + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); 2175 | +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) 2176 | + g_npkit_net_poll_cnt = 0; 2177 | +#endif 2178 | +#endif 2179 | + 2180 | +#if defined(ENABLE_NPKIT_NET_CHECK_LATENCY) 2181 | + if (g_npkit_num_warmup_ops > 0) { 2182 | + g_npkit_num_warmup_ops--; 2183 | + } 2184 | + uint64_t npKitRecvDuration = sub->npKitLastPollTime[step%NCCL_STEPS] - sub->npKitStartTime[step%NCCL_STEPS]; 2185 | + if (g_npkit_num_warmup_ops == 0 && npKitRecvDuration > g_npkit_net_check_latency_threshold_us) { 2186 | + fprintf(stdout, "NPKIT LONG RECV 
(R:%d,P:%d,C:%d,S:%lu): %d took %lu us, last/max/sum poll interval %lu/%lu/%lu us, cnt: %lu, ts: %lu/%lu\n", 2187 | + comm->rank, sub->peer, sub->channelId, step%NCCL_STEPS, sizes[i], npKitRecvDuration, sub->npKitLastPollInterval[step%NCCL_STEPS], sub->npKitMaxPollInterval[step%NCCL_STEPS], sub->npKitPollIntervalSum[step%NCCL_STEPS], sub->npKitPollCnt[step%NCCL_STEPS], sub->npKitStartTime[step%NCCL_STEPS], sub->npKitLastPollTime[step%NCCL_STEPS]); 2188 | + sub->npKitStartTime[step%NCCL_STEPS] = sub->npKitLastPollTime[step%NCCL_STEPS] = npKitGetTsInUs(); 2189 | + sub->npKitMaxPollInterval[step%NCCL_STEPS] = sub->npKitPollIntervalSum[step%NCCL_STEPS] = sub->npKitPollCnt[step%NCCL_STEPS] = 0; 2190 | + } 2191 | +#endif 2192 | + 2193 | sub->received += args->sliceSteps; 2194 | for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); 2195 | if (step < sub->nsteps) { 2196 | -------------------------------------------------------------------------------- /nccl_samples/npkit_launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # NCCL source directory. 7 | export NCCL_SRC_DIR="/mnt/nccl" 8 | 9 | # NPKit source directory. 10 | export NPKIT_SRC_DIR="/mnt/npkit" 11 | 12 | # Path to nccl-tests binary being profiled. 13 | export NCCL_TEST_BIN="/mnt/nccl-tests/build/all_reduce_perf" 14 | # export NCCL_TEST_BIN="/mnt/nccl-tests/build/alltoall_perf" 15 | 16 | # NPKit runtime directory, used to store logs and results. 17 | export NPKIT_RUN_DIR="/mnt/npkit_run" 18 | 19 | # Message size of NCCL operation. 20 | export NCCL_MSG_SIZE="16M" 21 | 22 | # NCCL communication algorithm. 23 | export NCCL_ALGO="Ring" 24 | # export NCCL_ALGO="Tree" 25 | 26 | # NCCL communication protocol. Simple and LL are supported. 27 | export NCCL_PROTO="Simple" 28 | # export NCCL_PROTO="LL" 29 | # export NCCL_PROTO="LL128" 30 | 31 | # Number of nccl-tests warmups. 32 | export NCCL_NUM_WARMUPS="0" 33 | 34 | # Number of nccl-tests iterations. 
35 | export NCCL_NUM_ITERS="10" 36 | 37 | NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" 38 | NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" 39 | 40 | export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" 41 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" 42 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" 43 | 44 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" 45 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" 46 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" 47 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" 48 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" 49 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" 50 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" 51 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" 52 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" 53 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" 54 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" 55 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" 56 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" 57 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" 58 | 59 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" 60 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" 61 | 62 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" 63 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" 64 | 65 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" 66 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" 67 | 68 | # export 
NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" 69 | 70 | bash npkit_runner.sh 71 | -------------------------------------------------------------------------------- /nccl_samples/npkit_runner.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # Function that runs nccl-tests and collect NPKit traces. 7 | # nccl_test 8 | # 9 | # 10 | function nccl_test() { 11 | mpirun --allow-run-as-root \ 12 | -map-by ppr:8:node --bind-to numa \ 13 | -x LD_PRELOAD=$2/build/lib/libnccl.so:$LD_PRELOAD \ 14 | -x NCCL_DEBUG=WARN \ 15 | -x NCCL_ALGO=$4 \ 16 | -x NCCL_PROTO=$5 \ 17 | -x NPKIT_DUMP_DIR=$8 \ 18 | $1 -b $3 -e $3 -f 2 -g 1 -c 1 -w $6 -n $7 | tee $9/log.txt 19 | } 20 | 21 | # Tag of this NPKit run. 22 | npkit_run_tag=`basename ${NCCL_TEST_BIN}`"/${msg_size}/${NCCL_ALGO}/${NCCL_PROTO}" 23 | 24 | # Path to NPKit dump directory. 25 | npkit_dump_dir="${NPKIT_RUN_DIR}/npkit_dump/${npkit_run_tag}" 26 | 27 | # Path to NPKit post-process directory. 28 | npkit_trace_dir="${NPKIT_RUN_DIR}/npkit_trace/${npkit_run_tag}" 29 | 30 | # Path to NPKit result directory. 31 | npkit_result_dir="${NPKIT_RUN_DIR}/npkit_result/${npkit_run_tag}" 32 | 33 | # Build NCCL with NPKit 34 | cd ${NCCL_SRC_DIR} 35 | make clean 36 | make -j src.build NPKIT_FLAGS="${NPKIT_FLAGS}" 37 | 38 | # Clean existing results 39 | rm -rf ${NPKIT_RUN_DIR} 40 | mkdir -p ${npkit_dump_dir} 41 | mkdir -p ${npkit_trace_dir} 42 | mkdir -p ${npkit_result_dir} 43 | 44 | # Run NPKit on all nodes. 45 | nccl_test ${NCCL_TEST_BIN} ${NCCL_SRC_DIR} ${NCCL_MSG_SIZE} ${NCCL_ALGO} ${NCCL_PROTO} ${NCCL_NUM_WARMUPS} ${NCCL_NUM_ITERS} ${npkit_dump_dir} ${npkit_result_dir} 46 | 47 | # Generate trace file 48 | cd ${NPKIT_SRC_DIR}/nccl_samples 49 | python3 npkit_trace_generator.py --npkit_dump_dir=${npkit_dump_dir} --npkit_event_header_path=${NCCL_SRC_DIR}/src/include/npkit/npkit_event.h --output_dir=${npkit_trace_dir} 50 | cd ${npkit_trace_dir} 51 | tar cvzf npkit_result.tar.gz npkit_event_trace.json 52 | mv npkit_result.tar.gz ${npkit_result_dir} 53 | -------------------------------------------------------------------------------- /nccl_samples/npkit_trace_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
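# Converts a directory of raw NPKit dump files into a Chrome trace ("traceEvents" JSON):
#   * npkit_event.h is parsed for "#define NPKIT_EVENT_*" lines to map event IDs to names.
#   * gpu_clock_rate_rank_* (kHz) and cpu_clock_period_num/den_rank_* files are turned into
#     ticks-per-microsecond scales for GPU and CPU timestamps.
#   * Event files are sequences of 16-byte little-endian records: 1-byte id, 4-byte size,
#     3-byte rsvd/slot, 8-byte timestamp.
#   * NPKIT_EVENT_TIME_SYNC_CPU/GPU records are used to shift GPU timestamps onto the CPU
#     time base before all events are merged, sorted by timestamp, and written to
#     npkit_event_trace.json in the output directory.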
3 | 4 | import argparse 5 | import os 6 | import json 7 | 8 | from queue import Queue 9 | 10 | def parse_npkit_event_header(npkit_event_header_path): 11 | npkit_event_def = {'id_to_type': {}, 'type_to_id': {}} 12 | with open(npkit_event_header_path, 'r') as f: 13 | lines = [x.strip() for x in f.readlines() if len(x.strip()) != 0] 14 | line_idx = 0 15 | while line_idx < len(lines): 16 | if lines[line_idx].startswith('#define NPKIT_EVENT_'): 17 | fields = lines[line_idx].split() 18 | if len(fields) == 3: 19 | event_type = fields[1] 20 | event_id = int(fields[2], 0) 21 | npkit_event_def['type_to_id'][event_type] = event_id 22 | npkit_event_def['id_to_type'][event_id] = event_type 23 | line_idx += 1 24 | return npkit_event_def 25 | 26 | def parse_gpu_clock_scale(gpu_clock_file_path): 27 | with open(gpu_clock_file_path, 'r') as f: 28 | freq_in_khz = f.read() 29 | return float(freq_in_khz) * 1e3 / 1e6 30 | 31 | def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): 32 | with open(cpu_clock_num_file_path, 'r') as f: 33 | num = float(f.read()) 34 | with open(cpu_clock_den_file_path, 'r') as f: 35 | den = float(f.read()) 36 | return den / num / 1e6 37 | 38 | def parse_gpu_event(event_bytes): 39 | return { 40 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 41 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 42 | 'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 43 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 44 | } 45 | 46 | def parse_cpu_event(event_bytes): 47 | return { 48 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 49 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 50 | 'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 51 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 52 | } 53 | 54 | def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): 55 | gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) 56 | raw_event_size = 16 57 | curr_cpu_base_time = None 58 | curr_gpu_base_time = None 59 | gpu_events = [] 60 | event_type_to_seq = {} 61 | with open(gpu_event_file_path, 'rb') as f: 62 | raw_content = f.read() 63 | raw_content_size = len(raw_content) 64 | raw_content_idx = 0 65 | while raw_content_idx < raw_content_size: 66 | parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 67 | if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU': 68 | curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale 69 | curr_gpu_base_time = None 70 | elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': 71 | if curr_gpu_base_time is None: 72 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 73 | else: 74 | if curr_gpu_base_time is None: 75 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 76 | event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] 77 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 78 | gpu_events.append({ 79 | 'ph': phase, 80 | 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, 81 | 'pid': rank, 82 | 'tid': buf_idx + 1 83 | }) 84 | if phase == 'B': 85 | if event_type not in event_type_to_seq: 86 | 
event_type_to_seq[event_type] = 0 87 | gpu_events[-1].update({ 88 | 'name': event_type, 89 | 'cat': 'GPU', 90 | 'args': { 91 | 'rank': rank, 92 | 'buf_idx': buf_idx, 93 | 'seq': event_type_to_seq[event_type], 94 | 'rsvd_0': parsed_gpu_event['rsvd'], 95 | 'size_0': parsed_gpu_event['size'] 96 | } 97 | }) 98 | event_type_to_seq[event_type] += 1 99 | else: 100 | gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} 101 | delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] 102 | gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3 103 | raw_content_idx += raw_event_size 104 | return gpu_events 105 | 106 | def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): 107 | cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) 108 | raw_event_size = 16 109 | cpu_events = [] 110 | event_type_to_seq = {} 111 | 112 | fiber_is_usable = [] 113 | fiber_open_ts = [] 114 | slot_to_fiber_id = {} 115 | channel_shift = 1000 116 | 117 | with open(cpu_event_file_path, 'rb') as f: 118 | raw_content = f.read() 119 | raw_content_size = len(raw_content) 120 | raw_content_idx = 0 121 | while raw_content_idx < raw_content_size: 122 | parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 123 | event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']] 124 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 125 | cpu_events.append({ 126 | 'ph': phase, 127 | 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 128 | 'pid': rank 129 | }) 130 | slot = parsed_cpu_event['slot'] 131 | if phase == 'B': 132 | # Open fiber event 133 | fiber_id = 0 134 | while fiber_id < len(fiber_is_usable): 135 | if fiber_is_usable[fiber_id]: 136 | break 137 | fiber_id += 1 138 | if fiber_id == len(fiber_is_usable): 139 | fiber_is_usable.append(True) 140 | fiber_open_ts.append(0.0) 141 | slot_to_fiber_id[slot] = fiber_id 142 | fiber_open_ts[fiber_id] = cpu_events[-1]['ts'] 143 | fiber_is_usable[fiber_id] = False 144 | 145 | if event_type not in event_type_to_seq: 146 | event_type_to_seq[event_type] = 0 147 | cpu_events[-1].update({ 148 | 'name': event_type, 149 | 'cat': 'CPU', 150 | 'args': { 151 | 'rank': rank, 152 | 'channel': channel, 153 | 'slot': parsed_cpu_event['slot'], 154 | 'seq': event_type_to_seq[event_type], 155 | 'size_0': parsed_cpu_event['size'] 156 | } 157 | }) 158 | event_type_to_seq[event_type] += 1 159 | else: 160 | # Close fiber event 161 | fiber_id = slot_to_fiber_id[slot] 162 | slot_to_fiber_id.pop(slot) 163 | last_ts = fiber_open_ts[fiber_id] 164 | fiber_is_usable[fiber_id] = True 165 | 166 | delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts) 167 | cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']} 168 | cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. 
else cpu_events[-1]['args']['size'] / delta_time / 1e3 169 | 170 | cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift 171 | 172 | raw_content_idx += raw_event_size 173 | return cpu_events 174 | 175 | def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): 176 | files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] 177 | gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] 178 | cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')] 179 | 180 | ranks = list(set([int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files])) 181 | buf_indices = list(set([int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files])) 182 | channels = list(set([int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files])) 183 | 184 | trace = {'traceEvents': []} 185 | 186 | for rank in ranks: 187 | cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank) 188 | cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank) 189 | cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path) 190 | 191 | gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) 192 | gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) 193 | 194 | for buf_idx in buf_indices: 195 | gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) 196 | trace['traceEvents'].extend(gpu_events) 197 | 198 | for channel in channels: 199 | cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) 200 | trace['traceEvents'].extend(cpu_events) 201 | 202 | trace['traceEvents'].sort(key=lambda x : x['ts']) 203 | trace['displayTimeUnit'] = 'ns' 204 | 205 | os.makedirs(output_dir, exist_ok=True) 206 | with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f: 207 | json.dump(trace, f) 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser() 211 | parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.') 212 | parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.') 213 | parser.add_argument('--output_dir', type=str, required=True, help='Path to output directory.') 214 | args = parser.parse_args() 215 | 216 | npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path) 217 | convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def) 218 | -------------------------------------------------------------------------------- /npkit_result_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/NPKit/4cbb26e3c145d2f9b19892ee250a17e8a4e4e680/npkit_result_example.png -------------------------------------------------------------------------------- /rccl_samples/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This folder contains scripts for NPKit sample workflow for RCCL. The sample workflow first builds RCCL with NPKit enabled, then runs rccl-test to collect NPKit event dump files, and finally generates NPKit trace file. 4 | 5 | ## Dependencies 6 | 7 | [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) (with NPKit integrated) and [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests). 
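In addition, a Python 3 interpreter is needed for `npkit_trace_generator.py`. Once a run completes, the generated `npkit_event_trace.json` can be sanity-checked without a trace viewer; a minimal sketch (assuming the default file name produced by `npkit_trace_generator.py`, path adjusted to your `npkit_trace` directory) is:

```python
# Quick summary of an NPKit trace file.
import json
from collections import Counter

with open("npkit_event_trace.json") as f:
    trace = json.load(f)

events = trace["traceEvents"]
print("total events:", len(events))
print("ranks:", sorted({e["pid"] for e in events}))
print("top event types:", Counter(e["name"] for e in events if "name" in e).most_common(5))
```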
8 | 9 | ## Usage 10 | 11 | 1) Make sure parameters in `npkit_launcher.sh` are valid. Also note that currently NPKit only supports collecting non-overlapped events in GPU, and `NPKIT_FLAGS` should follow this rule. 12 | 13 | 2) Make sure `rccl_test` function in `npkit_runner.sh` is a valid command to run `rccl-tests` binary. Also note that currently NPKit only supports 1 GPU per process, so `-g 1` mode is required in `rccl-tests` commands. 14 | 15 | 3) Run command `bash npkit_launcher.sh`. 16 | 17 | 4) The generated trace file `npkit_event_trace.json` (zipped in `npkit_result.tar.gz`) is in [Google Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) and can be viewed by trace viewers. 18 | -------------------------------------------------------------------------------- /rccl_samples/npkit_launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # RCCL source directory. 7 | export RCCL_SRC_DIR="/mnt/rccl" 8 | 9 | # NPKit source directory. 10 | export NPKIT_SRC_DIR="/mnt/npkit" 11 | 12 | # Path to rccl-tests binary being profiled. 13 | export RCCL_TEST_BIN="/mnt/rccl-tests/build/all_reduce_perf" 14 | # export RCCL_TEST_BIN="/mnt/rccl-tests/build/alltoall_perf" 15 | 16 | # NPKit runtime directory, used to store logs and results. 17 | export NPKIT_RUN_DIR="/mnt/npkit_run" 18 | 19 | # Message size of RCCL operation. 20 | export RCCL_MSG_SIZE="16M" 21 | 22 | # RCCL communication algorithm. 23 | export RCCL_ALGO="Ring" 24 | # export RCCL_ALGO="Tree" 25 | 26 | # RCCL communication protocol. Simple and LL are supported. 27 | export RCCL_PROTO="Simple" 28 | # export RCCL_PROTO="LL" 29 | 30 | # Number of rccl-tests warmups. 31 | export RCCL_NUM_WARMUPS="0" 32 | 33 | # Number of rccl-tests iterations. 
34 | export RCCL_NUM_ITERS="10" 35 | 36 | NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" 37 | NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" 38 | 39 | export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" 40 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" 41 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" 42 | 43 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" 44 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" 45 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" 46 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" 47 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" 48 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" 49 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" 50 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" 51 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" 52 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" 53 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" 54 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" 55 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" 56 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" 57 | 58 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" 59 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" 60 | 61 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" 62 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" 63 | 64 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" 65 | # export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" 66 | 67 | # export 
NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" 68 | 69 | bash npkit_runner.sh 70 | -------------------------------------------------------------------------------- /rccl_samples/npkit_runner.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | set -x 5 | 6 | # Function that runs rccl-tests and collect NPKit traces. 7 | # rccl_test 8 | # 9 | # 10 | function rccl_test() { 11 | mpirun --allow-run-as-root \ 12 | -map-by ppr:16:node --bind-to numa \ 13 | -x LD_PRELOAD=$2/build/librccl.so:$LD_LIBRARY_PATH \ 14 | -x NCCL_DEBUG=WARN \ 15 | -x NCCL_ALGO=$4 \ 16 | -x NCCL_PROTO=$5 \ 17 | -x NPKIT_DUMP_DIR=$8 \ 18 | $1 -b $3 -e $3 -f 2 -g 1 -c 1 -w $6 -n $7 | tee $9/log.txt 19 | } 20 | 21 | # Tag of this NPKit run. 22 | npkit_run_tag=`basename ${RCCL_TEST_BIN}`"/${msg_size}/${RCCL_ALGO}/${RCCL_PROTO}" 23 | 24 | # Path to NPKit dump directory. 25 | npkit_dump_dir="${NPKIT_RUN_DIR}/npkit_dump/${npkit_run_tag}" 26 | 27 | # Path to NPKit post-process directory. 28 | npkit_trace_dir="${NPKIT_RUN_DIR}/npkit_trace/${npkit_run_tag}" 29 | 30 | # Path to NPKit result directory. 31 | npkit_result_dir="${NPKIT_RUN_DIR}/npkit_result/${npkit_run_tag}" 32 | 33 | # Build RCCL with NPKit 34 | cd ${RCCL_SRC_DIR} 35 | rm -rf build 36 | mkdir -p build 37 | cd build 38 | CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DNPKIT_FLAGS="${NPKIT_FLAGS}" .. 39 | make -j 40 | 41 | # Clean existing results 42 | rm -rf ${NPKIT_RUN_DIR} 43 | mkdir -p ${npkit_dump_dir} 44 | mkdir -p ${npkit_trace_dir} 45 | mkdir -p ${npkit_result_dir} 46 | 47 | # Run NPKit on all nodes. 48 | rccl_test ${RCCL_TEST_BIN} ${RCCL_SRC_DIR} ${RCCL_MSG_SIZE} ${RCCL_ALGO} ${RCCL_PROTO} ${RCCL_NUM_WARMUPS} ${RCCL_NUM_ITERS} ${npkit_dump_dir} ${npkit_result_dir} 49 | 50 | # Generate trace file 51 | cd ${NPKIT_SRC_DIR}/rccl_samples 52 | python3 npkit_trace_generator.py --npkit_dump_dir=${npkit_dump_dir} --npkit_event_header_path=${RCCL_SRC_DIR}/src/include/npkit/npkit_event.h --output_dir=${npkit_trace_dir} 53 | cd ${npkit_trace_dir} 54 | tar cvzf npkit_result.tar.gz npkit_event_trace.json 55 | mv npkit_result.tar.gz ${npkit_result_dir} 56 | -------------------------------------------------------------------------------- /rccl_samples/npkit_trace_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
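# Same post-processor as nccl_samples/npkit_trace_generator.py: it decodes the fixed 16-byte
# little-endian event records (1-byte id, 4-byte size, 3-byte rsvd/slot, 8-byte timestamp),
# scales CPU/GPU timestamps to microseconds, and emits npkit_event_trace.json.
# CPU (proxy) events are grouped per channel; concurrent slots within a channel are assigned
# to "fiber" lanes so overlapping NET_SEND/NET_RECV intervals render on separate tid rows
# (tid = fiber_id + (channel + 1) * 1000).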
3 | 4 | import argparse 5 | import os 6 | import json 7 | 8 | from queue import Queue 9 | 10 | def parse_npkit_event_header(npkit_event_header_path): 11 | npkit_event_def = {'id_to_type': {}, 'type_to_id': {}} 12 | with open(npkit_event_header_path, 'r') as f: 13 | lines = [x.strip() for x in f.readlines() if len(x.strip()) != 0] 14 | line_idx = 0 15 | while line_idx < len(lines): 16 | if lines[line_idx].startswith('#define NPKIT_EVENT_'): 17 | fields = lines[line_idx].split() 18 | if len(fields) == 3: 19 | event_type = fields[1] 20 | event_id = int(fields[2], 0) 21 | npkit_event_def['type_to_id'][event_type] = event_id 22 | npkit_event_def['id_to_type'][event_id] = event_type 23 | line_idx += 1 24 | return npkit_event_def 25 | 26 | def parse_gpu_clock_scale(gpu_clock_file_path): 27 | with open(gpu_clock_file_path, 'r') as f: 28 | freq_in_khz = f.read() 29 | return float(freq_in_khz) * 1e3 / 1e6 30 | 31 | def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): 32 | with open(cpu_clock_num_file_path, 'r') as f: 33 | num = float(f.read()) 34 | with open(cpu_clock_den_file_path, 'r') as f: 35 | den = float(f.read()) 36 | return den / num / 1e6 37 | 38 | def parse_gpu_event(event_bytes): 39 | return { 40 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 41 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 42 | 'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 43 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 44 | } 45 | 46 | def parse_cpu_event(event_bytes): 47 | return { 48 | 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), 49 | 'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False), 50 | 'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False), 51 | 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) 52 | } 53 | 54 | def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): 55 | gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) 56 | raw_event_size = 16 57 | curr_cpu_base_time = None 58 | curr_gpu_base_time = None 59 | gpu_events = [] 60 | event_type_to_seq = {} 61 | with open(gpu_event_file_path, 'rb') as f: 62 | raw_content = f.read() 63 | raw_content_size = len(raw_content) 64 | raw_content_idx = 0 65 | while raw_content_idx < raw_content_size: 66 | parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 67 | if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU': 68 | curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale 69 | curr_gpu_base_time = None 70 | elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': 71 | if curr_gpu_base_time is None: 72 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 73 | else: 74 | if curr_gpu_base_time is None: 75 | curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale 76 | event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] 77 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 78 | gpu_events.append({ 79 | 'ph': phase, 80 | 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, 81 | 'pid': rank, 82 | 'tid': buf_idx + 1 83 | }) 84 | if phase == 'B': 85 | if event_type not in event_type_to_seq: 86 | 
event_type_to_seq[event_type] = 0 87 | gpu_events[-1].update({ 88 | 'name': event_type, 89 | 'cat': 'GPU', 90 | 'args': { 91 | 'rank': rank, 92 | 'buf_idx': buf_idx, 93 | 'seq': event_type_to_seq[event_type], 94 | 'rsvd_0': parsed_gpu_event['rsvd'], 95 | 'size_0': parsed_gpu_event['size'] 96 | } 97 | }) 98 | event_type_to_seq[event_type] += 1 99 | else: 100 | gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} 101 | delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] 102 | gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3 103 | raw_content_idx += raw_event_size 104 | return gpu_events 105 | 106 | def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): 107 | cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) 108 | raw_event_size = 16 109 | cpu_events = [] 110 | event_type_to_seq = {} 111 | 112 | fiber_is_usable = [] 113 | fiber_open_ts = [] 114 | slot_to_fiber_id = {} 115 | channel_shift = 1000 116 | 117 | with open(cpu_event_file_path, 'rb') as f: 118 | raw_content = f.read() 119 | raw_content_size = len(raw_content) 120 | raw_content_idx = 0 121 | while raw_content_idx < raw_content_size: 122 | parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) 123 | event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']] 124 | phase = 'B' if event_type.endswith('_ENTRY') else 'E' 125 | cpu_events.append({ 126 | 'ph': phase, 127 | 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 128 | 'pid': rank 129 | }) 130 | slot = parsed_cpu_event['slot'] 131 | if phase == 'B': 132 | # Open fiber event 133 | fiber_id = 0 134 | while fiber_id < len(fiber_is_usable): 135 | if fiber_is_usable[fiber_id]: 136 | break 137 | fiber_id += 1 138 | if fiber_id == len(fiber_is_usable): 139 | fiber_is_usable.append(True) 140 | fiber_open_ts.append(0.0) 141 | slot_to_fiber_id[slot] = fiber_id 142 | fiber_open_ts[fiber_id] = cpu_events[-1]['ts'] 143 | fiber_is_usable[fiber_id] = False 144 | 145 | if event_type not in event_type_to_seq: 146 | event_type_to_seq[event_type] = 0 147 | cpu_events[-1].update({ 148 | 'name': event_type, 149 | 'cat': 'CPU', 150 | 'args': { 151 | 'rank': rank, 152 | 'channel': channel, 153 | 'slot': parsed_cpu_event['slot'], 154 | 'seq': event_type_to_seq[event_type], 155 | 'size_0': parsed_cpu_event['size'] 156 | } 157 | }) 158 | event_type_to_seq[event_type] += 1 159 | else: 160 | # Close fiber event 161 | fiber_id = slot_to_fiber_id[slot] 162 | slot_to_fiber_id.pop(slot) 163 | last_ts = fiber_open_ts[fiber_id] 164 | fiber_is_usable[fiber_id] = True 165 | 166 | delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts) 167 | cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']} 168 | cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. 
else cpu_events[-1]['args']['size'] / delta_time / 1e3 169 | 170 | cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift 171 | 172 | raw_content_idx += raw_event_size 173 | return cpu_events 174 | 175 | def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): 176 | files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] 177 | gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] 178 | cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')] 179 | 180 | ranks = list(set([int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files])) 181 | buf_indices = list(set([int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files])) 182 | channels = list(set([int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files])) 183 | 184 | trace = {'traceEvents': []} 185 | 186 | for rank in ranks: 187 | cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank) 188 | cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank) 189 | cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path) 190 | 191 | gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) 192 | gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) 193 | 194 | for buf_idx in buf_indices: 195 | gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) 196 | trace['traceEvents'].extend(gpu_events) 197 | 198 | for channel in channels: 199 | cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) 200 | trace['traceEvents'].extend(cpu_events) 201 | 202 | trace['traceEvents'].sort(key=lambda x : x['ts']) 203 | trace['displayTimeUnit'] = 'ns' 204 | 205 | os.makedirs(output_dir, exist_ok=True) 206 | with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f: 207 | json.dump(trace, f) 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser() 211 | parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.') 212 | parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.') 213 | parser.add_argument('--output_dir', type=str, required=True, help='Path to output directory.') 214 | args = parser.parse_args() 215 | 216 | npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path) 217 | convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def) 218 | --------------------------------------------------------------------------------