├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── pip_package ├── CHANGELOG.md ├── cloud_accelerator_diagnostics │ ├── __init__.py │ ├── src │ │ └── tensorboard_uploader │ │ │ ├── tensorboard.py │ │ │ └── uploader.py │ └── tests │ │ └── tensorboard_uploader │ │ ├── tensorboard_test.py │ │ └── uploader_test.py └── pyproject.toml └── tpu_info ├── README.md ├── pyproject.toml └── tpu_info ├── __init__.py ├── args.py ├── cli.py ├── device.py ├── metrics.py └── proto ├── __init__.py └── tpu_metric_service.proto /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 16 | # How to contribute 17 | 18 | We'd love to accept your patches and contributions to this project. 19 | 20 | ## Before you begin 21 | 22 | ### Sign our Contributor License Agreement 23 | 24 | Contributions to this project must be accompanied by a 25 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 26 | You (or your employer) retain the copyright to your contribution; this simply 27 | gives us permission to use and redistribute your contributions as part of the 28 | project. 29 | 30 | If you or your current employer have already signed the Google CLA (even if it 31 | was for a different project), you probably don't need to do it again. 32 | 33 | Visit <https://cla.developers.google.com/> to see your current agreements or to 34 | sign a new one. 35 | 36 | ### Review our community guidelines 37 | 38 | This project follows 39 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 40 | 41 | ## Contribution process 42 | 43 | ### Code reviews 44 | 45 | All submissions, including submissions by project members, require review. We 46 | use GitHub pull requests for this purpose. Consult 47 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 48 | information on using pull requests. 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 16 | # Cloud Accelerator Diagnostics 17 | 18 | ## Overview 19 | Cloud Accelerator Diagnostics is a library to monitor, debug and profile the workloads running on Cloud accelerators like TPUs and GPUs. Additionally, this library provides a streamlined approach to automatically upload data to Tensorboard Experiments in Vertex AI. 
The package allows users to create a Tensorboard instance and Experiments in Vertex AI, and upload logs to them. 20 | 21 | ## Installation 22 | To install the Cloud Accelerator Diagnostics package, run the following command: 23 | 24 | ```bash 25 | pip install cloud-accelerator-diagnostics 26 | ``` 27 | 28 | ## Automating Uploads to Vertex AI Tensorboard 29 | Before creating and uploading logs to Vertex AI Tensorboard, you must enable the [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console. Also, make sure to assign the [Vertex AI User IAM role](https://cloud.google.com/vertex-ai/docs/general/access-control#aiplatform.user) to the service account that will call the APIs in the `cloud-accelerator-diagnostics` package. This is required to create and access the Vertex AI Tensorboard in the Google Cloud console. 30 | 31 | ### Create Vertex AI Tensorboard 32 | To learn about Vertex AI Tensorboard, visit this [page](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-introduction). 33 | 34 | Here is an example script to create a Vertex AI Tensorboard instance with the name `test-instance` in Google Cloud Project `test-project`. 35 | 36 | Note: Vertex AI is available only in [these](https://cloud.google.com/vertex-ai/docs/general/locations#available-regions) regions. 37 | 38 | ```python 39 | from cloud_accelerator_diagnostics import tensorboard 40 | 41 | instance_id = tensorboard.create_instance(project="test-project", 42 | location="us-central1", 43 | tensorboard_name="test-instance") 44 | print("Vertex AI Tensorboard created: ", instance_id) 45 | ``` 46 | 47 | ### Create Vertex AI Experiment 48 | To learn about Vertex AI Experiments, visit this [page](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments). 49 | 50 | The following script will create a Vertex AI Experiment named `test-experiment` in your Google Cloud Project `test-project`. Here's how it handles attaching a Tensorboard instance: 51 | 52 | **Scenario 1: Tensorboard Instance Exists** 53 | 54 | If a Tensorboard instance named `test-instance` already exists in your project, the script will attach it to the new Experiment. 55 | 56 | **Scenario 2: No Tensorboard Instance Present** 57 | 58 | If `test-instance` does not exist, the script will create a new Tensorboard instance with that name and attach it to the Experiment. 59 | 60 | ```python 61 | from cloud_accelerator_diagnostics import tensorboard 62 | 63 | instance_id, tensorboard_url = tensorboard.create_experiment(project="test-project", 64 | location="us-central1", 65 | experiment_name="test-experiment", 66 | tensorboard_name="test-instance") 67 | 68 | print("View your Vertex AI Tensorboard here: ", tensorboard_url) 69 | ``` 70 | 71 | If a Vertex AI Experiment with the specified name exists, a new one will not be created, and the existing Experiment's URL will be returned. 72 | 73 | Note: You can attach multiple Vertex AI Experiments to a single Vertex AI Tensorboard. 74 | 75 | ### Upload Logs to Vertex AI Tensorboard 76 | The following script will continuously monitor for new data in the directory (`logdir`) and upload it to your Vertex AI Tensorboard Experiment. Note that after calling `start_upload_to_tensorboard()`, the thread will be kept alive even if an exception is thrown. To ensure the thread gets shut down, put any code after `start_upload_to_tensorboard()` and before `stop_upload_to_tensorboard()` in a `try` block, and call `stop_upload_to_tensorboard()` in a `finally` block. 
This example shows how you can upload the [profile logs](https://jax.readthedocs.io/en/latest/profiling.html#programmatic-capture) collected for your JAX workload to Vertex AI Tensorboard. 77 | 78 | ```python 79 | import jax 80 | from cloud_accelerator_diagnostics import uploader 81 | 82 | uploader.start_upload_to_tensorboard(project="test-project", 83 | location="us-central1", 84 | experiment_name="test-experiment", 85 | tensorboard_name="test-instance", 86 | logdir="gs://test-directory/testing") 87 | try: 88 | jax.profiler.start_trace("gs://test-directory/testing") 89 | # run the JAX workload you want to profile here 90 | jax.profiler.stop_trace() 91 | finally: 92 | uploader.stop_upload_to_tensorboard() 93 | ``` 94 | -------------------------------------------------------------------------------- /pip_package/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 16 | # Changelog 17 | 18 | 33 | 34 | # [0.1.1] - 2024-10-14 35 | * Version 0.1.1 of `cloud-accelerator-diagnostics` PyPI package 36 | * Features: 37 | * Use Vertex AI's continuous uploader directly 38 | 39 | # [0.1.0] - 2024-03-20 40 | * Initial release of `cloud-accelerator-diagnostics` PyPI package 41 | * Features: 42 | * Create a Vertex AI Tensorboard instance in Google Cloud Project 43 | * Create a Vertex AI Experiment in Google Cloud Project 44 | * Automatically upload logs to Vertex AI Tensorboard Experiment 45 | -------------------------------------------------------------------------------- /pip_package/cloud_accelerator_diagnostics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from cloud_accelerator_diagnostics.src.tensorboard_uploader import tensorboard 16 | from cloud_accelerator_diagnostics.src.tensorboard_uploader import uploader 17 | -------------------------------------------------------------------------------- /pip_package/cloud_accelerator_diagnostics/src/tensorboard_uploader/tensorboard.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Tensorboard module. 16 | 17 | This module provides the functionality to create a Tensorboard instance and 18 | Experiment in Vertex AI. 
19 | """ 20 | 21 | import logging 22 | 23 | from google.cloud.aiplatform import aiplatform 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | # The API URI for accessing the Tensorboard UI 29 | WEB_SERVER_URI = "tensorboard.googleusercontent.com" 30 | 31 | 32 | def create_instance(project, location, tensorboard_name): 33 | """Creates a new Tensorboard instance in Vertex AI. 34 | 35 | Args: 36 | project (str): Google Cloud Project to create the Tensorboard instance to. 37 | location (str): Location to create the Tensorboard instance to. See 38 | https://cloud.google.com/vertex-ai/docs/general/locations#available-regions 39 | for the list of available VertexAI locations. 40 | tensorboard_name (str): The user-defined name of the Tensorboard. The name 41 | can be up to 128 characters long and can be consist of any UTF-8 42 | characters. 43 | 44 | Returns: 45 | str: The Tensorboard instance identifier. 46 | """ 47 | try: 48 | aiplatform.init(project=project, location=location) 49 | tensorboard_identifiers = get_instance_identifiers(tensorboard_name) 50 | if not tensorboard_identifiers: 51 | # create a new Tensorboard instance if an instance doesn't exist 52 | logger.info( 53 | "Creating a Tensorboard instance with the name: %s", tensorboard_name 54 | ) 55 | tensorboard = aiplatform.Tensorboard.create( 56 | display_name=tensorboard_name, 57 | project=project, 58 | location=location, 59 | ) 60 | return tensorboard.name 61 | else: 62 | logger.info( 63 | "Tensorboard instance with the name: %s already exist in project: %s" 64 | " and location: %s. Not creating a new Tensorboard instance.", 65 | tensorboard_name, 66 | project, 67 | location, 68 | ) 69 | # return the first Tensorboard instance even if multiple instances exist 70 | return tensorboard_identifiers[0] 71 | except (ValueError, Exception): 72 | logger.exception("Error while creating Tensorboard instance.") 73 | return None 74 | 75 | 76 | def create_experiment(project, location, experiment_name, tensorboard_name): 77 | """Creates a new Tensorboard Experiment in VertexAI. 78 | 79 | Args: 80 | project (str): Google Cloud Project to create the Tensorboard experiment to. 81 | location (str): Location to create the Tensorboard experiment to. See 82 | https://cloud.google.com/vertex-ai/docs/general/locations#available-regions 83 | for the list of available VertexAI locations. 84 | experiment_name (str): The name of the Tensorboard experiment to create. 85 | This value should be 1-128 characters, and valid characters are 86 | /[a-z][0-9]-/. 87 | tensorboard_name (str): The name of the Tensorboard to create the 88 | Tensorboard Experiment in. 89 | 90 | Returns: 91 | str: The Tensorboard instance identifier. 92 | str: The URL to access the Tensorboard UI. 93 | """ 94 | try: 95 | aiplatform.init(project=project, location=location) 96 | 97 | # Get the identifier for the Tensorboard instance. If no Tensorboard 98 | # instance is present, then create a new instance. 99 | tensorboard_identifiers = get_instance_identifiers(tensorboard_name) 100 | if not tensorboard_identifiers: 101 | logger.info( 102 | "No Tensorboard instance present in the project: %s. 
Creating" 103 | " a new Tensorboard instance with the name: %s", 104 | project, 105 | tensorboard_name, 106 | ) 107 | tensorboard_id = create_instance(project, location, tensorboard_name) 108 | # create_instance() failed to create a Tensorboard instance 109 | if tensorboard_id is None: 110 | return None, None 111 | else: 112 | # get the first Tensorboard instance even if multiple instances exist 113 | tensorboard_id = tensorboard_identifiers[0] 114 | 115 | # check if an experiment already exist for the tensorboard_id 116 | experiment = get_experiment(tensorboard_id, experiment_name) 117 | if experiment is not None: 118 | logger.info( 119 | "Experiment with the name: %s already exist in the project: %s." 120 | " Not creating a new Experiment.", 121 | experiment_name, 122 | project, 123 | ) 124 | else: 125 | logger.info( 126 | "Creating Experiment for Tensorboard instance id: %s", tensorboard_id 127 | ) 128 | experiment = aiplatform.TensorboardExperiment.create( 129 | tensorboard_experiment_id=experiment_name, 130 | display_name=experiment_name, 131 | tensorboard_name=tensorboard_id, 132 | ) 133 | experiment_resource_name = experiment.resource_name 134 | tensorboard_url = "https://{}.{}/experiment/{}".format( 135 | location, 136 | WEB_SERVER_URI, 137 | experiment_resource_name.replace("/", "+"), 138 | ) 139 | return tensorboard_id, tensorboard_url 140 | except (ValueError, Exception): 141 | logger.exception("Error while creating Tensorboard Experiment.") 142 | return None, None 143 | 144 | 145 | def get_instance_identifiers(tensorboard_name): 146 | """Retrieves a list of Tensorboard instance identifiers that match the given `tensorboard_name`. 147 | 148 | Args: 149 | tensorboard_name (str): The name of the Tensorboard instance to search for. 150 | 151 | Returns: 152 | list: A list of Tensorboard instance identifiers that match 153 | `tensorboard_name`. 154 | """ 155 | tensorboard_instances = aiplatform.tensorboard.Tensorboard.list() 156 | tensorboard_identifiers = [] 157 | for tensorboard in tensorboard_instances: 158 | if tensorboard.display_name == tensorboard_name: 159 | tensorboard_identifiers.append(tensorboard.name) 160 | return tensorboard_identifiers 161 | 162 | 163 | def get_experiment(tensorboard_id, experiment_name): 164 | """Retrieves the experiment object if an experiment with the given `experiment_name` exists for the given `tensorboard_id`. 165 | 166 | Args: 167 | tensorboard_id (str): The id of Tensorboard instance. 168 | experiment_name (str): The name of Tensorboard experiment. 169 | 170 | Returns: 171 | TensorboardExperiment object if an experiment with the given name exist 172 | in the project, None otherwise. 173 | """ 174 | experiment_list = aiplatform.tensorboard.TensorboardExperiment.list( 175 | tensorboard_id 176 | ) 177 | for experiment in experiment_list: 178 | if experiment.display_name == experiment_name: 179 | return experiment 180 | return None 181 | -------------------------------------------------------------------------------- /pip_package/cloud_accelerator_diagnostics/src/tensorboard_uploader/uploader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Uploader module. 16 | 17 | This module provides the functionality to upload data to Tensorboard in Vertex 18 | AI. 19 | """ 20 | 21 | import logging 22 | 23 | from cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader import tensorboard 24 | from google.cloud.aiplatform import aiplatform 25 | 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def start_upload_to_tensorboard( 31 | project, 32 | location, 33 | experiment_name, 34 | tensorboard_name, 35 | logdir, 36 | ): 37 | """Continuously listens for new data in the logdir and uploads it when it appears. 38 | 39 | Note that after calling `start_upload_to_tensorboard()`, the thread will be kept 40 | alive even if an exception is thrown. To ensure the thread gets shut down, put 41 | any code after `start_upload_to_tensorboard()` and before 42 | `stop_upload_to_tensorboard()` in a `try` block, and call 43 | `stop_upload_to_tensorboard()` in a `finally` block. 44 | 45 | Sample usage: 46 | ``` 47 | start_upload_to_tensorboard(project='test-project', 48 | location='us-central1', 49 | experiment_name='test-experiment', 50 | tensorboard_name='test-instance', 51 | logdir='test-logdir') 52 | try: 53 | # your code here 54 | finally: 55 | stop_upload_to_tensorboard() 56 | ``` 57 | 58 | Args: 59 | project (str): Google Cloud Project that has the Tensorboard instance. 60 | location (str): Location where the Tensorboard instance is present. 61 | experiment_name (str): The name of the Tensorboard experiment. 62 | tensorboard_name (str): The name of the Tensorboard instance. 63 | logdir (str): Path of the log directory to upload to Tensorboard. 64 | """ 65 | try: 66 | aiplatform.init(project=project, location=location) 67 | 68 | # Skip uploading logs to VertexAI if a Tensorboard instance doesn't exist 69 | tensorboard_identifiers = tensorboard.get_instance_identifiers( 70 | tensorboard_name 71 | ) 72 | if not tensorboard_identifiers: 73 | logger.error( 74 | "No Tensorboard instance with the name %s present in the project %s." 75 | " Skipping uploading logs to VertexAI.", 76 | tensorboard_name, 77 | project, 78 | ) 79 | return 80 | else: 81 | # get the first Tensorboard instance even if multiple instances exist 82 | tensorboard_id = tensorboard_identifiers[0] 83 | 84 | # Skip uploading logs to VertexAI if a Tensorboard experiment doesn't exist 85 | experiment = tensorboard.get_experiment(tensorboard_id, experiment_name) 86 | if experiment is None: 87 | logger.error( 88 | "No Tensorboard experiment with the name %s present in the project" 89 | " %s. Skipping uploading logs to VertexAI.", 90 | experiment_name, 91 | project, 92 | ) 93 | return 94 | 95 | start_upload(tensorboard_id, experiment_name, logdir) 96 | except Exception as e: 97 | logger.exception( 98 | "Error while uploading logs to Tensorboard. This will not impact the" 99 | " workload. 
Error: %s", 100 | e, 101 | ) 102 | 103 | 104 | def stop_upload_to_tensorboard(): 105 | """Stops the thread created by `start_upload_to_tensorboard()`.""" 106 | logger.info("Logs will no longer be uploaded to Tensorboard.") 107 | aiplatform.end_upload_tb_log() 108 | 109 | 110 | def start_upload(tensorboard_id, experiment_name, logdir): 111 | """Starts uploading logs to Tensorboard instance in VertexAI. 112 | 113 | Args: 114 | tensorboard_id (str): The id of Tensorboard instance. 115 | experiment_name (str): The name of the Tensorboard experiment. 116 | logdir (str): path of the log directory to upload to Tensorboard. 117 | """ 118 | logger.info("Starting uploading of logs to Tensorboard.") 119 | try: 120 | aiplatform.start_upload_tb_log( 121 | tensorboard_id=tensorboard_id, 122 | tensorboard_experiment_name=experiment_name, 123 | logdir=logdir, 124 | ) 125 | except Exception as e: 126 | logger.exception( 127 | "Error while uploading logs to Tensorboard. This will not impact the" 128 | " workload. Error: %s", 129 | e, 130 | ) 131 | -------------------------------------------------------------------------------- /pip_package/cloud_accelerator_diagnostics/tests/tensorboard_uploader/tensorboard_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from absl.testing import absltest 16 | from cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader import tensorboard 17 | 18 | 19 | class TensorboardTest(absltest.TestCase): 20 | 21 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard.create") 22 | @absltest.mock.patch( 23 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 24 | ) 25 | def testCreateInstanceWhenNoInstanceExist( 26 | self, mock_tensorboard_list, mock_tensorboard_create 27 | ): 28 | mock_tensorboard_list.return_value = [] 29 | mock_tensorboard_create.return_value.name = "123" 30 | 31 | instance_id = tensorboard.create_instance( 32 | "test-project", "us-central1", "test-instance" 33 | ) 34 | 35 | mock_tensorboard_list.assert_called_once() 36 | mock_tensorboard_create.assert_called_once_with( 37 | project="test-project", 38 | location="us-central1", 39 | display_name="test-instance", 40 | ) 41 | self.assertEqual(instance_id, "123") 42 | 43 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard") 44 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard.create") 45 | @absltest.mock.patch( 46 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 47 | ) 48 | def testCreateInstanceWhenSameNameInstanceExist( 49 | self, mock_tensorboard_list, mock_tensorboard_create, mock_tensorboard 50 | ): 51 | mock_tensorboard_instance = mock_tensorboard.return_value 52 | mock_tensorboard_instance.display_name = "test-instance" 53 | mock_tensorboard_list.return_value = [mock_tensorboard_instance] 54 | 55 | instance_id = tensorboard.create_instance( 56 | "test-project", "us-central1", "test-instance" 57 | ) 58 | 59 | mock_tensorboard_list.assert_called_once() 60 | mock_tensorboard_create.assert_not_called() 61 | self.assertEqual(instance_id, mock_tensorboard_instance.name) 62 | 63 | def testCreateInstanceForUnsupportedRegion(self): 64 | with self.assertLogs(level="ERROR") as log: 65 | instance_id = tensorboard.create_instance( 66 | "test-project", "us-central2", "test-instance" 67 | ) 68 | 69 | self.assertRegex( 70 | log.output[0], "ValueError: Unsupported region for Vertex AI" 71 | ) 72 | self.assertIsNone(instance_id) 73 | 74 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard.create") 75 | @absltest.mock.patch( 76 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 77 | ) 78 | def testCreateInstanceWhenExceptionIsThrown( 79 | self, mock_tensorboard_list, mock_tensorboard_create 80 | ): 81 | mock_tensorboard_list.return_value = [] 82 | mock_tensorboard_create.return_value = Exception("Exception is thrown...") 83 | 84 | with self.assertLogs(level="ERROR"): 85 | instance_id = tensorboard.create_instance( 86 | "test-project", "us-central1", "test-instance" 87 | ) 88 | 89 | mock_tensorboard_list.assert_called_once() 90 | mock_tensorboard_create.assert_called_once_with( 91 | project="test-project", 92 | location="us-central1", 93 | display_name="test-instance", 94 | ) 95 | self.assertIsNone(instance_id) 96 | 97 | @absltest.mock.patch( 98 | "google.cloud.aiplatform.aiplatform.tensorboard.TensorboardExperiment.list" 99 | ) 100 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard") 101 | @absltest.mock.patch( 102 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 103 | ) 104 | @absltest.mock.patch( 105 | "google.cloud.aiplatform.aiplatform.TensorboardExperiment.create" 106 | ) 107 | def testCreateExperimentWhenTensorboardInstanceExist( 108 | self, 109 | 
mock_experiment_create, 110 | mock_tensorboard_list, 111 | mock_tensorboard, 112 | mock_experiment_list, 113 | ): 114 | mock_tensorboard_instance = mock_tensorboard.return_value 115 | mock_tensorboard_instance.display_name = "test-instance" 116 | mock_tensorboard_instance.name = "123" 117 | mock_tensorboard_list.return_value = [mock_tensorboard_instance] 118 | mock_experiment_list.return_value = [] 119 | expected_resource_name = "projects/770040921623/locations/us-central1/tensorboards/123/experiments/test-experiment" 120 | mock_experiment_create.return_value.resource_name = expected_resource_name 121 | expected_tensorboard_url = ( 122 | "https://us-central1.tensorboard.googleusercontent.com/experiment/" 123 | + expected_resource_name.replace("/", "+") 124 | ) 125 | 126 | instance_id, tensorboard_url = tensorboard.create_experiment( 127 | "test-project", "us-central1", "test-experiment", "test-instance" 128 | ) 129 | 130 | mock_tensorboard_list.assert_called_once() 131 | mock_experiment_create.assert_called_once_with( 132 | tensorboard_experiment_id="test-experiment", 133 | tensorboard_name="123", 134 | display_name="test-experiment", 135 | ) 136 | self.assertEqual(instance_id, "123") 137 | self.assertEqual(tensorboard_url, expected_tensorboard_url) 138 | 139 | @absltest.mock.patch( 140 | "google.cloud.aiplatform.aiplatform.tensorboard.TensorboardExperiment.list" 141 | ) 142 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard.create") 143 | @absltest.mock.patch( 144 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 145 | ) 146 | @absltest.mock.patch( 147 | "google.cloud.aiplatform.aiplatform.TensorboardExperiment.create" 148 | ) 149 | def testCreateExperimentWhenNoTensorboardInstanceExist( 150 | self, 151 | mock_experiment_create, 152 | mock_tensorboard_list, 153 | mock_tensorboard_create, 154 | mock_experiment_list, 155 | ): 156 | mock_tensorboard_list.return_value = [] 157 | mock_tensorboard_create.return_value.name = "123" 158 | mock_experiment_list.return_value = [] 159 | expected_resource_name = "projects/770040921623/locations/us-central1/tensorboards/123/experiments/test-experiment" 160 | mock_experiment_create.return_value.resource_name = expected_resource_name 161 | expected_tensorboard_url = ( 162 | "https://us-central1.tensorboard.googleusercontent.com/experiment/" 163 | + expected_resource_name.replace("/", "+") 164 | ) 165 | 166 | instance_id, tensorboard_url = tensorboard.create_experiment( 167 | "test-project", "us-central1", "test-experiment", "test-instance" 168 | ) 169 | 170 | mock_tensorboard_list.assert_called() 171 | mock_tensorboard_create.assert_called_once_with( 172 | project="test-project", 173 | location="us-central1", 174 | display_name="test-instance", 175 | ) 176 | mock_experiment_create.assert_called_once_with( 177 | tensorboard_experiment_id="test-experiment", 178 | tensorboard_name="123", 179 | display_name="test-experiment", 180 | ) 181 | self.assertEqual(instance_id, "123") 182 | self.assertEqual(tensorboard_url, expected_tensorboard_url) 183 | 184 | @absltest.mock.patch( 185 | "google.cloud.aiplatform.aiplatform.TensorboardExperiment" 186 | ) 187 | @absltest.mock.patch( 188 | "google.cloud.aiplatform.aiplatform.tensorboard.TensorboardExperiment.list" 189 | ) 190 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard") 191 | @absltest.mock.patch( 192 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 193 | ) 194 | @absltest.mock.patch( 195 | 
"google.cloud.aiplatform.aiplatform.TensorboardExperiment.create" 196 | ) 197 | def testCreateExperimentWhenTensorboardInstanceAndExperimentExist( 198 | self, 199 | mock_experiment_create, 200 | mock_tensorboard_list, 201 | mock_tensorboard, 202 | mock_experiment_list, 203 | mock_experiment, 204 | ): 205 | mock_tensorboard_instance = mock_tensorboard.return_value 206 | mock_tensorboard_instance.display_name = "test-instance" 207 | mock_tensorboard_instance.name = "123" 208 | mock_tensorboard_list.return_value = [mock_tensorboard_instance] 209 | expected_resource_name = "projects/770040921623/locations/us-central1/tensorboards/123/experiments/test-experiment" 210 | expected_tensorboard_url = ( 211 | "https://us-central1.tensorboard.googleusercontent.com/experiment/" 212 | + expected_resource_name.replace("/", "+") 213 | ) 214 | mock_experiment_instance = mock_experiment.return_value 215 | mock_experiment_instance.display_name = "test-experiment" 216 | mock_experiment_instance.resource_name = expected_resource_name 217 | mock_experiment_list.return_value = [mock_experiment_instance] 218 | 219 | instance_id, tensorboard_url = tensorboard.create_experiment( 220 | "test-project", "us-central1", "test-experiment", "test-instance" 221 | ) 222 | 223 | mock_tensorboard_list.assert_called_once() 224 | mock_experiment_create.assert_not_called() 225 | self.assertEqual(instance_id, "123") 226 | self.assertEqual(tensorboard_url, expected_tensorboard_url) 227 | 228 | def testCreateExperimentForUnsupportedRegion(self): 229 | with self.assertLogs(level="ERROR") as log: 230 | instance_id, tensorboard_url = tensorboard.create_experiment( 231 | "test-project", "us-central2", "test-experiment", "test-instance" 232 | ) 233 | 234 | self.assertRegex( 235 | log.output[0], "ValueError: Unsupported region for Vertex AI" 236 | ) 237 | self.assertIsNone(instance_id) 238 | self.assertIsNone(tensorboard_url) 239 | 240 | @absltest.mock.patch("google.cloud.aiplatform.aiplatform.Tensorboard.create") 241 | @absltest.mock.patch( 242 | "google.cloud.aiplatform.aiplatform.tensorboard.Tensorboard.list" 243 | ) 244 | @absltest.mock.patch( 245 | "google.cloud.aiplatform.aiplatform.TensorboardExperiment.create" 246 | ) 247 | def testCreateExperimentWhenCreateInstanceFails( 248 | self, 249 | mock_experiment_create, 250 | mock_tensorboard_list, 251 | mock_tensorboard_create, 252 | ): 253 | mock_tensorboard_list.return_value = [] 254 | mock_tensorboard_create.return_value = Exception("Exception is thrown...") 255 | 256 | with self.assertLogs(level="ERROR") as log: 257 | instance_id, tensorboard_url = tensorboard.create_experiment( 258 | "test-project", "us-central1", "test-experiment", "test-instance" 259 | ) 260 | 261 | mock_tensorboard_list.assert_called() 262 | mock_tensorboard_create.assert_called_once_with( 263 | project="test-project", 264 | location="us-central1", 265 | display_name="test-instance", 266 | ) 267 | mock_experiment_create.assert_not_called() 268 | self.assertRegex( 269 | log.output[0], "Error while creating Tensorboard instance." 
270 | ) 271 | self.assertIsNone(instance_id) 272 | self.assertIsNone(tensorboard_url) 273 | 274 | 275 | if __name__ == "__main__": 276 | absltest.main() 277 | -------------------------------------------------------------------------------- /pip_package/cloud_accelerator_diagnostics/tests/tensorboard_uploader/uploader_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import threading 16 | 17 | from absl.testing import absltest 18 | from cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader import uploader 19 | 20 | 21 | class UploaderTest(absltest.TestCase): 22 | 23 | @absltest.mock.patch( 24 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.tensorboard" 25 | ) 26 | @absltest.mock.patch( 27 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.aiplatform" 28 | ) 29 | def testWhenUploadToTensorboardThenVertexUploaderIsCalled( 30 | self, 31 | mock_aiplatform, 32 | mock_tensorboard, 33 | ): 34 | # given 35 | mock_tensorboard.get_instance_identifiers.return_value = ["test_experiment"] 36 | mock_tensorboard.get_experiment.return_value = "test-experiment" 37 | 38 | # when 39 | uploader.start_upload_to_tensorboard( 40 | "test-project", 41 | "us-central1", 42 | "test-experiment", 43 | "test-instance", 44 | "logdir", 45 | ) 46 | 47 | # then 48 | mock_aiplatform.init.assert_called_once_with( 49 | project="test-project", location="us-central1" 50 | ) 51 | mock_aiplatform.start_upload_tb_log.assert_called_once_with( 52 | tensorboard_id="test_experiment", 53 | tensorboard_experiment_name="test-experiment", 54 | logdir="logdir", 55 | ) 56 | 57 | @absltest.mock.patch( 58 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.tensorboard" 59 | ) 60 | @absltest.mock.patch( 61 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.aiplatform" 62 | ) 63 | def testWhenNoTensorboardExistsThenVertexUploaderNotCalled( 64 | self, 65 | mock_aiplatform, 66 | mock_tensorboard, 67 | ): 68 | # given 69 | mock_tensorboard.get_instance_identifiers.return_value = [] 70 | 71 | # when 72 | with self.assertLogs(level="ERROR") as log: 73 | uploader.start_upload_to_tensorboard( 74 | "test-project", 75 | "us-central1", 76 | "test-experiment", 77 | "test-instance", 78 | "logdir", 79 | ) 80 | 81 | # then 82 | self.assertEqual(threading.active_count(), 1) 83 | self.assertRegex( 84 | log.output[0], 85 | "No Tensorboard instance with the name test-instance present in the" 86 | " project test-project.", 87 | ) 88 | mock_aiplatform.init.assert_called_once_with( 89 | project="test-project", location="us-central1" 90 | ) 91 | mock_aiplatform.start_upload_tb_log.assert_not_called() 92 | 93 | @absltest.mock.patch( 
94 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.tensorboard" 95 | ) 96 | @absltest.mock.patch( 97 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.aiplatform" 98 | ) 99 | def testWhenNoExperimentExistsThenVertexUploaderNotCalled( 100 | self, 101 | mock_aiplatform, 102 | mock_tensorboard, 103 | ): 104 | # given 105 | mock_tensorboard.get_instance_identifiers.return_value = ["test_experiment"] 106 | mock_tensorboard.get_experiment.return_value = None 107 | 108 | # when 109 | with self.assertLogs(level="ERROR") as log: 110 | uploader.start_upload_to_tensorboard( 111 | "test-project", 112 | "us-central1", 113 | "test-experiment", 114 | "test-instance", 115 | "logdir", 116 | ) 117 | 118 | # then 119 | self.assertRegex( 120 | log.output[0], 121 | "No Tensorboard experiment with the name test-experiment present in" 122 | " the project test-project.", 123 | ) 124 | mock_aiplatform.init.assert_called_once_with( 125 | project="test-project", location="us-central1" 126 | ) 127 | mock_aiplatform.start_upload_tb_log.assert_not_called() 128 | 129 | @absltest.mock.patch( 130 | "cloud_accelerator_diagnostics.pip_package.cloud_accelerator_diagnostics.src.tensorboard_uploader.uploader.aiplatform" 131 | ) 132 | def testWhenStopUploadToTensorboardIsCalledThenVertexUploadIsStopped( 133 | self, 134 | mock_aiplatform, 135 | ): 136 | # when 137 | uploader.stop_upload_to_tensorboard() 138 | 139 | # then 140 | mock_aiplatform.end_upload_tb_log.assert_called_once() 141 | 142 | 143 | if __name__ == "__main__": 144 | absltest.main() 145 | -------------------------------------------------------------------------------- /pip_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [project] 16 | name = "cloud-accelerator-diagnostics" 17 | version = "0.1.1" 18 | authors = [ 19 | { name="Cloud TPU Team", email="cloud-tpu-eng@google.com" }, 20 | ] 21 | description = "Monitor, debug and profile the jobs running on Cloud accelerators like TPUs and GPUs." 
22 | readme = "README.md" 23 | requires-python = ">=3.8" 24 | license = {text = "Apache-2.0"} 25 | classifiers = [ 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | ] 31 | keywords = [] 32 | 33 | # pip dependencies installed with `pip install -e .` 34 | dependencies = [ 35 | "google-cloud-aiplatform[tensorboard]" 36 | ] 37 | 38 | [project.urls] 39 | "Homepage" = "https://github.com/google/cloud-accelerator-diagnostics" 40 | "Bug Tracker" = "https://github.com/google/cloud-accelerator-diagnostics/issues" 41 | 42 | [build-system] 43 | # Build system specifies which backend is used to build/install the project 44 | requires = ["flit_core >=3.8,<4"] 45 | build-backend = "flit_core.buildapi" 46 | 47 | [tool.flit.sdist] 48 | # Flit specific options (files to exclude from the PyPI package) 49 | exclude = [ 50 | # Do not release test files on PyPI 51 | "tests/*_test.py", 52 | ] 53 | -------------------------------------------------------------------------------- /tpu_info/README.md: -------------------------------------------------------------------------------- 1 | 16 | # `tpu-info` CLI 17 | 18 | `tpu-info` is a simple CLI tool for detecting Cloud TPU devices and reading 19 | runtime metrics from `libtpu`, including memory usage. 20 | 21 | Note: to access `libtpu` utilization metrics, you must have a workload running 22 | with a supported ML framework, such as JAX or PyTorch/XLA. See the 23 | [Usage](#usage) section for more information. 24 | 25 | ## Installing 26 | 27 | Install the latest release using `pip`: 28 | 29 | ```bash 30 | pip install tpu-info 31 | ``` 32 | 33 | Alternatively, install `tpu-info` from source: 34 | 35 | ```bash 36 | pip install git+https://github.com/google/cloud-accelerator-diagnostics/#subdirectory=tpu_info 37 | ``` 38 | 39 | ## Usage 40 | 41 | To view current TPU utilization data, `tpu-info` requires a running TPU workload 42 | with a supported ML framework[^1], such as JAX or PyTorch/XLA. For example: 43 | 44 | ```python 45 | # JAX 46 | >>> import jax 47 | >>> jax.device_count() 48 | 4 49 | # Create a tensor on the TPU 50 | >>> t = jax.numpy.ones((300, 300)) 51 | 52 | # PyTorch/XLA 53 | >>> import torch 54 | >>> import torch_xla 55 | >>> t = torch.randn((300, 300), device=torch_xla.device()) 56 | ``` 57 | 58 | Then, on the same machine, run the `tpu-info` command line tool: 59 | 60 | ```bash 61 | $ tpu-info 62 | TPU Chips 63 | ┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┓ 64 | ┃ Chip ┃ Type ┃ Devices ┃ PID ┃ 65 | ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━┩ 66 | │ /dev/accel0 │ TPU v4 chip │ 1 │ 130007 │ 67 | │ /dev/accel1 │ TPU v4 chip │ 1 │ 130007 │ 68 | │ /dev/accel2 │ TPU v4 chip │ 1 │ 130007 │ 69 | │ /dev/accel3 │ TPU v4 chip │ 1 │ 130007 │ 70 | └─────────────┴─────────────┴─────────┴────────┘ 71 | Connected to libtpu at grpc://localhost:8431... 72 | TPU Utilization 73 | ┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ 74 | ┃ Device ┃ Memory usage ┃ Duty cycle ┃ 75 | ┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ 76 | │ 0 │ 0.00 GiB / 31.75 GiB │ 0.00% │ 77 | │ 1 │ 0.00 GiB / 31.75 GiB │ 0.00% │ 78 | │ 2 │ 0.00 GiB / 31.75 GiB │ 0.00% │ 79 | │ 3 │ 0.00 GiB / 31.75 GiB │ 0.00% │ 80 | └────────┴──────────────────────┴────────────┘ 81 | ``` 82 | 83 | [^1]: Releases from before 2024 may not be compatible. 
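84 | 85 | ## Streaming mode 86 | 87 | `tpu-info` also accepts the command-line flags defined in `tpu_info/args.py`: `--streaming` re-renders the metric tables continuously, and `--rate` sets the refresh interval in seconds. For example: 88 | 89 | ```bash 90 | # Refresh the metric tables every 2 seconds; press Ctrl+C to exit 91 | tpu-info --streaming --rate 2.0 92 | ```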
93 | -------------------------------------------------------------------------------- /tpu_info/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [build-system] 16 | requires = ["hatchling", "hatch-build-scripts", "grpcio-tools~=1.65.5"] 17 | build-backend = "hatchling.build" 18 | 19 | [project] 20 | version = "0.3.1" 21 | name = "tpu-info" 22 | dependencies = [ 23 | # `grpcio` should match `grpcio-tools` build dependency 24 | "grpcio>=1.65.5", 25 | "protobuf", 26 | "rich", 27 | ] 28 | authors = [ 29 | { name="Cloud TPU Team", email="cloud-tpu-eng@google.com" }, 30 | ] 31 | description = "CLI tool to view TPU metrics" 32 | readme = "README.md" 33 | license = {text = "Apache-2.0"} 34 | requires-python = ">=3.8" 35 | 36 | [project.urls] 37 | homepage = "https://github.com/google/cloud-accelerator-diagnostics/tree/main/tpu_info" 38 | repository = "https://github.com/google/cloud-accelerator-diagnostics" 39 | 40 | [project.optional-dependencies] 41 | test = [ 42 | "absl-py", 43 | ] 44 | 45 | [project.scripts] 46 | tpu-info = "tpu_info.cli:print_chip_info" 47 | 48 | [tool.hatch.build] 49 | exclude = [ 50 | "*_test.py", 51 | "*.proto", 52 | ] 53 | 54 | [tool.hatch.build.targets.wheel] 55 | # HACK: Avoid copying files generated below 56 | # See https://github.com/rmorshea/hatch-build-scripts/discussions/4 57 | artifacts = [ 58 | "tpu_metric_service_pb2.py", 59 | "tpu_metric_service_pb2.pyi", 60 | "tpu_metric_service_pb2_grpc.py", 61 | ] 62 | 63 | [[tool.hatch.build.hooks.build-scripts.scripts]] 64 | commands = [ 65 | # Look up proto from current directory to ensure imports use `tpu_info` 66 | # package (e.g. `from tpu_info.proto import ...`) 67 | # See protoc bug: https://github.com/protocolbuffers/protobuf/issues/1491 68 | "python -m grpc_tools.protoc -I. tpu_info/proto/tpu_metric_service.proto --python_out=. --pyi_out=. --grpc_python_out=.", 69 | ] 70 | artifacts = [] 71 | clean_artifacts = false 72 | -------------------------------------------------------------------------------- /tpu_info/tpu_info/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
import cli 16 | -------------------------------------------------------------------------------- /tpu_info/tpu_info/args.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Argument parsing for the tpu-info tool.""" 16 | 17 | import argparse 18 | 19 | 20 | def parse_arguments(): 21 | """Parses command line arguments for the tpu-info tool.""" 22 | parser = argparse.ArgumentParser( 23 | description="Display TPU info and metrics.", 24 | formatter_class=argparse.RawTextHelpFormatter, 25 | ) 26 | parser.add_argument( 27 | "--streaming", 28 | action="store_true", 29 | help="Enable streaming mode to refresh metrics continuously", 30 | ) 31 | parser.add_argument( 32 | "--rate", 33 | type=float, 34 | default=1.0, 35 | help=( 36 | "Refresh rate in seconds for streaming mode (default: 1.0; only" 37 | " used with --streaming)." 38 | ), 39 | ) 40 | return parser.parse_args() 41 | -------------------------------------------------------------------------------- /tpu_info/tpu_info/cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Defines the command line interface for the `tpu-info` tool. 16 | 17 | Top-level functions should be added to `project.scripts` in `pyproject.toml`. 18 | """ 19 | 20 | import sys 21 | import time 22 | from typing import Any, List 23 | 24 | from tpu_info import args 25 | from tpu_info import device 26 | from tpu_info import metrics 27 | import grpc 28 | from rich.console import Console, Group 29 | from rich.live import Live 30 | import rich.table 31 | 32 | 33 | def _bytes_to_gib(size: int) -> float: 34 | return size / (1 << 30) 35 | 36 | 37 | # TODO(vidishasethi): b/418938764 - Modularize by extracting 38 | # each table's rendering logic into its own dedicated helper function. 
--------------------------------------------------------------------------------
/tpu_info/tpu_info/cli.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Defines command line interface for `tpu-info` tool.

Top-level functions should be added to `project.scripts` in `pyproject.toml`.
"""

import sys
import time
from typing import Any, List

from tpu_info import args
from tpu_info import device
from tpu_info import metrics
import grpc
from rich.console import Console, Group
from rich.live import Live
import rich.table


def _bytes_to_gib(size: int) -> float:
  return size / (1 << 30)


# TODO(vidishasethi): b/418938764 - Modularize by extracting
# each table's rendering logic into its own dedicated helper function.
def _fetch_and_render_tables(chip_type: Any, count: int):
  """Fetches all TPU data and prepares a list of Rich Table objects for display."""
  renderables: List[rich.table.Table] = []

  table = rich.table.Table(title="TPU Chips", title_justify="left")
  table.add_column("Chip")
  table.add_column("Type")
  table.add_column("Devices")
  # TODO(wcromar): this may not match the libtpu runtime metrics
  # table.add_column("HBM (per core)")
  table.add_column("PID")

  chip_paths = [device.chip_path(chip_type, index) for index in range(count)]
  chip_owners = device.get_chip_owners()

  for chip in chip_paths:
    owner = chip_owners.get(chip)

    table.add_row(
        chip,
        str(chip_type),
        str(chip_type.value.devices_per_chip),
        str(owner),
    )

  renderables.append(table)

  table = rich.table.Table(
      title="TPU Runtime Utilization", title_justify="left"
  )
  table.add_column("Device")
  table.add_column("HBM usage")
  table.add_column("Duty cycle", justify="right")

  try:
    device_usage = metrics.get_chip_usage(chip_type)
  except grpc.RpcError as e:
    if e.code() == grpc.StatusCode.UNAVAILABLE:  # pytype: disable=attribute-error
      print(
          "WARNING: Libtpu metrics unavailable. Is there a framework using the"
          " TPU? See"
          " https://github.com/google/cloud-accelerator-diagnostics/tree/main/tpu_info"
          " for more information"
      )
    else:
      print(f"ERROR: {e}")

    device_usage = [metrics.Usage(i, -1, -1, -1) for i in range(count)]

  # TODO(wcromar): take alternative ports as a flag
  print("Connected to libtpu at grpc://localhost:8431...")
  for chip in device_usage:
    if chip.memory_usage < 0:
      memory_usage = "N/A"
    else:
      memory_usage = (
          f"{_bytes_to_gib(chip.memory_usage):.2f} GiB /"
          f" {_bytes_to_gib(chip.total_memory):.2f} GiB"
      )
    if chip.duty_cycle_pct < 0:
      duty_cycle_pct = "N/A"
    else:
      duty_cycle_pct = f"{chip.duty_cycle_pct:.2f}%"
    table.add_row(
        str(chip.device_id),
        memory_usage,
        # Duty cycle is reported per chip, so on multi-core chips only the
        # row for the first core shows it.
        duty_cycle_pct
        if chip_type.value.devices_per_chip == 1 or chip.device_id % 2 == 0
        else "",
    )

  renderables.append(table)

  table = rich.table.Table(title="TensorCore Utilization", title_justify="left")
  table.add_column("Chip ID")
  table.add_column("TensorCore Utilization", justify="right")

  try:
    # pylint: disable=g-import-not-at-top
    from libtpu import sdk  # pytype: disable=import-error

    tensorcore_util_data = sdk.monitoring.get_metric("tensorcore_util").data()
  except ImportError as e:
    print(f"WARNING: ImportError: {e}.")
  except AttributeError as e:
    print(
        f"WARNING: {e}. Please check if the latest libtpu is used."
    )
  except RuntimeError as e:
    print(
        f"WARNING: {e}. Please check if the latest vbar control agent is used."
    )
  else:
    for i in range(len(tensorcore_util_data)):
      tc_data = f"{tensorcore_util_data[i]}%"
      table.add_row(
          str(i),
          tc_data,
      )
    renderables.append(table)

  table = rich.table.Table(
      title="TPU Buffer Transfer Latency", title_justify="left"
  )
  table.add_column("Buffer Size")
  table.add_column("P50", justify="right")
  table.add_column("P90", justify="right")
  table.add_column("P95", justify="right")
  table.add_column("P999", justify="right")

  try:
    buffer_transfer_latency_distributions = (
        metrics.get_buffer_transfer_latency()
    )
  except grpc.RpcError as e:
    if e.code() == grpc.StatusCode.UNAVAILABLE:  # pytype: disable=attribute-error
      print(
          "WARNING: Buffer Transfer Latency metrics unavailable. Did you start"
          " a MULTI_SLICE workload with"
          " `TPU_RUNTIME_METRICS_PORTS=8431,8432,8433,8434`?"
      )
    else:
      print(f"ERROR: {e}")

    buffer_transfer_latency_distributions = []

  for distribution in buffer_transfer_latency_distributions:
    table.add_row(
        distribution.buffer_size,
        f"{distribution.p50:.2f} us",
        f"{distribution.p90:.2f} us",
        f"{distribution.p95:.2f} us",
        f"{distribution.p999:.2f} us",
    )
  renderables.append(table)

  return renderables


def print_chip_info():
  """Print local TPU devices and libtpu runtime metrics."""
  cli_args = args.parse_arguments()
  # TODO(wcromar): Merge all of this info into one table
  chip_type, count = device.get_local_chips()
  if not chip_type:
    print("No TPU chips found.")
    return

  if cli_args.streaming:
    if cli_args.rate <= 0:
      print("Error: Refresh rate must be positive.", file=sys.stderr)
      return

    print(
        f"Starting streaming mode (refresh rate: {cli_args.rate}s). Press"
        " Ctrl+C to exit."
    )

    try:
      renderables = _fetch_and_render_tables(chip_type, count)

      if not renderables:
        print(
            "No data tables could be generated. Exiting streaming.",
            file=sys.stderr,
        )
        return

      render_group = Group(*renderables)

      with Live(
          render_group,
          refresh_per_second=4,
          screen=True,
          vertical_overflow="visible",
      ) as live:
        while True:
          time.sleep(cli_args.rate)
          new_renderables = _fetch_and_render_tables(chip_type, count)
          live.update(Group(*new_renderables))
    except KeyboardInterrupt:
      print("\nExiting streaming mode.")
    except Exception as e:
      import traceback

      print(
          f"\nAn unexpected error occurred in streaming mode: {e}",
          file=sys.stderr,
      )
      traceback.print_exc(file=sys.stderr)
      sys.exit(1)

  else:
    renderables = _fetch_and_render_tables(chip_type, count)

    if renderables:
      console_obj = Console()
      for item in renderables:
        console_obj.print(item)
--------------------------------------------------------------------------------
/tpu_info/tpu_info/device.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for detecting locally-attached TPU devices."""

import collections
import enum
import glob
import os
import pathlib
import re
import typing
from typing import Dict, Literal, Optional, Tuple

GOOGLE_PCI_VENDOR_ID = "0x1ae0"


class TpuChip(enum.Enum):
  """TPU chip versions and basic specs."""

  class Info(typing.NamedTuple):
    """Specs for a specific TPU chip version."""

    name: str
    hbm_gib: int
    devices_per_chip: Literal[1, 2]

  V2 = Info("v2", hbm_gib=8, devices_per_chip=2)
  V3 = Info("v3", hbm_gib=16, devices_per_chip=2)
  V4 = Info("v4", hbm_gib=32, devices_per_chip=1)
  V5E = Info("v5e", hbm_gib=16, devices_per_chip=1)
  V5P = Info("v5p", hbm_gib=95, devices_per_chip=1)
  V6E = Info("v6e", hbm_gib=32, devices_per_chip=1)

  @classmethod
  def from_pci_device_id(
      cls, device_id: str, subsystem_id: str
  ) -> Optional["TpuChip"]:
    """Returns TPU chip type for given PCI IDs, or None if not a TPU device."""
    # TPU v2 and v3 share a device ID
    if device_id == "0x0027":
      if subsystem_id == "0x004e":
        return cls.V2
      elif subsystem_id == "0x004f":
        return cls.V3

    device_id_to_device = {
        "0x005e": cls.V4,
        "0x0063": cls.V5E,
        "0x0062": cls.V5P,
        "0x006f": cls.V6E,
    }

    return device_id_to_device.get(device_id)

  def __str__(self):
    """Human-readable name of TPU chip type."""
    return f"TPU {self.value.name} chip"


def get_local_chips() -> Tuple[Optional[TpuChip], int]:
  """Returns the type and number of TPU chips available."""
  count = collections.Counter()
  for pci_path in glob.glob("/sys/bus/pci/devices/*"):
    vendor_path = os.path.join(pci_path, "vendor")
    vendor_id = pathlib.Path(vendor_path).read_text().strip()
    if vendor_id != GOOGLE_PCI_VENDOR_ID:
      continue

    device_id_path = os.path.join(pci_path, "device")
    device_id = pathlib.Path(device_id_path).read_text().strip()
    subsystem_path = os.path.join(pci_path, "subsystem_device")
    subsystem_id = pathlib.Path(subsystem_path).read_text().strip()

    chip_type = TpuChip.from_pci_device_id(device_id, subsystem_id)
    if chip_type:
      count[chip_type] += 1

  assert len(count) <= 1, f"Expected one chip type, got {count}"
  return count.most_common()[0] if count else (None, 0)


def chip_path(chip_type: TpuChip, index: int):
  """Returns the expected `/dev` path for a given TPU device type."""
  if chip_type in [TpuChip.V5E, TpuChip.V5P, TpuChip.V6E]:
    return f"/dev/vfio/{index}"
  else:
    return f"/dev/accel{index}"


def get_chip_owners() -> Dict[str, int]:
  """Returns a mapping of device paths to PIDs of processes using that device."""
  device_owners = {}

  for link in glob.glob("/proc/*/fd/*"):
    try:
      file = os.readlink(link)
    except FileNotFoundError:
      continue

    # e.g. /dev/accel0 or /dev/vfio/0
    if re.fullmatch(r"/dev/(?:accel|vfio/)\d", file):
      match = re.fullmatch(r"/proc/(\d+)/fd/\d+", link)
      if not match:
        raise RuntimeError("Unknown link pattern", link)

      device_owners[file] = int(match.group(1))

  return device_owners
--------------------------------------------------------------------------------
/tpu_info/tpu_info/metrics.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Client library for libtpu runtime metrics."""

import enum
import itertools
import typing
from typing import List

from tpu_info import device
import grpc

from tpu_info.proto import tpu_metric_service_pb2 as tpu_metrics
from tpu_info.proto import tpu_metric_service_pb2_grpc as tpu_metrics_grpc


class MetricName(enum.Enum):
  """Metric names defined in libtpu."""

  TOTAL_MEMORY = "tpu.runtime.hbm.memory.total.bytes"
  MEMORY_USAGE = "tpu.runtime.hbm.memory.usage.bytes"
  DUTY_CYCLE_PCT = "tpu.runtime.tensorcore.dutycycle.percent"
  BUFFER_TRANSFER_LATENCY_US = (
      "megascale.dcn_transfer_latencies.microsecond.cumulative.distribution"
  )


class Usage(typing.NamedTuple):
  """Usage measurements for a TPU device."""

  device_id: int
  memory_usage: int
  total_memory: int
  duty_cycle_pct: float


class BufferTransferLatencyDistribution(typing.NamedTuple):
  """Distribution measurements."""

  buffer_size: str
  p50: float
  p90: float
  p95: float
  p999: float


def get_chip_usage(
    chip_type: device.TpuChip, addr: str = "localhost:8431"
) -> List[Usage]:
  """Gets usage statistics for all attached TPU devices.

  Args:
    chip_type: TPU chip version. Determines how metrics are interpreted.
    addr: GRPC address of libtpu metrics server.

  Returns:
    List of usage statistics for each TPU device.
  """
  channel = grpc.secure_channel(addr, grpc.local_channel_credentials())
  client = tpu_metrics_grpc.RuntimeMetricServiceStub(channel)

  def sorted_metric_response(
      metric_name: MetricName,
  ) -> List[tpu_metrics.Metric]:
    # Manually annotate type until GRPC supports annotations
    # See https://github.com/grpc/grpc/issues/29041
    resp: tpu_metrics.MetricResponse = client.GetRuntimeMetric(
        tpu_metrics.MetricRequest(metric_name=metric_name.value)
    )
    return sorted(resp.metric.metrics, key=lambda m: m.attribute.value.int_attr)

  totals = sorted_metric_response(MetricName.TOTAL_MEMORY)
  usages = sorted_metric_response(MetricName.MEMORY_USAGE)
  duty_cycle_pct = sorted_metric_response(MetricName.DUTY_CYCLE_PCT)

  # Duty cycle is always measured per-chip, while memory is measured per-core.
  # Repeat if necessary so these responses are the same length.
  duty_cycle_pct_per_core = list(
      itertools.chain.from_iterable(
          itertools.repeat(d, chip_type.value.devices_per_chip)
          for d in duty_cycle_pct
      )
  )

  assert (
      len(totals) == len(usages) == len(duty_cycle_pct_per_core)
  ), "Metrics not found for all chips"

  return [
      Usage(
          u.attribute.value.int_attr,
          u.gauge.as_int,
          t.gauge.as_int,
          d.gauge.as_double,
      )
      for u, t, d in zip(usages, totals, duty_cycle_pct_per_core)
  ]


def _get_percentile(
    percentile_count: int,
    total_count: int,
    buckets: List[int],
    scale: float,
    growth_factor: float,
) -> float:
  """Gets a percentile value from a distribution."""
  # Walk down from the highest bucket until the number of samples below
  # bucket `i` drops to `percentile_count` or fewer, then interpolate
  # linearly within that bucket.
  for i in range(len(buckets) - 1, 0, -1):
    total_count -= buckets[i]
    if total_count <= percentile_count:
      delta = percentile_count - total_count
      lower_bound = scale * (growth_factor ** (i - 1))
      return lower_bound * (1 + (delta / buckets[i]) * (growth_factor - 1))
  # The percentile falls within the lowest bucket.
  return 1


def get_buffer_transfer_latency(
    addr: str = "localhost:8431",
) -> List[BufferTransferLatencyDistribution]:
  """Gets buffer transfer latency statistics for all attached TPU devices.

  Args:
    addr: GRPC address of libtpu metrics server.

  Returns:
    List of buffer transfer latency statistics for each TPU device.
  """
  channel = grpc.secure_channel(addr, grpc.local_channel_credentials())
  client = tpu_metrics_grpc.RuntimeMetricServiceStub(channel)

  resp: tpu_metrics.MetricResponse = client.GetRuntimeMetric(
      tpu_metrics.MetricRequest(
          metric_name=MetricName.BUFFER_TRANSFER_LATENCY_US.value
      )
  )

  buffer_transfer_latency_distributions = []

  for metric in resp.metric.metrics:
    attribute = metric.attribute
    distribution = metric.distribution
    bucket = list(distribution.bucket_counts)
    count = distribution.count
    scale = distribution.bucket_options.exponential_buckets.scale
    growth_factor = (
        distribution.bucket_options.exponential_buckets.growth_factor
    )

    p50_count = int(count * 0.5)
    p90_count = int(count * 0.9)
    p95_count = int(count * 0.95)
    p999_count = int(count * 0.999)

    p50 = _get_percentile(p50_count, count, bucket, scale, growth_factor)
    p90 = _get_percentile(p90_count, count, bucket, scale, growth_factor)
    p95 = _get_percentile(p95_count, count, bucket, scale, growth_factor)
    p999 = _get_percentile(p999_count, count, bucket, scale, growth_factor)

    buffer_transfer_latency_distributions.append(
        BufferTransferLatencyDistribution(
            attribute.value.kvlist_attr.attributes[0].value.string_attr,
            p50,
            p90,
            p95,
            p999,
        )
    )

  return buffer_transfer_latency_distributions
--------------------------------------------------------------------------------
/tpu_info/tpu_info/proto/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains generated proto definitions for libtpu metrics service."""
--------------------------------------------------------------------------------
/tpu_info/tpu_info/proto/tpu_metric_service.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package tpu.monitoring.runtime;

import "google/protobuf/timestamp.proto";



option java_multiple_files = true;
option objc_class_prefix = "GRPC";
option java_package = "com.google.tpu.monitoring.runtime.service.proto";


message Exemplar {
  double value = 1;
  .google.protobuf.Timestamp timestamp = 2;
  repeated Attribute attributes = 3;
}

message Distribution {
  int64 count = 1;
  double mean = 2;
  double min = 3;
  double max = 4;
  double sum_of_squared_deviation = 5;

  message BucketOptions {
    oneof options {
      Regular regular_buckets = 1 [deprecated = true];
      Exponential exponential_buckets = 2;
      Explicit explicit_buckets = 3;
      Linear linear_buckets = 4;
    }
    message Regular {
      option deprecated = true;

      int32 num_finite_buckets = 1;
      // A linear distribution has only one bound with overall width and offset
      // of the lowest bucket.
      // An explicit distribution will have monotonically increasing buckets
      // with width and the offset from the previous bucket.
      repeated Bound bounds = 2;
    }
    message Exponential {
      // Must be greater than 0.
      int32 num_finite_buckets = 1;
      // Must be greater than 1.
      double growth_factor = 2;
      // Must be greater than 0.
      double scale = 3;
    }
    message Bound {
      option deprecated = true;

      double width = 1;
      double offset = 2;
    }

    // Specifies a linear sequence of buckets that all have the same width
    // (except overflow and underflow). Each bucket represents a constant
    // absolute uncertainty on the specific value in the bucket.
    //
    // There are `num_finite_buckets + 2` (= N) buckets. Bucket `i` has the
    // following boundaries:
    //
    // Upper bound (0 <= i < N-1): offset + (width * i).
    //
    // Lower bound (1 <= i < N): offset + (width * (i - 1)).
    message Linear {
      // Must be greater than 0.
      int32 num_finite_buckets = 1;

      // Must be greater than 0.
      double width = 2;

      // Lower bound of the first bucket.
      double offset = 3;
    }

    // Specifies a set of buckets with arbitrary widths.
    //
    // There are `size(bounds) + 1` (= N) buckets. Bucket `i` has the following
    // boundaries:
    //
    // Upper bound (0 <= i < N-1): bounds[i]
    // Lower bound (1 <= i < N): bounds[i - 1]
    //
    // The `bounds` field must contain at least one element. If `bounds` has
    // only one element, then there are no finite buckets, and that single
    // element is the common boundary of the overflow and underflow buckets.
    message Explicit {
      // The values must be monotonically increasing.
      repeated double bounds = 1;
    }
  }

  // Defines the histogram bucket boundaries.
  BucketOptions bucket_options = 6;
  repeated int64 bucket_counts = 7;
  repeated Exemplar exemplars = 8;
}

// Gauge represents a single-point measure.
message Gauge {
  oneof value {
    double as_double = 1;
    int64 as_int = 2;
    string as_string = 3;
    bool as_bool = 4;
  }
}

// Counter is a monotonically increasing measure (until reset to zero).
message Counter {
  // The value MUST not be negative.
  oneof value {
    double as_double = 1;
    uint64 as_int = 2;
  }
  Exemplar exemplar = 3;
}

// Quantile represents the value at a given quantile of a distribution.
message Quantile {
  // The quantile of a distribution. Must be in the interval [0.0, 1.0].
  double quantile = 1;
  // The value at the given quantile of a distribution.
  // Quantile values must NOT be negative.
  double value = 2;
}

// Summary represents observed sampling for different quantiles including
// sum of all the observations and total count of observations.
message Summary {
  uint64 sample_count = 1;
  double sample_sum = 2;
  repeated Quantile quantile = 3;
}

// AttrValue represents an attribute value.
// AttrValue is considered to be "empty" if all values are unspecified.
message AttrValue {
  oneof attr {
    string string_attr = 1;
    bool bool_attr = 2;
    int64 int_attr = 3;
    double double_attr = 4;
    ArrayAttrValue array_attr = 5;
    KeyValueList kvlist_attr = 6;
    bytes bytes_attr = 7;
  }
}

// ArrayAttrValue is a list of AttrValue messages.
message ArrayAttrValue {
  // Array of attribute values. The array may be empty (contain 0 elements).
  repeated AttrValue attrs = 1;
}

// KeyValueList is a list of Key-AttrValue messages.
message KeyValueList {
  // A collection of key/value attributes. The list may be empty.
  // The keys in attributes MUST be unique.
  repeated Attribute attributes = 1;
}

// Attribute is a key-value pair to store the attributes of a metric.
// For example, the device-id or host-id of the metric.
message Attribute {
  string key = 1;
  AttrValue value = 2;
}

// Metric represents a metric datapoint.
// A metric has a reporting time, an attribute, and a measure value.
message Metric {
  Attribute attribute = 1;
  .google.protobuf.Timestamp timestamp = 2;
  oneof measure {
    Gauge gauge = 3;
    Counter counter = 4;
    Distribution distribution = 5;
    Summary summary = 6;
  }
}

// TPUMetric is a standalone metric object, exposed externally to a consumer.
message TPUMetric {
  string name = 1;
  string description = 2;
  repeated Metric metrics = 3;
}

// MetricRequest is the request object to fetch metrics from LibTPU.
// MetricRequest contains the metric name with which metrics can be fetched
// from the RuntimeMetricService.GetRuntimeMetric.
message MetricRequest {
  string metric_name = 1;
  // skip_node_aggregation provides options to the client to skip aggregated
  // lookup of metrics for a worker node. If the field is unset or set as false,
  // an aggregated view of metrics for a TPU worker node would be provided.
  // The aggregation feature is enabled by libTPU during initialization.
  // By default, the worker node aggregation would be turned on in libTPU if the
  // metrics server is supported. If the libTPU initialization turns off the
  // feature explicitly, then the aggregated view would not be provided.
  bool skip_node_aggregation = 2;
}

// MetricResponse is the response object for
// RuntimeMetricService.GetRuntimeMetric.
// The response contains the TPUMetric as response which holds the metric data
// for the requested metric.
message MetricResponse {
  TPUMetric metric = 1;
}

// ListSupportedMetricsRequest is the request object for
// RuntimeMetricService.ListSupportedMetrics.
// Empty request means no filters. All the metrics supported from the LibTPU
// would be returned as the response.
message ListSupportedMetricsRequest {
  // A regex filter to apply to the supported metrics.
  // If the field is empty or not set, no filter is applied. All the supported
  // metrics are returned.
  //
  // Example: `.*memory.*`, `.*memory.*|.*duty_cycle.*`
  string filter = 1;
}

message SupportedMetric {
  string metric_name = 1;
}

// ListSupportedMetricsResponse is the response object for
// RuntimeMetricService.ListSupportedMetrics.
// It contains all the metrics supported in the LibTPU for the
// ListSupportedMetricsRequest.
message ListSupportedMetricsResponse {
  // List of supported metrics.
  repeated SupportedMetric supported_metric = 1;
}

service RuntimeMetricService {
  // GetRuntimeMetric returns the TPU metrics data for the MetricRequest.
  rpc GetRuntimeMetric(MetricRequest) returns (MetricResponse);

  // ListSupportedMetrics lists the supported metrics for
  // ListSupportedMetricsRequest.
  rpc ListSupportedMetrics(ListSupportedMetricsRequest)
      returns (ListSupportedMetricsResponse);
}
--------------------------------------------------------------------------------
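--------------------------------------------------------------------------------
Appendix: usage sketches (editor's examples, not repository files)
--------------------------------------------------------------------------------
The PCI-ID lookup in tpu_info/tpu_info/device.py can be exercised directly.
A minimal sketch, assuming the package above is installed so that
`tpu_info.device` is importable:

# Maps PCI device/subsystem IDs to a chip type via TpuChip.from_pci_device_id,
# then derives the expected /dev path for that chip type.
from tpu_info import device

# TPU v2 and v3 share PCI device ID 0x0027; the subsystem ID disambiguates.
chip = device.TpuChip.from_pci_device_id("0x0027", "0x004f")
assert chip is device.TpuChip.V3
print(chip)                       # "TPU v3 chip"
print(chip.value.hbm_gib)         # 16 (GiB of HBM)
print(device.chip_path(chip, 0))  # "/dev/accel0" (v2/v3/v4 use /dev/accel*)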
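The exponential-bucket interpolation in metrics._get_percentile is easiest to
see with concrete numbers. A worked sketch follows; it imports a private helper
purely for illustration. With scale=1.0 and growth_factor=2.0, bucket i (for
i >= 1) spans [2**(i-1), 2**i):

from tpu_info.metrics import _get_percentile

buckets = [10, 40, 30, 20]  # 100 samples: 10 below 1.0, 40 in [1, 2), ...
total = sum(buckets)

# Exactly half of the samples lie below the boundary between buckets 1 and 2,
# so the p50 estimate is that boundary itself: 1.0 * 2**1 = 2.0.
print(_get_percentile(int(total * 0.5), total, buckets, 1.0, 2.0))  # 2.0

# The p90 sample sits 10 counts into the top bucket [4, 8), which holds 20
# samples, so the estimate interpolates halfway through it: 4.0 * 1.5 = 6.0.
print(_get_percentile(int(total * 0.9), total, buckets, 1.0, 2.0))  # 6.0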
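Finally, the RuntimeMetricService defined in tpu_metric_service.proto can be
queried with the generated stubs. A minimal sketch, assuming the *_pb2 modules
were generated by the build hook in pyproject.toml and that libtpu's metrics
server is listening on localhost:8431, the default address used in metrics.py:

import grpc

from tpu_info.proto import tpu_metric_service_pb2 as tpu_metrics
from tpu_info.proto import tpu_metric_service_pb2_grpc as tpu_metrics_grpc


def list_supported_metrics(addr: str = "localhost:8431") -> list:
  # Same connection pattern as tpu_info.metrics: a local channel to the
  # libtpu metrics server.
  channel = grpc.secure_channel(addr, grpc.local_channel_credentials())
  client = tpu_metrics_grpc.RuntimeMetricServiceStub(channel)
  # An empty filter returns every supported metric; a regex such as
  # ".*memory.*" narrows the list (see ListSupportedMetricsRequest).
  resp = client.ListSupportedMetrics(
      tpu_metrics.ListSupportedMetricsRequest(filter="")
  )
  return [m.metric_name for m in resp.supported_metric]


if __name__ == "__main__":
  for name in list_supported_metrics():
    print(name)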