├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── gcp_resources
    ├── gce
    │   ├── input.tf
    │   ├── main.tf
    │   └── resources
    │   │   ├── dashboard
    │   │       ├── logging_dashboard
    │   │       │   ├── dashboard.tf
    │   │       │   ├── dashboard_json
    │   │       │   │   ├── main.json
    │   │       │   │   ├── stack-trace-counter-metric.json
    │   │       │   │   └── stack-trace-log-panel.json
    │   │       │   ├── input.tf
    │   │       │   ├── log_metrics
    │   │       │   │   ├── input.tf
    │   │       │   │   └── stack_trace_counter.tf
    │   │       │   └── main.tf
    │   │       └── monitoring_dashboard
    │   │       │   ├── dashboard.tf
    │   │       │   ├── dashboard_json
    │   │       │       ├── cpu-utilization.json
    │   │       │       ├── dcn-transfer-latency.json
    │   │       │       ├── device-to-host-transfer-latency.json
    │   │       │       ├── host-to-device-transfer-latency.json
    │   │       │       ├── main.json
    │   │       │       ├── memory-usage.json
    │   │       │       ├── network-bytes.json
    │   │       │       └── tensorcore-idle-duration.json
    │   │       │   ├── input.tf
    │   │       │   └── main.tf
    │   │   └── log_storage
    │   │       ├── input.tf
    │   │       ├── main.tf
    │   │       └── stack-trace-bucket.tf
    └── gke
    │   ├── input.tf
    │   ├── main.tf
    │   └── resources
    │       ├── dashboard
    │           ├── logging_dashboard
    │           │   ├── dashboard.tf
    │           │   ├── dashboard_json
    │           │   │   ├── main.json
    │           │   │   ├── stack-trace-counter-metric.json
    │           │   │   └── stack-trace-log-panel.json
    │           │   ├── input.tf
    │           │   ├── log_metrics
    │           │   │   ├── input.tf
    │           │   │   └── stack_trace_counter.tf
    │           │   └── main.tf
    │           └── monitoring_dashboard
    │           │   ├── dashboard.tf
    │           │   ├── dashboard_json
    │           │       ├── accelerator-memory-used.json
    │           │       ├── collectives-latency.json
    │           │       ├── cpu-utilization.json
    │           │       ├── dcn-transfer-latency.json
    │           │       ├── device-to-host-transfer-latency.json
    │           │       ├── duty-cycle.json
    │           │       ├── host-to-device-transfer-latency.json
    │           │       ├── main.json
    │           │       ├── memory-usage.json
    │           │       └── network-bytes.json
    │           │   ├── input.tf
    │           │   └── main.tf
    │       └── log_storage
    │           ├── input.tf
    │           ├── main.tf
    │           └── stack-trace-bucket.tf
└── pip_package
    ├── CHANGELOG.md
    ├── README.md
    ├── cloud_tpu_diagnostics
        ├── __init__.py
        ├── configuration.py
        ├── diagnostic.py
        ├── src
        │   ├── config
        │   │   ├── debug_configuration.py
        │   │   ├── diagnostic_configuration.py
        │   │   └── stack_trace_configuration.py
        │   ├── debug.py
        │   ├── diagnose.py
        │   ├── stack_trace.py
        │   └── util
        │   │   ├── default.py
        │   │   └── stack_trace_test_util.py
        └── tests
        │   ├── debug_test.py
        │   ├── diagnose_test.py
        │   └── stack_trace_test.py
    └── pyproject.toml


/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | <!--
 2 |  Copyright 2023 Google LLC
 3 |  
 4 |  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  you may not use this file except in compliance with the License.
 6 |  You may obtain a copy of the License at
 7 |  
 8 |       https://www.apache.org/licenses/LICENSE-2.0
 9 |  
10 |  Unless required by applicable law or agreed to in writing, software
11 |  distributed under the License is distributed on an "AS IS" BASIS,
12 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  See the License for the specific language governing permissions and
14 |  limitations under the License.
15 |  -->
16 | # How to contribute
17 | 
18 | We'd love to accept your patches and contributions to this project.
19 | 
20 | ## Before you begin
21 | 
22 | ### Sign our Contributor License Agreement
23 | 
24 | Contributions to this project must be accompanied by a
25 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
26 | You (or your employer) retain the copyright to your contribution; this simply
27 | gives us permission to use and redistribute your contributions as part of the
28 | project.
29 | 
30 | If you or your current employer have already signed the Google CLA (even if it
31 | was for a different project), you probably don't need to do it again.
32 | 
33 | Visit <https://cla.developers.google.com/> to see your current agreements or to
34 | sign a new one.
35 | 
36 | ### Review our community guidelines
37 | 
38 | This project follows
39 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
40 | 
41 | ## Contribution process
42 | 
43 | ### Code reviews
44 | 
45 | All submissions, including submissions by project members, require review. We
46 | use GitHub pull requests for this purpose. Consult
47 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
48 | information on using pull requests.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 |  Copyright 2023 Google LLC
  3 |  
  4 |  Licensed under the Apache License, Version 2.0 (the "License");
  5 |  you may not use this file except in compliance with the License.
  6 |  You may obtain a copy of the License at
  7 |  
  8 |       https://www.apache.org/licenses/LICENSE-2.0
  9 |  
 10 |  Unless required by applicable law or agreed to in writing, software
 11 |  distributed under the License is distributed on an "AS IS" BASIS,
 12 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  See the License for the specific language governing permissions and
 14 |  limitations under the License.
 15 |  -->
 16 | # Cloud TPU Monitoring Debugging
 17 | 
 18 | ## Overview
 19 | 
 20 | Cloud TPU Monitoring Debugging repository contains all the infrastructure and logic required to monitor and debug jobs running on Cloud TPU.
 21 | 
 22 | Terraform is used to deploy resources in google cloud project.
 23 | Terraform is an open-source tool to set up and manage google cloud
 24 | infrastructure based on configuration files. This repository will help the
 25 | customers to deploy various google cloud resources via script, without any
 26 | manual effort.
 27 | 
 28 | [cloud-tpu-diagnostics PyPI package](https://pypi.org/project/cloud-tpu-diagnostics) contains all the logic to monitor, debug and profile the jobs running on Cloud TPU.
 29 | 
 30 | ## Getting Started with Terraform
 31 | 
 32 | -   Follow [this link](https://developer.hashicorp.com/terraform/tutorials/gcp-get-started/install-cli) to install Terraform on desktop.
 33 | -   Run `terraform init` to
 34 |     initialize google cloud Terraform provider version. This command will add
 35 |     the necessary plugins and build the `.terraform` directory.
 36 | -   If there is an update to terraform google cloud provider version, run
 37 |     `terraform init --upgrade` for the update to take place.
 38 | -   You can also run `terraform plan` to validate resource declarations,
 39 |     identify any syntax errors, version mismatch before deploying the resources.
 40 | 
 41 | ### Configure Terraform to store state in Cloud Storage
 42 | 
 43 | By default, Terraform stores [state](https://www.terraform.io/docs/state/) locally in a file named `terraform.tfstate`. This default configuration can make Terraform usage difficult for teams, especially when many users run Terraform at the same time and each machine has its own understanding of the current infrastructure. To help avoid such issues, this section configures a remote state that points to Google Cloud Storage (GCS) bucket.
 44 | 
 45 | 1. In Cloud Shell, create the GCS bucket:
 46 | 
 47 |         gsutil mb gs://${GCS_BUCKET_NAME}
 48 | 
 49 | 2. Enable [Object Versioning](https://cloud.google.com/storage/docs/object-versioning) to keep the history of your deployments. Enabling Object Versioning increases [storage costs](https://cloud.google.com/storage/pricing), which you can mitigate by configuring
 50 | [Object Lifecycle Management](https://cloud.google.com/storage/docs/lifecycle) to delete old state versions.
 51 | 
 52 |         gsutil versioning set on gs://${GCS_BUCKET_NAME}
 53 | 
 54 | 3. Enter the name of GCS bucket created above when you run `terraform init` to initialize Terraform.
 55 | 
 56 |         Initializing the backend...
 57 |         bucket
 58 |           The name of the Google Cloud Storage bucket
 59 | 
 60 |           Enter a value: <GCS_BUCKET_NAME>
 61 | 
 62 | ## Deploy GCP Resources
 63 | There are following resources managed in this directory:
 64 | 
 65 | 1. **Monitoring Dashboard**: This is an outlier dashboard that displays statistics and outlier mode for TPU metrics.
 66 | 2. **Debugging Dashboard**: This dashboard displays the stack traces collected in Cloud Logging for the process running on TPU VMs.
 67 | 3. **Logging Storage**: This is an user-defined log bucket to store stack traces. Creating a new log storage is completely optional. If you choose not to create a separate log bucket, the stack traces will be collected in [_Default log bucket](https://cloud.google.com/logging/docs/routing/overview#default-bucket).
 68 | 
 69 | ### Deploy Resources for Workloads on GCE
 70 | 
 71 | Run `terraform init && terraform apply` inside `gcp_resources/gce` directory to deploy all the resources mentioned above for TPU workloads running on GCE. You will be prompted to provide values for some input variables. After confirming the action, all the resources will get automatically deployed in your gcp project.
 72 | 
 73 | ### Deploy Resources for Workloads on GKE
 74 | 
 75 | Run `terraform init && terraform apply` inside `gcp_resources/gke` directory to deploy all the resources mentioned above for TPU workloads running on GKE. You will be prompted to provide values for some input variables. After confirming the action, all the resources will get automatically deployed in your gcp project.
 76 | 
 77 | > **_NOTE:_** Please check the below guide for more details about GCE/GKE specific resources and prerequisites.
 78 | 
 79 | Follow the below guide to deploy the resources individually:
 80 | ### Monitoring Dashboard
 81 | #### GCE
 82 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/dashboard/monitoring_dashboard/` to deploy only monitoring dashboard for GCE in your gcp project.
 83 | 
 84 | If the `node_prefix` parameter is not specified in the input variable `var.monitoring_dashboard_config` or is set to an empty string, the metrics on the dashboard will plot the data points for all TPU VMs in your GCP project.
 85 | 
 86 | For instance, if you provide `{"node_prefix": "test"}` as the input value for the input variable `var.monitoring_dashboard_config`, then the metrics on the monitoring dashboard will only show the data points for the TPU VMs with node names that start with `test`. Refer to this [doc](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/tpus/queued-resources/create#--node-prefix) for more information on node prefix for TPUs in multislice.
 87 | 
 88 | #### GKE
 89 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/dashboard/monitoring_dashboard/` to deploy only monitoring dashboard for GKE in your gcp project.
 90 | 
 91 | ### Debugging Dashboard
 92 | #### GCE
 93 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/dashboard/logging_dashboard/` to deploy only debugging dashboard for GCE in your gcp project.
 94 | 
 95 | #### GKE
 96 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/dashboard/logging_dashboard/` to deploy only debugging dashboard for GKE in your gcp project.
 97 | 
 98 | Users need to add a sidecar container to their TPU workload running on GKE to view traces in the debugging dashboard. The sidecar container must be named in a specific way, matching the regex `[a-z-0-9]*stacktrace[a-z-0-9]*`. Here is an example of the sidecar container that should be added:
 99 | 
100 | ```
101 | containers:
102 | - name: stacktrace-log-collector
103 |   image: busybox:1.28
104 |   resources:
105 |     limits:
106 |       cpu: 100m
107 |       memory: 200Mi
108 |   args: [/bin/sh, -c, "while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*"]
109 |   volumeMounts:
110 |   - name: tpu-debug-logs
111 |     readOnly: true
112 |     mountPath: /tmp/debugging
113 | - name: <main_container>
114 | .....
115 | .....
116 | volumes:
117 | - name: tpu-debug-logs
118 | ```
119 | 
120 | ### Log Storage
121 | #### GCE
122 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/log_storage/` to deploy a separate log bucket to store stack traces for GCE. You will be prompted to provide name of your gcp project and also the bucket configuration. You can also set the retention period for the bucket.
123 | 
124 | #### GKE
125 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/log_storage/` to deploy a separate log bucket to store stack traces for GKE. You will be prompted to provide name of your gcp project and also the bucket configuration. You can also set the retention period for the bucket. Make sure that you have the sidecar container running in your GKE cluster as mentioned in [Debugging Dashboard section for GKE](#debugging-dashboard).


--------------------------------------------------------------------------------
/gcp_resources/gce/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 
20 | variable "monitoring_dashboard_config" {
21 |   type = object({
22 |     node_prefix : optional(string),
23 |     outlier_count : optional(number)
24 |   })
25 |   description = <<EOF
26 |   Configuration for monitoring dashboard:
27 |   {
28 |     "node_prefix": "prefix used to generate the node name in multislice node provision, default to empty string",
29 |     "outlier_count": "number of outliers to show on dashboard, default to 10 if not set"
30 |   }
31 |   Enter {} to set default configuration for monitoring dashboard.
32 |   EOF
33 | }
34 | 
35 | // Valid inputs:
36 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":"<log_bucket_name>"}
37 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<log_bucket_name>", "retention_days":x}
38 | // 3. To not create stack trace bucket: {}
39 | variable "stack_trace_bucket_config" {
40 |   type = object({
41 |     bucket_name : optional(string)
42 |     retention_days : optional(number)
43 |   })
44 |   validation {
45 |     condition = (
46 |       (var.stack_trace_bucket_config.bucket_name == null &&
47 |       var.stack_trace_bucket_config.retention_days == null) ||
48 |       (var.stack_trace_bucket_config.bucket_name != null)
49 |     )
50 |     error_message = "bucket_name is not defined for stack_trace_bucket_config."
51 |   }
52 |   description = <<EOF
53 |   Configuration to create a log bucket to store stack traces:
54 |   {
55 |     "bucket_name": "name of log bucket to create",
56 |     "retention_days": number of days to retain stack traces, default to 30 days if not set
57 |   }
58 |   Enter {} to not create separate log bucket for stack traces.
59 |   EOF
60 | }
61 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gce"
30 |   }
31 | }
32 | 
33 | module "monitoring_dashboard" {
34 |   source                      = "./resources/dashboard/monitoring_dashboard"
35 |   project_name                = var.project_name
36 |   monitoring_dashboard_config = var.monitoring_dashboard_config
37 | }
38 | 
39 | module "logging_dashboard" {
40 |   source       = "./resources/dashboard/logging_dashboard"
41 |   project_name = var.project_name
42 | }
43 | 
44 | module "log_storage" {
45 |   source                    = "./resources/log_storage"
46 |   project_name              = var.project_name
47 |   stack_trace_bucket_config = var.stack_trace_bucket_config
48 | }
49 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | data "google_project" "project" {
16 |   project_id = var.project_name
17 | }
18 | 
19 | // Add a dependency on log_metrics module to deploy log-based metrics before deploying logging dashboard
20 | module "log_metrics" {
21 |   source       = "./log_metrics"
22 |   project_name = var.project_name
23 | }
24 | 
25 | locals {
26 |   dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
27 |     {
28 |       TILE_1 = templatefile("${path.module}/dashboard_json/stack-trace-counter-metric.json",
29 |         {
30 |           METRIC_NAME = module.log_metrics.stack_trace_counter_metric_id
31 |       }),
32 |       TILE_2 = templatefile("${path.module}/dashboard_json/stack-trace-log-panel.json",
33 |         {
34 |           PROJECT_NUMBER = data.google_project.project.number
35 |       })
36 |   })
37 | }
38 | 
39 | resource "google_monitoring_dashboard" "logging_dashboard" {
40 |   project        = var.project_name
41 |   dashboard_json = local.dashboard_json
42 |   depends_on     = [module.log_metrics.stack_trace_counter_metric]
43 | }
44 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "category": "CUSTOM",
 3 |   "displayName": "GCE - TPU Logging Dashboard",
 4 |   "dashboardFilters": [
 5 |     {
 6 |       "filterType": "RESOURCE_LABEL",
 7 |       "labelKey": "node_id"
 8 |     },
 9 |     {
10 |       "filterType": "RESOURCE_LABEL",
11 |       "labelKey": "worker_id"
12 |     }
13 |   ],
14 |   "mosaicLayout": {
15 |     "columns": 12,
16 |     "tiles": [
17 |       ${TILE_1},
18 |       ${TILE_2},
19 |       {
20 |         "height": 10,
21 |         "widget": {
22 |           "collapsibleGroup": {
23 |             "collapsed": false
24 |           },
25 |           "title": "TPU VM Process Debugging"
26 |         },
27 |         "width": 12
28 |       }
29 |     ]
30 |   }
31 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-counter-metric.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "height": 4,
 3 |   "widget": {
 4 |     "timeSeriesTable": {
 5 |       "columnSettings": [
 6 |         {
 7 |           "column": "node_id",
 8 |           "visible": true
 9 |         },
10 |         {
11 |           "column": "worker_id",
12 |           "visible": true
13 |         },
14 |         {
15 |           "column": "zone",
16 |           "visible": true
17 |         },
18 |         {
19 |           "column": "value",
20 |           "visible": true
21 |         }
22 |       ],
23 |       "dataSets": [
24 |         {
25 |           "minAlignmentPeriod": "600s",
26 |           "timeSeriesQuery": {
27 |             "outputFullDuration": true,
28 |             "timeSeriesFilter": {
29 |               "aggregation": {
30 |                 "alignmentPeriod": "600s",
31 |                 "perSeriesAligner": "ALIGN_RATE"
32 |               },
33 |               "filter": "metric.type=\"logging.googleapis.com/user/${METRIC_NAME}\" resource.type=\"tpu_worker\"",
34 |               "pickTimeSeriesFilter": {
35 |                 "direction": "TOP",
36 |                 "numTimeSeries": 300,
37 |                 "rankingMethod": "METHOD_MEAN"
38 |               },
39 |               "secondaryAggregation": {
40 |                 "alignmentPeriod": "600s",
41 |                 "crossSeriesReducer": "REDUCE_MEAN",
42 |                 "groupByFields": [
43 |                   "metric.label.\"node_id\"",
44 |                   "metric.label.\"worker_id\"",
45 |                   "metric.label.\"zone\""
46 |                 ],
47 |                 "perSeriesAligner": "ALIGN_MEAN"
48 |               }
49 |             }
50 |           }
51 |         }
52 |       ],
53 |       "metricVisualization": "BAR"
54 |     },
55 |     "title": "Stack Trace Log Entry Count per Period [Sorted by MEAN]"
56 |   },
57 |   "width": 12
58 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-log-panel.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "height": 6,
 3 |   "widget": {
 4 |     "logsPanel": {
 5 |       "filter": "resource.type=\"tpu_worker\" log_id(\"tpu.googleapis.com/runtime_monitor\") jsonPayload.verb=\"stacktraceanalyzer\"",
 6 |       "resourceNames": [
 7 |         "projects/${PROJECT_NUMBER}"
 8 |       ]
 9 |     },
10 |     "title": "Stack Trace Logs"
11 |   },
12 |   "width": 12,
13 |   "yPos": 4
14 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/log_metrics/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/log_metrics/stack_trace_counter.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | // Metric that counts the number of stack trace entries that match a specified filter within a specific period
16 | resource "google_logging_metric" "stack_trace_counter_metric" {
17 |   name        = "stack_trace_counter_gce"
18 |   project     = var.project_name
19 |   description = "Counts the number of stack trace log entries within a specific period."
20 |   filter      = "resource.type=\"tpu_worker\" AND log_id(\"tpu.googleapis.com/runtime_monitor\") AND jsonPayload.verb=\"stacktraceanalyzer\""
21 |   metric_descriptor {
22 |     metric_kind = "DELTA"
23 |     value_type  = "INT64"
24 |     labels {
25 |       key        = "zone"
26 |       value_type = "STRING"
27 |     }
28 |     labels {
29 |       key        = "node_id"
30 |       value_type = "STRING"
31 |     }
32 |     labels {
33 |       key        = "worker_id"
34 |       value_type = "STRING"
35 |     }
36 |   }
37 |   label_extractors = {
38 |     "zone"      = "EXTRACT(resource.labels.zone)",
39 |     "node_id"   = "EXTRACT(resource.labels.node_id)",
40 |     "worker_id" = "EXTRACT(resource.labels.worker_id)",
41 |   }
42 | }
43 | 
44 | output "stack_trace_counter_metric_id" {
45 |   value = google_logging_metric.stack_trace_counter_metric.id
46 | }
47 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gce/dashboard/logging_dashboard"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | locals {
16 |   outlier_count     = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count
17 |   node_prefix_regex = var.monitoring_dashboard_config.node_prefix == null ? "[a-z0-9-_]*" : "${var.monitoring_dashboard_config.node_prefix}[a-z0-9-_]*"
18 |   dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
19 |     {
20 |       TILE_1 = templatefile("${path.module}/dashboard_json/cpu-utilization.json",
21 |         {
22 |           OUTLIER_COUNT     = local.outlier_count,
23 |           NODE_PREFIX_REGEX = local.node_prefix_regex
24 |       }),
25 |       TILE_2 = templatefile("${path.module}/dashboard_json/tensorcore-idle-duration.json",
26 |         {
27 |           OUTLIER_COUNT     = local.outlier_count,
28 |           NODE_PREFIX_REGEX = local.node_prefix_regex
29 |       }),
30 |       TILE_3 = templatefile("${path.module}/dashboard_json/memory-usage.json",
31 |         {
32 |           OUTLIER_COUNT     = local.outlier_count,
33 |           NODE_PREFIX_REGEX = local.node_prefix_regex
34 |       }),
35 |       TILE_4 = templatefile("${path.module}/dashboard_json/network-bytes.json",
36 |         {
37 |           OUTLIER_COUNT     = local.outlier_count,
38 |           NODE_PREFIX_REGEX = local.node_prefix_regex
39 |       }),
40 |       TILE_5 = templatefile("${path.module}/dashboard_json/dcn-transfer-latency.json",
41 |         {
42 |           OUTLIER_COUNT     = local.outlier_count,
43 |           NODE_PREFIX_REGEX = local.node_prefix_regex
44 |       }),
45 |       TILE_6 = templatefile("${path.module}/dashboard_json/host-to-device-transfer-latency.json",
46 |         {
47 |           OUTLIER_COUNT     = local.outlier_count,
48 |           NODE_PREFIX_REGEX = local.node_prefix_regex
49 |       }),
50 |       TILE_7 = templatefile("${path.module}/dashboard_json/device-to-host-transfer-latency.json",
51 |         {
52 |           OUTLIER_COUNT     = local.outlier_count,
53 |           NODE_PREFIX_REGEX = local.node_prefix_regex
54 |       })
55 |   })
56 | }
57 | 
58 | resource "google_monitoring_dashboard" "monitoring_dashboard" {
59 |   project        = var.project_name
60 |   dashboard_json = local.dashboard_json
61 | }
62 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/cpu-utilization.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "TPU Worker - CPU Utilization Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_NONE"
 19 |               },
 20 |               "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 1
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "TPU Worker - CPU Utilization Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "crossSeriesReducer": "REDUCE_MEAN",
 55 |                 "groupByFields": [
 56 |                   "resource.label.\"node_id\"",
 57 |                   "resource.label.\"worker_id\""
 58 |                 ],
 59 |                 "perSeriesAligner": "ALIGN_MEAN"
 60 |               },
 61 |               "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 62 |               "pickTimeSeriesFilter": {
 63 |                 "direction": "TOP",
 64 |                 "numTimeSeries": ${OUTLIER_COUNT},
 65 |                 "rankingMethod": "METHOD_MEAN"
 66 |               }
 67 |             }
 68 |           }
 69 |         }
 70 |       ],
 71 |       "thresholds": [],
 72 |       "timeshiftDuration": "0s",
 73 |       "yAxis": {
 74 |         "label": "",
 75 |         "scale": "LINEAR"
 76 |       }
 77 |     }
 78 |   },
 79 |   "width": 6,
 80 |   "yPos": 5
 81 | },
 82 | {
 83 |   "height": 4,
 84 |   "widget": {
 85 |     "title": "TPU Worker - CPU Utilization Outliers [MAX]",
 86 |     "xyChart": {
 87 |       "chartOptions": {
 88 |         "mode": "COLOR"
 89 |       },
 90 |       "dataSets": [
 91 |         {
 92 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 93 |           "minAlignmentPeriod": "60s",
 94 |           "plotType": "LINE",
 95 |           "targetAxis": "Y1",
 96 |           "timeSeriesQuery": {
 97 |             "timeSeriesFilter": {
 98 |               "aggregation": {
 99 |                 "alignmentPeriod": "60s",
100 |                 "crossSeriesReducer": "REDUCE_MAX",
101 |                 "groupByFields": [
102 |                   "resource.label.\"node_id\"",
103 |                   "resource.label.\"worker_id\""
104 |                 ],
105 |                 "perSeriesAligner": "ALIGN_MAX"
106 |               },
107 |               "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
108 |               "pickTimeSeriesFilter": {
109 |                 "direction": "TOP",
110 |                 "numTimeSeries": ${OUTLIER_COUNT},
111 |                 "rankingMethod": "METHOD_MAX"
112 |               }
113 |             }
114 |           }
115 |         }
116 |       ],
117 |       "thresholds": [],
118 |       "timeshiftDuration": "0s",
119 |       "yAxis": {
120 |         "label": "",
121 |         "scale": "LINEAR"
122 |       }
123 |     }
124 |   },
125 |   "width": 6,
126 |   "xPos": 6,
127 |   "yPos": 5
128 | },
129 | {
130 |   "height": 8,
131 |   "widget": {
132 |     "collapsibleGroup": {
133 |       "collapsed": false
134 |     },
135 |     "title": "CPU Utilization on TPU Worker"
136 |   },
137 |   "width": 12,
138 |   "yPos": 1
139 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/dcn-transfer-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "GCE Instance - DCN Transfer Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 19 |                 "perSeriesAligner": "ALIGN_SUM"
 20 |               },
 21 |               "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 42
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "GCE Instance - DCN Transfer Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "metric.label.\"node_id\"",
 58 |                   "metric.label.\"worker_id\""
 59 |                 ],
 60 |                 "perSeriesAligner": "ALIGN_SUM"
 61 |               },
 62 |               "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 63 |               "pickTimeSeriesFilter": {
 64 |                 "direction": "TOP",
 65 |                 "numTimeSeries": ${OUTLIER_COUNT},
 66 |                 "rankingMethod": "METHOD_MAX"
 67 |               }
 68 |             }
 69 |           }
 70 |         }
 71 |       ],
 72 |       "thresholds": [],
 73 |       "timeshiftDuration": "0s",
 74 |       "yAxis": {
 75 |         "label": "",
 76 |         "scale": "LINEAR"
 77 |       }
 78 |     }
 79 |   },
 80 |   "width": 6,
 81 |   "yPos": 46
 82 | },
 83 | {
 84 |   "height": 4,
 85 |   "widget": {
 86 |     "title": "GCE Instance - DCN Transfer Latency Outliers [p99]",
 87 |     "xyChart": {
 88 |       "chartOptions": {
 89 |         "mode": "COLOR"
 90 |       },
 91 |       "dataSets": [
 92 |         {
 93 |           "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
 94 |           "minAlignmentPeriod": "60s",
 95 |           "plotType": "LINE",
 96 |           "targetAxis": "Y1",
 97 |           "timeSeriesQuery": {
 98 |             "timeSeriesFilter": {
 99 |               "aggregation": {
100 |                 "alignmentPeriod": "60s",
101 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
102 |                 "groupByFields": [
103 |                   "metric.label.\"node_id\"",
104 |                   "metric.label.\"worker_id\""
105 |                 ],
106 |                 "perSeriesAligner": "ALIGN_SUM"
107 |               },
108 |               "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
109 |               "pickTimeSeriesFilter": {
110 |                 "direction": "TOP",
111 |                 "numTimeSeries": ${OUTLIER_COUNT},
112 |                 "rankingMethod": "METHOD_MAX"
113 |               }
114 |             }
115 |           }
116 |         }
117 |       ],
118 |       "thresholds": [],
119 |       "timeshiftDuration": "0s",
120 |       "yAxis": {
121 |         "label": "",
122 |         "scale": "LINEAR"
123 |       }
124 |     }
125 |   },
126 |   "width": 6,
127 |   "xPos": 6,
128 |   "yPos": 46
129 | },
130 | {
131 |   "height": 8,
132 |   "widget": {
133 |     "collapsibleGroup": {
134 |       "collapsed": false
135 |     },
136 |     "title": "DCN Transfer Latency"
137 |   },
138 |   "width": 12,
139 |   "yPos": 42
140 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/device-to-host-transfer-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "GCE Instance - Device to Host Transfer Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 19 |                 "perSeriesAligner": "ALIGN_SUM"
 20 |               },
 21 |               "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 58
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "GCE Instance - Device to Host Transfer Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "metric.label.\"node_id\"",
 58 |                   "metric.label.\"worker_id\""
 59 |                 ],
 60 |                 "perSeriesAligner": "ALIGN_SUM"
 61 |               },
 62 |               "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 63 |               "pickTimeSeriesFilter": {
 64 |                 "direction": "TOP",
 65 |                 "numTimeSeries": ${OUTLIER_COUNT},
 66 |                 "rankingMethod": "METHOD_MAX"
 67 |               }
 68 |             }
 69 |           }
 70 |         }
 71 |       ],
 72 |       "thresholds": [],
 73 |       "timeshiftDuration": "0s",
 74 |       "yAxis": {
 75 |         "label": "",
 76 |         "scale": "LINEAR"
 77 |       }
 78 |     }
 79 |   },
 80 |   "width": 6,
 81 |   "yPos": 62
 82 | },
 83 | {
 84 |   "height": 4,
 85 |   "widget": {
 86 |     "title": "GCE Instance - Device to Host Transfer Latency Outliers [p99]",
 87 |     "xyChart": {
 88 |       "chartOptions": {
 89 |         "mode": "COLOR"
 90 |       },
 91 |       "dataSets": [
 92 |         {
 93 |           "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
 94 |           "minAlignmentPeriod": "60s",
 95 |           "plotType": "LINE",
 96 |           "targetAxis": "Y1",
 97 |           "timeSeriesQuery": {
 98 |             "timeSeriesFilter": {
 99 |               "aggregation": {
100 |                 "alignmentPeriod": "60s",
101 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
102 |                 "groupByFields": [
103 |                   "metric.label.\"node_id\"",
104 |                   "metric.label.\"worker_id\""
105 |                 ],
106 |                 "perSeriesAligner": "ALIGN_SUM"
107 |               },
108 |               "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
109 |               "pickTimeSeriesFilter": {
110 |                 "direction": "TOP",
111 |                 "numTimeSeries": ${OUTLIER_COUNT},
112 |                 "rankingMethod": "METHOD_MAX"
113 |               }
114 |             }
115 |           }
116 |         }
117 |       ],
118 |       "thresholds": [],
119 |       "timeshiftDuration": "0s",
120 |       "yAxis": {
121 |         "label": "",
122 |         "scale": "LINEAR"
123 |       }
124 |     }
125 |   },
126 |   "width": 6,
127 |   "xPos": 6,
128 |   "yPos": 62
129 | },
130 | {
131 |   "height": 8,
132 |   "widget": {
133 |     "collapsibleGroup": {
134 |       "collapsed": false
135 |     },
136 |     "title": "Device to Host Transfer Latency"
137 |   },
138 |   "width": 12,
139 |   "yPos": 58
140 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/host-to-device-transfer-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "GCE Instance - Host to Device Transfer Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 19 |                 "perSeriesAligner": "ALIGN_SUM"
 20 |               },
 21 |               "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 50
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "GCE Instance - Host to Device Transfer Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "metric.label.\"node_id\"",
 58 |                   "metric.label.\"worker_id\""
 59 |                 ],
 60 |                 "perSeriesAligner": "ALIGN_SUM"
 61 |               },
 62 |               "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 63 |               "pickTimeSeriesFilter": {
 64 |                 "direction": "TOP",
 65 |                 "numTimeSeries": ${OUTLIER_COUNT},
 66 |                 "rankingMethod": "METHOD_MAX"
 67 |               }
 68 |             }
 69 |           }
 70 |         }
 71 |       ],
 72 |       "thresholds": [],
 73 |       "timeshiftDuration": "0s",
 74 |       "yAxis": {
 75 |         "label": "",
 76 |         "scale": "LINEAR"
 77 |       }
 78 |     }
 79 |   },
 80 |   "width": 6,
 81 |   "yPos": 54
 82 | },
 83 | {
 84 |   "height": 4,
 85 |   "widget": {
 86 |     "title": "GCE Instance - Host to Device Transfer Latency Outliers [p99]",
 87 |     "xyChart": {
 88 |       "chartOptions": {
 89 |         "mode": "COLOR"
 90 |       },
 91 |       "dataSets": [
 92 |         {
 93 |           "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
 94 |           "minAlignmentPeriod": "60s",
 95 |           "plotType": "LINE",
 96 |           "targetAxis": "Y1",
 97 |           "timeSeriesQuery": {
 98 |             "timeSeriesFilter": {
 99 |               "aggregation": {
100 |                 "alignmentPeriod": "60s",
101 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
102 |                 "groupByFields": [
103 |                   "metric.label.\"node_id\"",
104 |                   "metric.label.\"worker_id\""
105 |                 ],
106 |                 "perSeriesAligner": "ALIGN_SUM"
107 |               },
108 |               "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
109 |               "pickTimeSeriesFilter": {
110 |                 "direction": "TOP",
111 |                 "numTimeSeries": ${OUTLIER_COUNT},
112 |                 "rankingMethod": "METHOD_MAX"
113 |               }
114 |             }
115 |           }
116 |         }
117 |       ],
118 |       "thresholds": [],
119 |       "timeshiftDuration": "0s",
120 |       "yAxis": {
121 |         "label": "",
122 |         "scale": "LINEAR"
123 |       }
124 |     }
125 |   },
126 |   "width": 6,
127 |   "xPos": 6,
128 |   "yPos": 54
129 | },
130 | {
131 |   "height": 8,
132 |   "widget": {
133 |     "collapsibleGroup": {
134 |       "collapsed": false
135 |     },
136 |     "title": "Host to Device Transfer Latency"
137 |   },
138 |   "width": 12,
139 |   "yPos": 50
140 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "category": "CUSTOM",
 3 |   "displayName": "GCE - TPU Monitoring Dashboard",
 4 |   "dashboardFilters": [
 5 |    {
 6 |      "filterType": "RESOURCE_LABEL",
 7 |      "labelKey": "worker_id"
 8 |    }
 9 |   ],
10 |   "mosaicLayout": {
11 |     "columns": 12,
12 |     "tiles": [
13 |       {
14 |         "height": 1,
15 |         "widget": {
16 |           "title": "TPU Worker Metrics",
17 |           "text": {
18 |             "content": ""
19 |           }
20 |         },
21 |         "width": 12,
22 |         "yPos": 0
23 |       },
24 |       ${TILE_1},
25 |       ${TILE_2},
26 |       ${TILE_3},
27 |       ${TILE_4},
28 |       {
29 |         "height": 1,
30 |         "widget": {
31 |           "title": "Megascale Metrics",
32 |           "text": {
33 |             "content": ""
34 |           }
35 |         },
36 |         "width": 12,
37 |         "yPos": 41
38 |       },
39 |       ${TILE_5},
40 |       ${TILE_6},
41 |       ${TILE_7}
42 |     ]
43 |   }
44 |  }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/memory-usage.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "TPU VM - Memory Usage Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_NONE"
 19 |               },
 20 |               "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 9
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "TPU VM - Memory Usage Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "crossSeriesReducer": "REDUCE_MEAN",
 55 |                 "groupByFields": [
 56 |                   "resource.label.\"node_id\"",
 57 |                   "resource.label.\"worker_id\""
 58 |                 ],
 59 |                 "perSeriesAligner": "ALIGN_MEAN"
 60 |               },
 61 |               "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 62 |               "pickTimeSeriesFilter": {
 63 |                 "direction": "TOP",
 64 |                 "numTimeSeries": ${OUTLIER_COUNT},
 65 |                 "rankingMethod": "METHOD_MEAN"
 66 |               }
 67 |             }
 68 |           }
 69 |         }
 70 |       ],
 71 |       "thresholds": [],
 72 |       "timeshiftDuration": "0s",
 73 |       "yAxis": {
 74 |         "label": "",
 75 |         "scale": "LINEAR"
 76 |       }
 77 |     }
 78 |   },
 79 |   "width": 6,
 80 |   "yPos": 13
 81 | },
 82 | {
 83 |   "height": 4,
 84 |   "widget": {
 85 |     "title": "TPU VM - Memory Usage Outliers [MAX]",
 86 |     "xyChart": {
 87 |       "chartOptions": {
 88 |         "mode": "COLOR"
 89 |       },
 90 |       "dataSets": [
 91 |         {
 92 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 93 |           "minAlignmentPeriod": "60s",
 94 |           "plotType": "LINE",
 95 |           "targetAxis": "Y1",
 96 |           "timeSeriesQuery": {
 97 |             "timeSeriesFilter": {
 98 |               "aggregation": {
 99 |                 "alignmentPeriod": "60s",
100 |                 "crossSeriesReducer": "REDUCE_MAX",
101 |                 "groupByFields": [
102 |                   "resource.label.\"node_id\"",
103 |                   "resource.label.\"worker_id\""
104 |                 ],
105 |                 "perSeriesAligner": "ALIGN_MAX"
106 |               },
107 |               "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
108 |               "pickTimeSeriesFilter": {
109 |                 "direction": "TOP",
110 |                 "numTimeSeries": ${OUTLIER_COUNT},
111 |                 "rankingMethod": "METHOD_MAX"
112 |               }
113 |             }
114 |           }
115 |         }
116 |       ],
117 |       "thresholds": [],
118 |       "timeshiftDuration": "0s",
119 |       "yAxis": {
120 |         "label": "",
121 |         "scale": "LINEAR"
122 |       }
123 |     }
124 |   },
125 |   "width": 6,
126 |   "xPos": 6,
127 |   "yPos": 13
128 | },
129 | {
130 |   "height": 8,
131 |   "widget": {
132 |     "collapsibleGroup": {
133 |       "collapsed": false
134 |     },
135 |     "title": "Memory Usage by TPU VM"
136 |   },
137 |   "width": 12,
138 |   "yPos": 9
139 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/network-bytes.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "TPU VM - Network Bytes Sent Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_NONE"
 19 |               },
 20 |               "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 25
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "TPU VM - Network Bytes Sent Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "perSeriesAligner": "ALIGN_RATE"
 55 |               },
 56 |               "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 57 |               "pickTimeSeriesFilter": {
 58 |                 "direction": "TOP",
 59 |                 "numTimeSeries": ${OUTLIER_COUNT},
 60 |                 "rankingMethod": "METHOD_MEAN"
 61 |               }
 62 |             }
 63 |           }
 64 |         }
 65 |       ],
 66 |       "thresholds": [],
 67 |       "timeshiftDuration": "0s",
 68 |       "yAxis": {
 69 |         "label": "",
 70 |         "scale": "LINEAR"
 71 |       }
 72 |     }
 73 |   },
 74 |   "width": 6,
 75 |   "yPos": 29
 76 | },
 77 | {
 78 |   "height": 4,
 79 |   "widget": {
 80 |     "title": "TPU VM - Network Bytes Sent Outliers [MAX]",
 81 |     "xyChart": {
 82 |       "chartOptions": {
 83 |         "mode": "COLOR"
 84 |       },
 85 |       "dataSets": [
 86 |         {
 87 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 88 |           "minAlignmentPeriod": "60s",
 89 |           "plotType": "LINE",
 90 |           "targetAxis": "Y1",
 91 |           "timeSeriesQuery": {
 92 |             "timeSeriesFilter": {
 93 |               "aggregation": {
 94 |                 "alignmentPeriod": "60s",
 95 |                 "perSeriesAligner": "ALIGN_RATE"
 96 |               },
 97 |               "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 98 |               "pickTimeSeriesFilter": {
 99 |                 "direction": "TOP",
100 |                 "numTimeSeries": ${OUTLIER_COUNT},
101 |                 "rankingMethod": "METHOD_MAX"
102 |               }
103 |             }
104 |           }
105 |         }
106 |       ],
107 |       "thresholds": [],
108 |       "timeshiftDuration": "0s",
109 |       "yAxis": {
110 |         "label": "",
111 |         "scale": "LINEAR"
112 |       }
113 |     }
114 |   },
115 |   "width": 6,
116 |   "xPos": 6,
117 |   "yPos": 29
118 | },
119 | {
120 |   "height": 4,
121 |   "widget": {
122 |     "title": "TPU VM - Network Bytes Received Stats",
123 |     "xyChart": {
124 |       "chartOptions": {
125 |         "mode": "STATS"
126 |       },
127 |       "dataSets": [
128 |         {
129 |           "minAlignmentPeriod": "60s",
130 |           "plotType": "LINE",
131 |           "targetAxis": "Y1",
132 |           "timeSeriesQuery": {
133 |             "timeSeriesFilter": {
134 |               "aggregation": {
135 |                 "alignmentPeriod": "60s",
136 |                 "perSeriesAligner": "ALIGN_NONE"
137 |               },
138 |               "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
139 |             }
140 |           }
141 |         }
142 |       ],
143 |       "thresholds": [],
144 |       "timeshiftDuration": "0s",
145 |       "yAxis": {
146 |         "label": "",
147 |         "scale": "LINEAR"
148 |       }
149 |     }
150 |   },
151 |   "width": 12,
152 |   "yPos": 33
153 | },
154 | {
155 |   "height": 4,
156 |   "widget": {
157 |     "title": "TPU VM - Network Bytes Received Outliers [MEAN]",
158 |     "xyChart": {
159 |       "chartOptions": {
160 |         "mode": "COLOR"
161 |       },
162 |       "dataSets": [
163 |         {
164 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
165 |           "minAlignmentPeriod": "60s",
166 |           "plotType": "LINE",
167 |           "targetAxis": "Y1",
168 |           "timeSeriesQuery": {
169 |             "timeSeriesFilter": {
170 |               "aggregation": {
171 |                 "alignmentPeriod": "60s",
172 |                 "perSeriesAligner": "ALIGN_RATE"
173 |               },
174 |               "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
175 |               "pickTimeSeriesFilter": {
176 |                 "direction": "TOP",
177 |                 "numTimeSeries": ${OUTLIER_COUNT},
178 |                 "rankingMethod": "METHOD_MEAN"
179 |               }
180 |             }
181 |           }
182 |         }
183 |       ],
184 |       "thresholds": [],
185 |       "timeshiftDuration": "0s",
186 |       "yAxis": {
187 |         "label": "",
188 |         "scale": "LINEAR"
189 |       }
190 |     }
191 |   },
192 |   "width": 6,
193 |   "yPos": 37
194 | },
195 | {
196 |   "height": 4,
197 |   "widget": {
198 |     "title": "TPU VM - Network Bytes Received Outliers [MAX]",
199 |     "xyChart": {
200 |       "chartOptions": {
201 |         "mode": "COLOR"
202 |       },
203 |       "dataSets": [
204 |         {
205 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
206 |           "minAlignmentPeriod": "60s",
207 |           "plotType": "LINE",
208 |           "targetAxis": "Y1",
209 |           "timeSeriesQuery": {
210 |             "timeSeriesFilter": {
211 |               "aggregation": {
212 |                 "alignmentPeriod": "60s",
213 |                 "perSeriesAligner": "ALIGN_RATE"
214 |               },
215 |               "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
216 |               "pickTimeSeriesFilter": {
217 |                 "direction": "TOP",
218 |                 "numTimeSeries": ${OUTLIER_COUNT},
219 |                 "rankingMethod": "METHOD_MAX"
220 |               }
221 |             }
222 |           }
223 |         }
224 |       ],
225 |       "thresholds": [],
226 |       "timeshiftDuration": "0s",
227 |       "yAxis": {
228 |         "label": "",
229 |         "scale": "LINEAR"
230 |       }
231 |     }
232 |   },
233 |   "width": 6,
234 |   "xPos": 6,
235 |   "yPos": 37
236 | },
237 | {
238 |   "height": 16,
239 |   "widget": {
240 |     "collapsibleGroup": {
241 |       "collapsed": false
242 |     },
243 |     "title": "Network Bytes Received and Sent by TPU VM"
244 |   },
245 |   "width": 12,
246 |   "yPos": 25
247 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/tensorcore-idle-duration.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Tensorcore Idle Duration Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_NONE"
 19 |               },
 20 |               "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 17
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "Tensorcore Idle Duration Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "crossSeriesReducer": "REDUCE_MEAN",
 55 |                 "groupByFields": [
 56 |                   "resource.label.\"node_id\"",
 57 |                   "resource.label.\"worker_id\""
 58 |                 ],
 59 |                 "perSeriesAligner": "ALIGN_MEAN"
 60 |               },
 61 |               "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
 62 |               "pickTimeSeriesFilter": {
 63 |                 "direction": "TOP",
 64 |                 "numTimeSeries": ${OUTLIER_COUNT},
 65 |                 "rankingMethod": "METHOD_MEAN"
 66 |               }
 67 |             }
 68 |           }
 69 |         }
 70 |       ],
 71 |       "thresholds": [],
 72 |       "timeshiftDuration": "0s",
 73 |       "yAxis": {
 74 |         "label": "",
 75 |         "scale": "LINEAR"
 76 |       }
 77 |     }
 78 |   },
 79 |   "width": 6,
 80 |   "yPos": 21
 81 | },
 82 | {
 83 |   "height": 4,
 84 |   "widget": {
 85 |     "title": "Tensorcore Idle Duration Outliers [MAX]",
 86 |     "xyChart": {
 87 |       "chartOptions": {
 88 |         "mode": "COLOR"
 89 |       },
 90 |       "dataSets": [
 91 |         {
 92 |           "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
 93 |           "minAlignmentPeriod": "60s",
 94 |           "plotType": "LINE",
 95 |           "targetAxis": "Y1",
 96 |           "timeSeriesQuery": {
 97 |             "timeSeriesFilter": {
 98 |               "aggregation": {
 99 |                 "alignmentPeriod": "60s",
100 |                 "crossSeriesReducer": "REDUCE_MAX",
101 |                 "groupByFields": [
102 |                   "resource.label.\"node_id\"",
103 |                   "resource.label.\"worker_id\""
104 |                 ],
105 |                 "perSeriesAligner": "ALIGN_MAX"
106 |               },
107 |               "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
108 |               "pickTimeSeriesFilter": {
109 |                 "direction": "TOP",
110 |                 "numTimeSeries": ${OUTLIER_COUNT},
111 |                 "rankingMethod": "METHOD_MAX"
112 |               }
113 |             }
114 |           }
115 |         }
116 |       ],
117 |       "thresholds": [],
118 |       "timeshiftDuration": "0s",
119 |       "yAxis": {
120 |         "label": "",
121 |         "scale": "LINEAR"
122 |       }
123 |     }
124 |   },
125 |   "width": 6,
126 |   "xPos": 6,
127 |   "yPos": 21
128 | },
129 | {
130 |   "height": 8,
131 |   "widget": {
132 |     "collapsibleGroup": {
133 |       "collapsed": false
134 |     },
135 |     "title": "Tensorcore Idle Duration of TPU Chip"
136 |   },
137 |   "width": 12,
138 |   "yPos": 17
139 | }


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 
20 | variable "monitoring_dashboard_config" {
21 |   type = object({
22 |     node_prefix : optional(string),
23 |     outlier_count : optional(number)
24 |   })
25 |   description = <<EOF
26 |   Configuration for monitoring dashboard:
27 |   {
28 |     "node_prefix": "prefix used to generate the node name in multislice node provision, default to empty string",
29 |     "outlier_count": "number of outliers to show on dashboard, default to 10 if not set"
30 |   }
31 |   Enter {} to set default configuration for monitoring dashboard.
32 |   EOF
33 | }
34 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gce/dashboard/monitoring_dashboard"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/log_storage/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 
20 | // Valid inputs:
21 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":"<log_bucket_name>"}
22 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<log_bucket_name>", "retention_days":x}
23 | // 3. To not create stack trace bucket: {}
24 | variable "stack_trace_bucket_config" {
25 |   type = object({
26 |     bucket_name : optional(string)
27 |     retention_days : optional(number)
28 |   })
29 |   validation {
30 |     condition = (
31 |       (var.stack_trace_bucket_config.bucket_name == null &&
32 |       var.stack_trace_bucket_config.retention_days == null) ||
33 |       (var.stack_trace_bucket_config.bucket_name != null)
34 |     )
35 |     error_message = "bucket_name is not defined for stack_trace_bucket_config."
36 |   }
37 |   description = <<EOF
38 |   Configuration to create a log bucket to store stack traces:
39 |   {
40 |     "bucket_name": "name of log bucket to create",
41 |     "retention_days": number of days to retain stack traces, default to 30 days if not set
42 |   }
43 |   Enter {} to not create separate log bucket for stack traces.
44 |   EOF
45 | }
46 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/log_storage/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gce/log_storage"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gce/resources/log_storage/stack-trace-bucket.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | locals {
16 |   stack_trace_filter         = "projects/${var.project_name}/logs/tpu.googleapis.com%2Fruntime_monitor AND jsonPayload.verb=stacktraceanalyzer"
17 |   stack_trace_bucket_counter = var.stack_trace_bucket_config.bucket_name == null ? 0 : 1
18 | }
19 | 
20 | resource "google_logging_project_bucket_config" "log_bucket" {
21 |   count    = local.stack_trace_bucket_counter
22 |   project  = var.project_name
23 |   location = "global"
24 |   // default retention period is 30 days
25 |   retention_days = var.stack_trace_bucket_config.retention_days == null ? 30 : var.stack_trace_bucket_config.retention_days
26 |   bucket_id      = var.stack_trace_bucket_config.bucket_name
27 | }
28 | 
29 | resource "google_logging_project_sink" "log_sink" {
30 |   count       = local.stack_trace_bucket_counter
31 |   project     = var.project_name
32 |   name        = "${var.stack_trace_bucket_config.bucket_name}_sink"
33 |   destination = "logging.googleapis.com/projects/${var.project_name}/locations/global/buckets/${google_logging_project_bucket_config.log_bucket[count.index].bucket_id}"
34 |   filter      = local.stack_trace_filter
35 | }
36 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 
20 | variable "monitoring_dashboard_config" {
21 |   type = object({
22 |     outlier_count : optional(number)
23 |   })
24 |   description = <<EOF
25 |   Configuration for monitoring dashboard:
26 |   {
27 |     "outlier_count": "number of outliers to show on dashboard, default to 10 if not set"
28 |   }
29 |   Enter {} to set default configuration for monitoring dashboard.
30 |   EOF
31 | }
32 | 
33 | // Valid inputs:
34 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":"<log_bucket_name>"}
35 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<log_bucket_name>", "retention_days":x}
36 | // 3. To not create stack trace bucket: {}
37 | variable "stack_trace_bucket_config" {
38 |   type = object({
39 |     bucket_name : optional(string)
40 |     retention_days : optional(number)
41 |   })
42 |   validation {
43 |     condition = (
44 |       (var.stack_trace_bucket_config.bucket_name == null &&
45 |       var.stack_trace_bucket_config.retention_days == null) ||
46 |       (var.stack_trace_bucket_config.bucket_name != null)
47 |     )
48 |     error_message = "bucket_name is not defined for stack_trace_bucket_config."
49 |   }
50 |   description = <<EOF
51 |   Configuration to create a log bucket to store stack traces:
52 |   {
53 |     "bucket_name": "name of log bucket to create",
54 |     "retention_days": number of days to retain stack traces, default to 30 days if not set
55 |   }
56 |   Enter {} to not create separate log bucket for stack traces.
57 |   EOF
58 | }
59 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gke"
30 |   }
31 | }
32 | 
33 | module "monitoring_dashboard" {
34 |   source                      = "./resources/dashboard/monitoring_dashboard"
35 |   project_name                = var.project_name
36 |   monitoring_dashboard_config = var.monitoring_dashboard_config
37 | }
38 | 
39 | module "logging_dashboard" {
40 |   source       = "./resources/dashboard/logging_dashboard"
41 |   project_name = var.project_name
42 | }
43 | 
44 | module "log_storage" {
45 |   source                    = "./resources/log_storage"
46 |   project_name              = var.project_name
47 |   stack_trace_bucket_config = var.stack_trace_bucket_config
48 | }
49 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | data "google_project" "project" {
16 |   project_id = var.project_name
17 | }
18 | 
19 | // Add a dependency on log_metrics module to deploy log-based metrics before deploying logging dashboard
20 | module "log_metrics" {
21 |   source       = "./log_metrics"
22 |   project_name = var.project_name
23 | }
24 | 
25 | locals {
26 |   dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
27 |     {
28 |       TILE_1 = templatefile("${path.module}/dashboard_json/stack-trace-counter-metric.json",
29 |         {
30 |           METRIC_NAME = module.log_metrics.stack_trace_counter_metric_id
31 |       }),
32 |       TILE_2 = templatefile("${path.module}/dashboard_json/stack-trace-log-panel.json",
33 |         {
34 |           PROJECT_NUMBER = data.google_project.project.number
35 |       })
36 |   })
37 | }
38 | 
39 | resource "google_monitoring_dashboard" "logging_dashboard" {
40 |   project        = var.project_name
41 |   dashboard_json = local.dashboard_json
42 |   depends_on     = [module.log_metrics.stack_trace_counter_metric]
43 | }
44 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "category": "CUSTOM",
 3 |   "displayName": "GKE - TPU Logging Dashboard",
 4 |   "dashboardFilters": [
 5 |     {
 6 |       "filterType": "RESOURCE_LABEL",
 7 |       "labelKey": "cluster_name",
 8 |       "templateVariable": "ClusterName"
 9 |     },
10 |     {
11 |       "filterType": "USER_METADATA_LABEL",
12 |       "labelKey": "jobset.sigs.k8s.io/jobset-name",
13 |       "templateVariable": "JobName"
14 |     }
15 |   ],
16 |   "mosaicLayout": {
17 |     "columns": 12,
18 |     "tiles": [
19 |       ${TILE_1},
20 |       ${TILE_2},
21 |       {
22 |         "height": 10,
23 |         "widget": {
24 |           "collapsibleGroup": {
25 |             "collapsed": false
26 |           },
27 |           "title": "TPU VM Process Debugging"
28 |         },
29 |         "width": 12
30 |       }
31 |     ]
32 |   }
33 | }


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-counter-metric.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "height": 4,
 3 |   "widget": {
 4 |     "timeSeriesTable": {
 5 |       "columnSettings": [
 6 |         {
 7 |           "column": "location",
 8 |           "visible": true
 9 |         },
10 |         {
11 |           "column": "pod",
12 |           "visible": true
13 |         },
14 |         {
15 |           "column": "cluster",
16 |           "visible": true
17 |         },
18 |         {
19 |           "column": "job_name",
20 |           "visible": true
21 |         },
22 |         {
23 |           "column": "value",
24 |           "visible": true
25 |         }
26 |       ],
27 |       "dataSets": [
28 |         {
29 |           "minAlignmentPeriod": "600s",
30 |           "timeSeriesQuery": {
31 |             "outputFullDuration": true,
32 |             "timeSeriesFilter": {
33 |               "aggregation": {
34 |                 "alignmentPeriod": "600s",
35 |                 "perSeriesAligner": "ALIGN_RATE"
36 |               },
37 |               "filter": "metric.type=\"logging.googleapis.com/user/${METRIC_NAME}\" resource.type=\"k8s_container\" $${ClusterName} $${JobName}",
38 |               "secondaryAggregation": {
39 |                 "alignmentPeriod": "600s",
40 |                 "crossSeriesReducer": "REDUCE_MEAN",
41 |                 "groupByFields": [
42 |                   "metric.label.\"location\"",
43 |                   "metric.label.\"pod\"",
44 |                   "metric.label.\"cluster\"",
45 |                   "metric.label.\"job_name\""
46 |                 ],
47 |                 "perSeriesAligner": "ALIGN_MEAN"
48 |               }
49 |             }
50 |           }
51 |         }
52 |       ],
53 |       "metricVisualization": "BAR"
54 |     },
55 |     "title": "Stack Trace Log Entry Count per Period [Sorted by MEAN]"
56 |   },
57 |   "width": 12
58 | }


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-log-panel.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "height": 6,
 3 |   "widget": {
 4 |     "logsPanel": {
 5 |       "filter": "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\" AND $${ClusterName}",
 6 |       "resourceNames": [
 7 |         "projects/${PROJECT_NUMBER}"
 8 |       ]
 9 |     },
10 |     "title": "Stack Trace Logs"
11 |   },
12 |   "width": 12,
13 |   "yPos": 4
14 | }


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/log_metrics/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/log_metrics/stack_trace_counter.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | // Metric that counts the number of stack trace entries that match a specified filter within a specific period
16 | resource "google_logging_metric" "stack_trace_counter_metric" {
17 |   name        = "stack_trace_counter_gke"
18 |   project     = var.project_name
19 |   description = "Counts the number of stack trace log entries within a specific period."
20 |   filter      = "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\""
21 |   metric_descriptor {
22 |     metric_kind = "DELTA"
23 |     value_type  = "INT64"
24 |     labels {
25 |       key        = "location"
26 |       value_type = "STRING"
27 |     }
28 |     labels {
29 |       key        = "cluster"
30 |       value_type = "STRING"
31 |     }
32 |     labels {
33 |       key        = "pod"
34 |       value_type = "STRING"
35 |     }
36 |     labels {
37 |       key        = "job_name"
38 |       value_type = "STRING"
39 |     }
40 |   }
41 |   label_extractors = {
42 |     "location" = "EXTRACT(resource.labels.location)",
43 |     "cluster"  = "EXTRACT(resource.labels.cluster_name)",
44 |     "pod"      = "EXTRACT(resource.labels.pod_name)",
45 |     "job_name" = "EXTRACT(labels.k8s-pod/job-name)",
46 |   }
47 | }
48 | 
49 | output "stack_trace_counter_metric_id" {
50 |   value = google_logging_metric.stack_trace_counter_metric.id
51 | }
52 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gke/dashboard/logging_dashboard"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | locals {
16 |   outlier_count = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count
17 |   dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
18 |     {
19 |       TILE_1 = templatefile("${path.module}/dashboard_json/cpu-utilization.json",
20 |         {
21 |           OUTLIER_COUNT = local.outlier_count
22 |       }),
23 |       TILE_2 = templatefile("${path.module}/dashboard_json/memory-usage.json",
24 |         {
25 |           OUTLIER_COUNT = local.outlier_count
26 |       }),
27 |       TILE_3 = templatefile("${path.module}/dashboard_json/accelerator-memory-used.json",
28 |         {
29 |           OUTLIER_COUNT = local.outlier_count
30 |       }),
31 |       TILE_4 = templatefile("${path.module}/dashboard_json/duty-cycle.json",
32 |         {
33 |           OUTLIER_COUNT = local.outlier_count
34 |       }),
35 |       TILE_5 = templatefile("${path.module}/dashboard_json/network-bytes.json",
36 |         {
37 |           OUTLIER_COUNT = local.outlier_count
38 |       }),
39 |       TILE_6 = templatefile("${path.module}/dashboard_json/dcn-transfer-latency.json",
40 |         {
41 |           OUTLIER_COUNT = local.outlier_count
42 |       }),
43 |       TILE_7 = templatefile("${path.module}/dashboard_json/collectives-latency.json",
44 |         {
45 |           OUTLIER_COUNT = local.outlier_count
46 |       }),
47 |       TILE_8 = templatefile("${path.module}/dashboard_json/host-to-device-transfer-latency.json",
48 |         {
49 |           OUTLIER_COUNT = local.outlier_count
50 |       }),
51 |       TILE_9 = templatefile("${path.module}/dashboard_json/device-to-host-transfer-latency.json",
52 |         {
53 |           OUTLIER_COUNT = local.outlier_count
54 |       })
55 |   })
56 | }
57 | 
58 | resource "google_monitoring_dashboard" "monitoring_dashboard" {
59 |   project        = var.project_name
60 |   dashboard_json = local.dashboard_json
61 | }
62 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/accelerator-memory-used.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Accelerator Memory Used Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_MEAN"
 19 |               },
 20 |               "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 16
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "Accelerator Memory Used Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "perSeriesAligner": "ALIGN_MEAN"
 55 |               },
 56 |               "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 57 |               "pickTimeSeriesFilter": {
 58 |                 "direction": "TOP",
 59 |                 "numTimeSeries": ${OUTLIER_COUNT},
 60 |                 "rankingMethod": "METHOD_MEAN"
 61 |               },
 62 |               "secondaryAggregation": {
 63 |                 "alignmentPeriod": "60s",
 64 |                 "crossSeriesReducer": "REDUCE_MEAN",
 65 |                 "groupByFields": [
 66 |                   "resource.label.\"cluster_name\"",
 67 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 68 |                   "resource.label.\"pod_name\""
 69 |                 ],
 70 |                 "perSeriesAligner": "ALIGN_NONE"
 71 |               }
 72 |             }
 73 |           }
 74 |         }
 75 |       ],
 76 |       "thresholds": [],
 77 |       "timeshiftDuration": "0s",
 78 |       "yAxis": {
 79 |         "label": "",
 80 |         "scale": "LINEAR"
 81 |       }
 82 |     }
 83 |   },
 84 |   "width": 6,
 85 |   "yPos": 20
 86 | },
 87 | {
 88 |   "height": 4,
 89 |   "widget": {
 90 |     "title": "Accelerator Memory Used Outliers [MAX]",
 91 |     "xyChart": {
 92 |       "chartOptions": {
 93 |         "mode": "COLOR"
 94 |       },
 95 |       "dataSets": [
 96 |         {
 97 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 98 |           "minAlignmentPeriod": "60s",
 99 |           "plotType": "LINE",
100 |           "targetAxis": "Y1",
101 |           "timeSeriesQuery": {
102 |             "timeSeriesFilter": {
103 |               "aggregation": {
104 |                 "alignmentPeriod": "60s",
105 |                 "perSeriesAligner": "ALIGN_MEAN"
106 |               },
107 |               "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
108 |               "pickTimeSeriesFilter": {
109 |                 "direction": "TOP",
110 |                 "numTimeSeries": ${OUTLIER_COUNT},
111 |                 "rankingMethod": "METHOD_MAX"
112 |               },
113 |               "secondaryAggregation": {
114 |                 "alignmentPeriod": "60s",
115 |                 "crossSeriesReducer": "REDUCE_MAX",
116 |                 "groupByFields": [
117 |                   "resource.label.\"cluster_name\"",
118 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
119 |                   "resource.label.\"pod_name\""
120 |                 ],
121 |                 "perSeriesAligner": "ALIGN_NONE"
122 |               }
123 |             }
124 |           }
125 |         }
126 |       ],
127 |       "thresholds": [],
128 |       "timeshiftDuration": "0s",
129 |       "yAxis": {
130 |         "label": "",
131 |         "scale": "LINEAR"
132 |       }
133 |     }
134 |   },
135 |   "width": 6,
136 |   "xPos": 6,
137 |   "yPos": 20
138 | },
139 | {
140 |   "height": 8,
141 |   "widget": {
142 |     "collapsibleGroup": {
143 |       "collapsed": false
144 |     },
145 |     "title": "Accelerator Memory Used by TPU Slice"
146 |   },
147 |   "width": 12,
148 |   "yPos": 16
149 | }
150 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/collectives-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Collectives Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "COLOR"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "HEATMAP",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_SUM",
 19 |                 "perSeriesAligner": "ALIGN_DELTA"
 20 |               },
 21 |               "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 57
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "Collectives Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "resource.label.\"cluster_name\"",
 58 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 59 |                   "resource.label.\"pod_name\""
 60 |                 ],
 61 |                 "perSeriesAligner": "ALIGN_PERCENTILE_50"
 62 |               },
 63 |               "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 64 |               "pickTimeSeriesFilter": {
 65 |                 "direction": "TOP",
 66 |                 "numTimeSeries": ${OUTLIER_COUNT},
 67 |                 "rankingMethod": "METHOD_MAX"
 68 |               }
 69 |             }
 70 |           }
 71 |         }
 72 |       ],
 73 |       "thresholds": [],
 74 |       "timeshiftDuration": "0s",
 75 |       "yAxis": {
 76 |         "label": "",
 77 |         "scale": "LINEAR"
 78 |       }
 79 |     }
 80 |   },
 81 |   "width": 6,
 82 |   "yPos": 61
 83 | },
 84 | {
 85 |   "height": 4,
 86 |   "widget": {
 87 |     "title": "Collectives Latency Outliers [p99]",
 88 |     "xyChart": {
 89 |       "chartOptions": {
 90 |         "mode": "COLOR"
 91 |       },
 92 |       "dataSets": [
 93 |         {
 94 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 95 |           "minAlignmentPeriod": "60s",
 96 |           "plotType": "LINE",
 97 |           "targetAxis": "Y1",
 98 |           "timeSeriesQuery": {
 99 |             "timeSeriesFilter": {
100 |               "aggregation": {
101 |                 "alignmentPeriod": "60s",
102 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 |                 "groupByFields": [
104 |                   "resource.label.\"cluster_name\"",
105 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 |                   "resource.label.\"pod_name\""
107 |                 ],
108 |                 "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 |               },
110 |               "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 |               "pickTimeSeriesFilter": {
112 |                 "direction": "TOP",
113 |                 "numTimeSeries": ${OUTLIER_COUNT},
114 |                 "rankingMethod": "METHOD_MAX"
115 |               }
116 |             }
117 |           }
118 |         }
119 |       ],
120 |       "thresholds": [],
121 |       "timeshiftDuration": "0s",
122 |       "yAxis": {
123 |         "label": "",
124 |         "scale": "LINEAR"
125 |       }
126 |     }
127 |   },
128 |   "width": 6,
129 |   "xPos": 6,
130 |   "yPos": 61
131 | },
132 | {
133 |   "height": 8,
134 |   "widget": {
135 |     "collapsibleGroup": {
136 |       "collapsed": false
137 |     },
138 |     "title": "Collectives Latency"
139 |   },
140 |   "width": 12,
141 |   "yPos": 57
142 | }
143 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/cpu-utilization.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "CPU Utilization Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_RATE"
 19 |               },
 20 |               "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12
 34 | },
 35 | {
 36 |   "height": 4,
 37 |   "widget": {
 38 |     "title": "CPU Utilization Outliers [MEAN]",
 39 |     "xyChart": {
 40 |       "chartOptions": {
 41 |         "mode": "COLOR"
 42 |       },
 43 |       "dataSets": [
 44 |         {
 45 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 46 |           "minAlignmentPeriod": "60s",
 47 |           "plotType": "LINE",
 48 |           "targetAxis": "Y1",
 49 |           "timeSeriesQuery": {
 50 |             "timeSeriesFilter": {
 51 |               "aggregation": {
 52 |                 "alignmentPeriod": "60s",
 53 |                 "perSeriesAligner": "ALIGN_RATE"
 54 |               },
 55 |               "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 56 |               "pickTimeSeriesFilter": {
 57 |                 "direction": "TOP",
 58 |                 "numTimeSeries": ${OUTLIER_COUNT},
 59 |                 "rankingMethod": "METHOD_MEAN"
 60 |               },
 61 |               "secondaryAggregation": {
 62 |                 "alignmentPeriod": "60s",
 63 |                 "crossSeriesReducer": "REDUCE_MEAN",
 64 |                 "groupByFields": [
 65 |                   "resource.label.\"cluster_name\"",
 66 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 67 |                   "resource.label.\"pod_name\""
 68 |                 ],
 69 |                 "perSeriesAligner": "ALIGN_NONE"
 70 |               }
 71 |             }
 72 |           }
 73 |         }
 74 |       ],
 75 |       "thresholds": [],
 76 |       "timeshiftDuration": "0s",
 77 |       "yAxis": {
 78 |         "label": "",
 79 |         "scale": "LINEAR"
 80 |       }
 81 |     }
 82 |   },
 83 |   "width": 6,
 84 |   "yPos": 4
 85 | },
 86 | {
 87 |   "height": 4,
 88 |   "widget": {
 89 |     "title": "CPU Utilization Outliers [MAX]",
 90 |     "xyChart": {
 91 |       "chartOptions": {
 92 |         "mode": "COLOR"
 93 |       },
 94 |       "dataSets": [
 95 |         {
 96 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 97 |           "minAlignmentPeriod": "60s",
 98 |           "plotType": "LINE",
 99 |           "targetAxis": "Y1",
100 |           "timeSeriesQuery": {
101 |             "timeSeriesFilter": {
102 |               "aggregation": {
103 |                 "alignmentPeriod": "60s",
104 |                 "perSeriesAligner": "ALIGN_RATE"
105 |               },
106 |               "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
107 |               "pickTimeSeriesFilter": {
108 |                 "direction": "TOP",
109 |                 "numTimeSeries": ${OUTLIER_COUNT},
110 |                 "rankingMethod": "METHOD_MAX"
111 |               },
112 |               "secondaryAggregation": {
113 |                 "alignmentPeriod": "60s",
114 |                 "crossSeriesReducer": "REDUCE_MAX",
115 |                 "groupByFields": [
116 |                   "resource.label.\"cluster_name\"",
117 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
118 |                   "resource.label.\"pod_name\""
119 |                 ],
120 |                 "perSeriesAligner": "ALIGN_NONE"
121 |               }
122 |             }
123 |           }
124 |         }
125 |       ],
126 |       "thresholds": [],
127 |       "timeshiftDuration": "0s",
128 |       "yAxis": {
129 |         "label": "",
130 |         "scale": "LINEAR"
131 |       }
132 |     }
133 |   },
134 |   "width": 6,
135 |   "xPos": 6,
136 |   "yPos": 4
137 | },
138 | {
139 |   "height": 8,
140 |   "widget": {
141 |     "collapsibleGroup": {
142 |       "collapsed": false
143 |     },
144 |     "title": "CPU Utilization by TPU Slice"
145 |   },
146 |   "width": 12
147 | }
148 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/dcn-transfer-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "DCN Transfer Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "COLOR"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "HEATMAP",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_SUM",
 19 |                 "perSeriesAligner": "ALIGN_DELTA"
 20 |               },
 21 |               "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 49
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "DCN Transfer Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "resource.label.\"cluster_name\"",
 58 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 59 |                   "resource.label.\"pod_name\""
 60 |                 ],
 61 |                 "perSeriesAligner": "ALIGN_PERCENTILE_50"
 62 |               },
 63 |               "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 64 |               "pickTimeSeriesFilter": {
 65 |                 "direction": "TOP",
 66 |                 "numTimeSeries": ${OUTLIER_COUNT},
 67 |                 "rankingMethod": "METHOD_MAX"
 68 |               }
 69 |             }
 70 |           }
 71 |         }
 72 |       ],
 73 |       "thresholds": [],
 74 |       "timeshiftDuration": "0s",
 75 |       "yAxis": {
 76 |         "label": "",
 77 |         "scale": "LINEAR"
 78 |       }
 79 |     }
 80 |   },
 81 |   "width": 6,
 82 |   "yPos": 53
 83 | },
 84 | {
 85 |   "height": 4,
 86 |   "widget": {
 87 |     "title": "DCN Transfer Latency Outliers [p99]",
 88 |     "xyChart": {
 89 |       "chartOptions": {
 90 |         "mode": "COLOR"
 91 |       },
 92 |       "dataSets": [
 93 |         {
 94 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 95 |           "minAlignmentPeriod": "60s",
 96 |           "plotType": "LINE",
 97 |           "targetAxis": "Y1",
 98 |           "timeSeriesQuery": {
 99 |             "timeSeriesFilter": {
100 |               "aggregation": {
101 |                 "alignmentPeriod": "60s",
102 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 |                 "groupByFields": [
104 |                   "resource.label.\"cluster_name\"",
105 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 |                   "resource.label.\"pod_name\""
107 |                 ],
108 |                 "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 |               },
110 |               "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 |               "pickTimeSeriesFilter": {
112 |                 "direction": "TOP",
113 |                 "numTimeSeries": ${OUTLIER_COUNT},
114 |                 "rankingMethod": "METHOD_MAX"
115 |               }
116 |             }
117 |           }
118 |         }
119 |       ],
120 |       "thresholds": [],
121 |       "timeshiftDuration": "0s",
122 |       "yAxis": {
123 |         "label": "",
124 |         "scale": "LINEAR"
125 |       }
126 |     }
127 |   },
128 |   "width": 6,
129 |   "xPos": 6,
130 |   "yPos": 53
131 | },
132 | {
133 |   "height": 8,
134 |   "widget": {
135 |     "collapsibleGroup": {
136 |       "collapsed": false
137 |     },
138 |     "title": "DCN Transfer Latency"
139 |   },
140 |   "width": 12,
141 |   "yPos": 49
142 | }
143 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/device-to-host-transfer-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Device To Host Transfer Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "COLOR"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "HEATMAP",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_SUM",
 19 |                 "perSeriesAligner": "ALIGN_DELTA"
 20 |               },
 21 |               "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 73
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "Device To Host Transfer Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "resource.label.\"cluster_name\"",
 58 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 59 |                   "resource.label.\"pod_name\""
 60 |                 ],
 61 |                 "perSeriesAligner": "ALIGN_PERCENTILE_50"
 62 |               },
 63 |               "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 64 |               "pickTimeSeriesFilter": {
 65 |                 "direction": "TOP",
 66 |                 "numTimeSeries": ${OUTLIER_COUNT},
 67 |                 "rankingMethod": "METHOD_MAX"
 68 |               }
 69 |             }
 70 |           }
 71 |         }
 72 |       ],
 73 |       "thresholds": [],
 74 |       "timeshiftDuration": "0s",
 75 |       "yAxis": {
 76 |         "label": "",
 77 |         "scale": "LINEAR"
 78 |       }
 79 |     }
 80 |   },
 81 |   "width": 6,
 82 |   "yPos": 77
 83 | },
 84 | {
 85 |   "height": 4,
 86 |   "widget": {
 87 |     "title": "Device To Host Transfer Latency Outliers [p99]",
 88 |     "xyChart": {
 89 |       "chartOptions": {
 90 |         "mode": "COLOR"
 91 |       },
 92 |       "dataSets": [
 93 |         {
 94 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 95 |           "minAlignmentPeriod": "60s",
 96 |           "plotType": "LINE",
 97 |           "targetAxis": "Y1",
 98 |           "timeSeriesQuery": {
 99 |             "timeSeriesFilter": {
100 |               "aggregation": {
101 |                 "alignmentPeriod": "60s",
102 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 |                 "groupByFields": [
104 |                   "resource.label.\"cluster_name\"",
105 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 |                   "resource.label.\"pod_name\""
107 |                 ],
108 |                 "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 |               },
110 |               "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 |               "pickTimeSeriesFilter": {
112 |                 "direction": "TOP",
113 |                 "numTimeSeries": ${OUTLIER_COUNT},
114 |                 "rankingMethod": "METHOD_MAX"
115 |               }
116 |             }
117 |           }
118 |         }
119 |       ],
120 |       "thresholds": [],
121 |       "timeshiftDuration": "0s",
122 |       "yAxis": {
123 |         "label": "",
124 |         "scale": "LINEAR"
125 |       }
126 |     }
127 |   },
128 |   "width": 6,
129 |   "xPos": 6,
130 |   "yPos": 77
131 | },
132 | {
133 |   "height": 8,
134 |   "widget": {
135 |     "collapsibleGroup": {
136 |       "collapsed": false
137 |     },
138 |     "title": "Device To Host Transfer Latency"
139 |   },
140 |   "width": 12,
141 |   "yPos": 73
142 | }
143 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/duty-cycle.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Duty Cycle Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_MEAN"
 19 |               },
 20 |               "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 24
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "Duty Cycle Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "perSeriesAligner": "ALIGN_MEAN"
 55 |               },
 56 |               "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 57 |               "pickTimeSeriesFilter": {
 58 |                 "direction": "TOP",
 59 |                 "numTimeSeries": ${OUTLIER_COUNT},
 60 |                 "rankingMethod": "METHOD_MEAN"
 61 |               },
 62 |               "secondaryAggregation": {
 63 |                 "alignmentPeriod": "60s",
 64 |                 "crossSeriesReducer": "REDUCE_MEAN",
 65 |                 "groupByFields": [
 66 |                   "resource.label.\"cluster_name\"",
 67 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 68 |                   "resource.label.\"pod_name\""
 69 |                 ],
 70 |                 "perSeriesAligner": "ALIGN_NONE"
 71 |               }
 72 |             }
 73 |           }
 74 |         }
 75 |       ],
 76 |       "thresholds": [],
 77 |       "timeshiftDuration": "0s",
 78 |       "yAxis": {
 79 |         "label": "",
 80 |         "scale": "LINEAR"
 81 |       }
 82 |     }
 83 |   },
 84 |   "width": 6,
 85 |   "yPos": 28
 86 | },
 87 | {
 88 |   "height": 4,
 89 |   "widget": {
 90 |     "title": "Duty Cycle Outliers [MAX]",
 91 |     "xyChart": {
 92 |       "chartOptions": {
 93 |         "mode": "COLOR"
 94 |       },
 95 |       "dataSets": [
 96 |         {
 97 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 98 |           "minAlignmentPeriod": "60s",
 99 |           "plotType": "LINE",
100 |           "targetAxis": "Y1",
101 |           "timeSeriesQuery": {
102 |             "timeSeriesFilter": {
103 |               "aggregation": {
104 |                 "alignmentPeriod": "60s",
105 |                 "perSeriesAligner": "ALIGN_MEAN"
106 |               },
107 |               "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
108 |               "pickTimeSeriesFilter": {
109 |                 "direction": "TOP",
110 |                 "numTimeSeries": ${OUTLIER_COUNT},
111 |                 "rankingMethod": "METHOD_MAX"
112 |               },
113 |               "secondaryAggregation": {
114 |                 "alignmentPeriod": "60s",
115 |                 "crossSeriesReducer": "REDUCE_MAX",
116 |                 "groupByFields": [
117 |                   "resource.label.\"cluster_name\"",
118 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
119 |                   "resource.label.\"pod_name\""
120 |                 ],
121 |                 "perSeriesAligner": "ALIGN_NONE"
122 |               }
123 |             }
124 |           }
125 |         }
126 |       ],
127 |       "thresholds": [],
128 |       "timeshiftDuration": "0s",
129 |       "yAxis": {
130 |         "label": "",
131 |         "scale": "LINEAR"
132 |       }
133 |     }
134 |   },
135 |   "width": 6,
136 |   "xPos": 6,
137 |   "yPos": 28
138 | },
139 | {
140 |   "height": 8,
141 |   "widget": {
142 |     "collapsibleGroup": {
143 |       "collapsed": false
144 |     },
145 |     "title": "Duty Cycle by TPU Slice"
146 |   },
147 |   "width": 12,
148 |   "yPos": 24
149 | }
150 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/host-to-device-transfer-latency.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Host To Device Transfer Latency Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "COLOR"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "HEATMAP",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "crossSeriesReducer": "REDUCE_SUM",
 19 |                 "perSeriesAligner": "ALIGN_DELTA"
 20 |               },
 21 |               "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 22 |             }
 23 |           }
 24 |         }
 25 |       ],
 26 |       "thresholds": [],
 27 |       "timeshiftDuration": "0s",
 28 |       "yAxis": {
 29 |         "label": "",
 30 |         "scale": "LINEAR"
 31 |       }
 32 |     }
 33 |   },
 34 |   "width": 12,
 35 |   "yPos": 65
 36 | },
 37 | {
 38 |   "height": 4,
 39 |   "widget": {
 40 |     "title": "Host To Device Transfer Latency Outliers [p50]",
 41 |     "xyChart": {
 42 |       "chartOptions": {
 43 |         "mode": "COLOR"
 44 |       },
 45 |       "dataSets": [
 46 |         {
 47 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 48 |           "minAlignmentPeriod": "60s",
 49 |           "plotType": "LINE",
 50 |           "targetAxis": "Y1",
 51 |           "timeSeriesQuery": {
 52 |             "timeSeriesFilter": {
 53 |               "aggregation": {
 54 |                 "alignmentPeriod": "60s",
 55 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_50",
 56 |                 "groupByFields": [
 57 |                   "resource.label.\"cluster_name\"",
 58 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 59 |                   "resource.label.\"pod_name\""
 60 |                 ],
 61 |                 "perSeriesAligner": "ALIGN_PERCENTILE_50"
 62 |               },
 63 |               "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 64 |               "pickTimeSeriesFilter": {
 65 |                 "direction": "TOP",
 66 |                 "numTimeSeries": ${OUTLIER_COUNT},
 67 |                 "rankingMethod": "METHOD_MAX"
 68 |               }
 69 |             }
 70 |           }
 71 |         }
 72 |       ],
 73 |       "thresholds": [],
 74 |       "timeshiftDuration": "0s",
 75 |       "yAxis": {
 76 |         "label": "",
 77 |         "scale": "LINEAR"
 78 |       }
 79 |     }
 80 |   },
 81 |   "width": 6,
 82 |   "yPos": 69
 83 | },
 84 | {
 85 |   "height": 4,
 86 |   "widget": {
 87 |     "title": "Host To Device Transfer Latency Outliers [p99]",
 88 |     "xyChart": {
 89 |       "chartOptions": {
 90 |         "mode": "COLOR"
 91 |       },
 92 |       "dataSets": [
 93 |         {
 94 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 95 |           "minAlignmentPeriod": "60s",
 96 |           "plotType": "LINE",
 97 |           "targetAxis": "Y1",
 98 |           "timeSeriesQuery": {
 99 |             "timeSeriesFilter": {
100 |               "aggregation": {
101 |                 "alignmentPeriod": "60s",
102 |                 "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 |                 "groupByFields": [
104 |                   "resource.label.\"cluster_name\"",
105 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 |                   "resource.label.\"pod_name\""
107 |                 ],
108 |                 "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 |               },
110 |               "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 |               "pickTimeSeriesFilter": {
112 |                 "direction": "TOP",
113 |                 "numTimeSeries": ${OUTLIER_COUNT},
114 |                 "rankingMethod": "METHOD_MAX"
115 |               }
116 |             }
117 |           }
118 |         }
119 |       ],
120 |       "thresholds": [],
121 |       "timeshiftDuration": "0s",
122 |       "yAxis": {
123 |         "label": "",
124 |         "scale": "LINEAR"
125 |       }
126 |     }
127 |   },
128 |   "width": 6,
129 |   "xPos": 6,
130 |   "yPos": 69
131 | },
132 | {
133 |   "height": 8,
134 |   "widget": {
135 |     "collapsibleGroup": {
136 |       "collapsed": false
137 |     },
138 |     "title": "Host To Device Transfer Latency"
139 |   },
140 |   "width": 12,
141 |   "yPos": 65
142 | }
143 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "category": "CUSTOM",
 3 |   "displayName": "GKE - TPU Monitoring Dashboard",
 4 |   "dashboardFilters": [
 5 |     {
 6 |       "filterType": "RESOURCE_LABEL",
 7 |       "labelKey": "cluster_name",
 8 |       "templateVariable": "ClusterName"
 9 |     },
10 |     {
11 |       "filterType": "USER_METADATA_LABEL",
12 |       "labelKey": "jobset.sigs.k8s.io/jobset-name",
13 |       "templateVariable": "JobName"
14 |     },
15 |     {
16 |       "filterType": "RESOURCE_LABEL",
17 |       "labelKey": "pod_name",
18 |       "templateVariable": "PodName"
19 |     }
20 |   ],
21 |   "mosaicLayout": {
22 |     "columns": 12,
23 |     "tiles": [
24 |       ${TILE_1},
25 |       ${TILE_2},
26 |       ${TILE_3},
27 |       ${TILE_4},
28 |       ${TILE_5},
29 |       {
30 |         "height": 1,
31 |         "widget": {
32 |           "title": "Megascale Metrics",
33 |           "sectionHeader": {
34 |             "subtitle": "These metrics are available in GKE version 1.29.1-gke.1016000 or later.  TPU workload must use JAX version 0.4.24.",
35 |             "dividerBelow": false
36 |           }
37 |         },
38 |         "width": 12,
39 |         "yPos": 48
40 |       },
41 |       ${TILE_6},
42 |       ${TILE_7},
43 |       ${TILE_8},
44 |       ${TILE_9}
45 |     ]
46 |   }
47 |  }
48 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/memory-usage.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Memory Usage Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_SUM"
 19 |               },
 20 |               "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 8
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "Memory Usage Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "crossSeriesReducer": "REDUCE_MEAN",
 55 |                 "groupByFields": [
 56 |                   "resource.label.\"cluster_name\"",
 57 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 58 |                   "resource.label.\"pod_name\""
 59 |                 ],
 60 |                 "perSeriesAligner": "ALIGN_MEAN"
 61 |               },
 62 |               "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
 63 |               "pickTimeSeriesFilter": {
 64 |                 "direction": "TOP",
 65 |                 "numTimeSeries": ${OUTLIER_COUNT},
 66 |                 "rankingMethod": "METHOD_MEAN"
 67 |               }
 68 |             }
 69 |           }
 70 |         }
 71 |       ],
 72 |       "thresholds": [],
 73 |       "timeshiftDuration": "0s",
 74 |       "yAxis": {
 75 |         "label": "",
 76 |         "scale": "LINEAR"
 77 |       }
 78 |     }
 79 |   },
 80 |   "width": 6,
 81 |   "yPos": 12
 82 | },
 83 | {
 84 |   "height": 4,
 85 |   "widget": {
 86 |     "title": "Memory Usage Outliers [MAX]",
 87 |     "xyChart": {
 88 |       "chartOptions": {
 89 |         "mode": "COLOR"
 90 |       },
 91 |       "dataSets": [
 92 |         {
 93 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 94 |           "minAlignmentPeriod": "60s",
 95 |           "plotType": "LINE",
 96 |           "targetAxis": "Y1",
 97 |           "timeSeriesQuery": {
 98 |             "timeSeriesFilter": {
 99 |               "aggregation": {
100 |                 "alignmentPeriod": "60s",
101 |                 "crossSeriesReducer": "REDUCE_MAX",
102 |                 "groupByFields": [
103 |                   "resource.label.\"cluster_name\"",
104 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
105 |                   "resource.label.\"pod_name\""
106 |                 ],
107 |                 "perSeriesAligner": "ALIGN_MAX"
108 |               },
109 |               "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
110 |               "pickTimeSeriesFilter": {
111 |                 "direction": "TOP",
112 |                 "numTimeSeries": ${OUTLIER_COUNT},
113 |                 "rankingMethod": "METHOD_MAX"
114 |               }
115 |             }
116 |           }
117 |         }
118 |       ],
119 |       "thresholds": [],
120 |       "timeshiftDuration": "0s",
121 |       "yAxis": {
122 |         "label": "",
123 |         "scale": "LINEAR"
124 |       }
125 |     }
126 |   },
127 |   "width": 6,
128 |   "xPos": 6,
129 |   "yPos": 12
130 | },
131 | {
132 |   "height": 8,
133 |   "widget": {
134 |     "collapsibleGroup": {
135 |       "collapsed": false
136 |     },
137 |     "title": "Memory Usage by TPU Slice"
138 |   },
139 |   "width": 12,
140 |   "yPos": 8
141 | }
142 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/network-bytes.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "height": 4,
  3 |   "widget": {
  4 |     "title": "Network Bytes Sent Stats",
  5 |     "xyChart": {
  6 |       "chartOptions": {
  7 |         "mode": "STATS"
  8 |       },
  9 |       "dataSets": [
 10 |         {
 11 |           "minAlignmentPeriod": "60s",
 12 |           "plotType": "LINE",
 13 |           "targetAxis": "Y1",
 14 |           "timeSeriesQuery": {
 15 |             "timeSeriesFilter": {
 16 |               "aggregation": {
 17 |                 "alignmentPeriod": "60s",
 18 |                 "perSeriesAligner": "ALIGN_RATE"
 19 |               },
 20 |               "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}"
 21 |             }
 22 |           }
 23 |         }
 24 |       ],
 25 |       "thresholds": [],
 26 |       "timeshiftDuration": "0s",
 27 |       "yAxis": {
 28 |         "label": "",
 29 |         "scale": "LINEAR"
 30 |       }
 31 |     }
 32 |   },
 33 |   "width": 12,
 34 |   "yPos": 32
 35 | },
 36 | {
 37 |   "height": 4,
 38 |   "widget": {
 39 |     "title": "Network Bytes Sent Outliers [MEAN]",
 40 |     "xyChart": {
 41 |       "chartOptions": {
 42 |         "mode": "COLOR"
 43 |       },
 44 |       "dataSets": [
 45 |         {
 46 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 47 |           "minAlignmentPeriod": "60s",
 48 |           "plotType": "LINE",
 49 |           "targetAxis": "Y1",
 50 |           "timeSeriesQuery": {
 51 |             "timeSeriesFilter": {
 52 |               "aggregation": {
 53 |                 "alignmentPeriod": "60s",
 54 |                 "perSeriesAligner": "ALIGN_RATE"
 55 |               },
 56 |               "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
 57 |               "pickTimeSeriesFilter": {
 58 |                 "direction": "TOP",
 59 |                 "numTimeSeries": ${OUTLIER_COUNT},
 60 |                 "rankingMethod": "METHOD_MEAN"
 61 |               },
 62 |               "secondaryAggregation": {
 63 |                 "alignmentPeriod": "60s",
 64 |                 "crossSeriesReducer": "REDUCE_MEAN",
 65 |                 "groupByFields": [
 66 |                   "resource.label.\"cluster_name\"",
 67 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
 68 |                   "resource.label.\"pod_name\""
 69 |                 ],
 70 |                 "perSeriesAligner": "ALIGN_MEAN"
 71 |               }
 72 |             }
 73 |           }
 74 |         }
 75 |       ],
 76 |       "thresholds": [],
 77 |       "timeshiftDuration": "0s",
 78 |       "yAxis": {
 79 |         "label": "",
 80 |         "scale": "LINEAR"
 81 |       }
 82 |     }
 83 |   },
 84 |   "width": 6,
 85 |   "yPos": 36
 86 | },
 87 | {
 88 |   "height": 4,
 89 |   "widget": {
 90 |     "title": "Network Bytes Sent Outliers [MAX]",
 91 |     "xyChart": {
 92 |       "chartOptions": {
 93 |         "mode": "COLOR"
 94 |       },
 95 |       "dataSets": [
 96 |         {
 97 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
 98 |           "minAlignmentPeriod": "60s",
 99 |           "plotType": "LINE",
100 |           "targetAxis": "Y1",
101 |           "timeSeriesQuery": {
102 |             "timeSeriesFilter": {
103 |               "aggregation": {
104 |                 "alignmentPeriod": "60s",
105 |                 "perSeriesAligner": "ALIGN_RATE"
106 |               },
107 |               "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
108 |               "pickTimeSeriesFilter": {
109 |                 "direction": "TOP",
110 |                 "numTimeSeries": ${OUTLIER_COUNT},
111 |                 "rankingMethod": "METHOD_MAX"
112 |               },
113 |               "secondaryAggregation": {
114 |                 "alignmentPeriod": "60s",
115 |                 "crossSeriesReducer": "REDUCE_MAX",
116 |                 "groupByFields": [
117 |                   "resource.label.\"cluster_name\"",
118 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
119 |                   "resource.label.\"pod_name\""
120 |                 ],
121 |                 "perSeriesAligner": "ALIGN_MAX"
122 |               }
123 |             }
124 |           }
125 |         }
126 |       ],
127 |       "thresholds": [],
128 |       "timeshiftDuration": "0s",
129 |       "yAxis": {
130 |         "label": "",
131 |         "scale": "LINEAR"
132 |       }
133 |     }
134 |   },
135 |   "width": 6,
136 |   "xPos": 6,
137 |   "yPos": 36
138 | },
139 | {
140 |   "height": 4,
141 |   "widget": {
142 |     "title": "Network Bytes Received Stats",
143 |     "xyChart": {
144 |       "chartOptions": {
145 |         "mode": "STATS"
146 |       },
147 |       "dataSets": [
148 |         {
149 |           "minAlignmentPeriod": "60s",
150 |           "plotType": "LINE",
151 |           "targetAxis": "Y1",
152 |           "timeSeriesQuery": {
153 |             "timeSeriesFilter": {
154 |               "aggregation": {
155 |                 "alignmentPeriod": "60s",
156 |                 "perSeriesAligner": "ALIGN_RATE"
157 |               },
158 |               "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}"
159 |             }
160 |           }
161 |         }
162 |       ],
163 |       "thresholds": [],
164 |       "timeshiftDuration": "0s",
165 |       "yAxis": {
166 |         "label": "",
167 |         "scale": "LINEAR"
168 |       }
169 |     }
170 |   },
171 |   "width": 12,
172 |   "yPos": 40
173 | },
174 | {
175 |   "height": 4,
176 |   "widget": {
177 |     "title": "Network Bytes Received Outliers [MEAN]",
178 |     "xyChart": {
179 |       "chartOptions": {
180 |         "mode": "COLOR"
181 |       },
182 |       "dataSets": [
183 |         {
184 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
185 |           "minAlignmentPeriod": "60s",
186 |           "plotType": "LINE",
187 |           "targetAxis": "Y1",
188 |           "timeSeriesQuery": {
189 |             "timeSeriesFilter": {
190 |               "aggregation": {
191 |                 "alignmentPeriod": "60s",
192 |                 "perSeriesAligner": "ALIGN_RATE"
193 |               },
194 |               "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
195 |               "pickTimeSeriesFilter": {
196 |                 "direction": "TOP",
197 |                 "numTimeSeries": ${OUTLIER_COUNT},
198 |                 "rankingMethod": "METHOD_MEAN"
199 |               },
200 |               "secondaryAggregation": {
201 |                 "alignmentPeriod": "60s",
202 |                 "crossSeriesReducer": "REDUCE_MEAN",
203 |                 "groupByFields": [
204 |                   "resource.label.\"cluster_name\"",
205 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
206 |                   "resource.label.\"pod_name\""
207 |                 ],
208 |                 "perSeriesAligner": "ALIGN_MEAN"
209 |               }
210 |             }
211 |           }
212 |         }
213 |       ],
214 |       "thresholds": [],
215 |       "timeshiftDuration": "0s",
216 |       "yAxis": {
217 |         "label": "",
218 |         "scale": "LINEAR"
219 |       }
220 |     }
221 |   },
222 |   "width": 6,
223 |   "yPos": 44
224 | },
225 | {
226 |   "height": 4,
227 |   "widget": {
228 |     "title": "Network Bytes Received Outliers [MAX]",
229 |     "xyChart": {
230 |       "chartOptions": {
231 |         "mode": "COLOR"
232 |       },
233 |       "dataSets": [
234 |         {
235 |           "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
236 |           "minAlignmentPeriod": "60s",
237 |           "plotType": "LINE",
238 |           "targetAxis": "Y1",
239 |           "timeSeriesQuery": {
240 |             "timeSeriesFilter": {
241 |               "aggregation": {
242 |                 "alignmentPeriod": "60s",
243 |                 "perSeriesAligner": "ALIGN_RATE"
244 |               },
245 |               "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
246 |               "pickTimeSeriesFilter": {
247 |                 "direction": "TOP",
248 |                 "numTimeSeries": ${OUTLIER_COUNT},
249 |                 "rankingMethod": "METHOD_MAX"
250 |               },
251 |               "secondaryAggregation": {
252 |                 "alignmentPeriod": "60s",
253 |                 "crossSeriesReducer": "REDUCE_MAX",
254 |                 "groupByFields": [
255 |                   "resource.label.\"cluster_name\"",
256 |                   "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
257 |                   "resource.label.\"pod_name\""
258 |                 ],
259 |                 "perSeriesAligner": "ALIGN_MAX"
260 |               }
261 |             }
262 |           }
263 |         }
264 |       ],
265 |       "thresholds": [],
266 |       "timeshiftDuration": "0s",
267 |       "yAxis": {
268 |         "label": "",
269 |         "scale": "LINEAR"
270 |       }
271 |     }
272 |   },
273 |   "width": 6,
274 |   "xPos": 6,
275 |   "yPos": 44
276 | },
277 | {
278 |   "height": 16,
279 |   "widget": {
280 |     "collapsibleGroup": {
281 |       "collapsed": false
282 |     },
283 |     "title": "Network Bytes Sent and Received by TPU Slice"
284 |   },
285 |   "width": 12,
286 |   "yPos": 32
287 | }
288 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 
20 | variable "monitoring_dashboard_config" {
21 |   type = object({
22 |     outlier_count : optional(number)
23 |   })
24 |   description = <<EOF
25 |   Configuration for monitoring dashboard:
26 |   {
27 |     "outlier_count": "number of outliers to show on dashboard, default to 10 if not set"
28 |   }
29 |   Enter {} to set default configuration for monitoring dashboard.
30 |   EOF
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gke/dashboard/monitoring_dashboard"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/log_storage/input.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_name" {
16 |   type        = string
17 |   description = "Name of gcp project"
18 | }
19 | 
20 | // Valid inputs:
21 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":"<log_bucket_name>"}
22 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<log_bucket_name>", "retention_days":x}
23 | // 3. To not create stack trace bucket: {}
24 | variable "stack_trace_bucket_config" {
25 |   type = object({
26 |     bucket_name : optional(string)
27 |     retention_days : optional(number)
28 |   })
29 |   validation {
30 |     condition = (
31 |       (var.stack_trace_bucket_config.bucket_name == null &&
32 |       var.stack_trace_bucket_config.retention_days == null) ||
33 |       (var.stack_trace_bucket_config.bucket_name != null)
34 |     )
35 |     error_message = "bucket_name is not defined for stack_trace_bucket_config."
36 |   }
37 |   description = <<EOF
38 |   Configuration to create a log bucket to store stack traces:
39 |   {
40 |     "bucket_name": "name of log bucket to create",
41 |     "retention_days": number of days to retain stack traces, default to 30 days if not set
42 |   }
43 |   Enter {} to not create separate log bucket for stack traces.
44 |   EOF
45 | }
46 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/log_storage/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source  = "hashicorp/google"
19 |       version = ">= 4.57.0"
20 |     }
21 |   }
22 |   /*
23 |     Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 |     The bucket must exist prior to configuring the backend.
25 |     For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 |   */
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gke/log_storage"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/gcp_resources/gke/resources/log_storage/stack-trace-bucket.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | locals {
16 |   stack_trace_filter         = "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\""
17 |   stack_trace_bucket_counter = var.stack_trace_bucket_config.bucket_name == null ? 0 : 1
18 | }
19 | 
20 | resource "google_logging_project_bucket_config" "log_bucket" {
21 |   count    = local.stack_trace_bucket_counter
22 |   project  = var.project_name
23 |   location = "global"
24 |   // default retention period is 30 days
25 |   retention_days = var.stack_trace_bucket_config.retention_days == null ? 30 : var.stack_trace_bucket_config.retention_days
26 |   bucket_id      = var.stack_trace_bucket_config.bucket_name
27 | }
28 | 
29 | resource "google_logging_project_sink" "log_sink" {
30 |   count       = local.stack_trace_bucket_counter
31 |   project     = var.project_name
32 |   name        = "${var.stack_trace_bucket_config.bucket_name}_sink"
33 |   destination = "logging.googleapis.com/projects/${var.project_name}/locations/global/buckets/${google_logging_project_bucket_config.log_bucket[count.index].bucket_id}"
34 |   filter      = local.stack_trace_filter
35 | }
36 | 


--------------------------------------------------------------------------------
/pip_package/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | <!--
 2 |  Copyright 2023 Google LLC
 3 |  
 4 |  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  you may not use this file except in compliance with the License.
 6 |  You may obtain a copy of the License at
 7 |  
 8 |       https://www.apache.org/licenses/LICENSE-2.0
 9 |  
10 |  Unless required by applicable law or agreed to in writing, software
11 |  distributed under the License is distributed on an "AS IS" BASIS,
12 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  See the License for the specific language governing permissions and
14 |  limitations under the License.
15 |  -->
16 | # Changelog
17 | 
18 | <!--
19 | 
20 | Changelog follow the https://keepachangelog.com/ standard (at least the headers)
21 | 
22 | This allow to:
23 | 
24 | * auto-parsing release notes during the automated releases from github-action:
25 |   https://github.com/marketplace/actions/pypi-github-auto-release
26 | * Have clickable headers in the rendered markdown
27 | 
28 | To release a new version (e.g. from `1.0.0` -> `2.0.0`):
29 | 
30 | * Create a new `# [2.0.0] - YYYY-MM-DD` header and add the changes to be released.
31 | * At the end of the file:
32 |   * Define the new link url:
33 |   `[2.0.0]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v1.0.0...v2.0.0`
34 | 
35 | -->
36 | 
37 | ## [0.1.5] - 2023-12-08
38 | * Raise exception without waiting for the daemon thread to terminate
39 | * Remove sending user signal in `stop_debugging()` to avoid unnecessary stack traces related to `cloud-tpu-diagnostics` package
40 | 
41 | ## [0.1.4] - 2023-11-07
42 | * Gracefully exiting daemon threads
43 | * Fixed the URL for PyPI package in README
44 | 
45 | ## [0.1.3] - 2023-11-01
46 | * Fixing issue with using signals and threads together in a program
47 | 
48 | ## [0.1.2] - 2023-09-20
49 | * Improved stack trace readability and clarity by adding a message for more information
50 | 
51 | ## [0.1.1] - 2023-06-21
52 | * Bug Fixes
53 |   * Fixes dumping of stack traces on the console when exceptions like `AssertionError`, `tensorflow.python.framework.errors_impl.NotFoundError` are thrown when `collect_stack_trace=True` and `stack_trace_to_cloud=False`.
54 | * Updated README
55 | 
56 | ## [0.1.0] - 2023-06-08
57 | * Initial release of cloud-tpu-diagnostics PyPI package
58 | * FEATURE: Contains debug module to collect stack traces on faults
59 | 
60 | [0.1.5]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.4...v0.1.5
61 | [0.1.4]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.3...v0.1.4
62 | [0.1.3]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.2...v0.1.3
63 | [0.1.2]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.1...v0.1.2
64 | [0.1.1]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.0...v0.1.1
65 | [0.1.0]: https://github.com/google/cloud-tpu-monitoring-debugging/releases/tag/v0.1.0
66 | 


--------------------------------------------------------------------------------
/pip_package/README.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 |  Copyright 2023 Google LLC
  3 |  
  4 |  Licensed under the Apache License, Version 2.0 (the "License");
  5 |  you may not use this file except in compliance with the License.
  6 |  You may obtain a copy of the License at
  7 |  
  8 |       https://www.apache.org/licenses/LICENSE-2.0
  9 |  
 10 |  Unless required by applicable law or agreed to in writing, software
 11 |  distributed under the License is distributed on an "AS IS" BASIS,
 12 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  See the License for the specific language governing permissions and
 14 |  limitations under the License.
 15 |  -->
 16 | # Cloud TPU Diagnostics
 17 | 
 18 | This is a comprehensive library to monitor, debug and profile the jobs running on Cloud TPU.
 19 | To learn about Cloud TPU, refer to the [full documentation](https://cloud.google.com/tpu/docs/intro-to-tpu).
 20 | 
 21 | ## Features
 22 | ### 1. Debugging
 23 | #### 1.1 Collect Stack Traces
 24 | This module will dump the python traces when a fault such as Segmentation fault, Floating-point exception, Illegal operation exception occurs in the program. Additionally, it will also periodically collect stack traces to help debug when a program running on Cloud TPU is stuck or hung somewhere.
 25 | 
 26 | ## Installation
 27 | To install the package, run the following command on TPU VM:
 28 | 
 29 | ```
 30 | pip install cloud-tpu-diagnostics
 31 | ```
 32 | 
 33 | ## Usage
 34 | To use this package, first import the module:
 35 | 
 36 | ```
 37 | from cloud_tpu_diagnostics import diagnostic
 38 | from cloud_tpu_diagnostics.configuration import debug_configuration
 39 | from cloud_tpu_diagnostics.configuration import diagnostic_configuration
 40 | from cloud_tpu_diagnostics.configuration import stack_trace_configuration
 41 | ```
 42 | 
 43 | Then, create configuration object for stack traces. The module will only collect stack traces when `collect_stack_trace` parameter is set to `True`. There are following scenarios supported currently:
 44 | 
 45 | ##### Scenario 1: Do not collect stack traces on faults
 46 | 
 47 | ```
 48 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
 49 |                       collect_stack_trace=False)
 50 | ```
 51 | This configuration will prevent you from collecting stack traces in the event of a fault or process hang.
 52 | 
 53 | ##### Scenario 2: Collect stack traces on faults and display on console
 54 | 
 55 | ```
 56 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
 57 |                       collect_stack_trace=True,
 58 |                       stack_trace_to_cloud=False)
 59 | ```
 60 | If there is a fault or process hang, this configuration will show the stack traces on the console (stderr).
 61 | 
 62 | ##### Scenario 3: Collect stack traces on faults and upload on cloud
 63 | 
 64 | ```
 65 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
 66 |                       collect_stack_trace=True,
 67 |                       stack_trace_to_cloud=True)
 68 | ```
 69 | This configuration will temporary collect stack traces inside `/tmp/debugging` directory on TPU host if there is a fault or process hang. Additionally, the traces collected in TPU host memory will be uploaded to Google Cloud Logging, which will make it easier to troubleshoot and fix the problems. You can view the traces in [Logs Explorer](https://cloud.google.com/logging/docs/view/logs-explorer-interface) using the following query:
 70 | 
 71 | ```
 72 | logName="projects/<project_name>/logs/tpu.googleapis.com%2Fruntime_monitor"
 73 | jsonPayload.verb="stacktraceanalyzer"
 74 | ```
 75 | 
 76 | By default, stack traces will be collected every 10 minutes. In order to change the duration between two stack trace collection events, add the following configuration:
 77 | 
 78 | ```
 79 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
 80 |                       collect_stack_trace=True,
 81 |                       stack_trace_to_cloud=True,
 82 |                       stack_trace_interval_seconds=300)
 83 | ```
 84 | This configuration will collect the stack traces on cloud after every 5 minutes.
 85 | 
 86 | Then, create configuration object for debug.
 87 | 
 88 | ```
 89 | debug_config = debug_configuration.DebugConfig(
 90 |                 stack_trace_config=stack_trace_config)
 91 | ```
 92 | 
 93 | Then, create configuration object for diagnostic.
 94 | 
 95 | ```
 96 | diagnostic_config = diagnostic_configuration.DiagnosticConfig(
 97 |                       debug_config=debug_config)
 98 | ```
 99 | 
100 | Finally, call the `diagnose()` method using `with` and wrap the statements inside the context manager for which you want to collect the stack traces.
101 | 
102 | ```
103 | with diagnostic.diagnose(diagnostic_config):
104 |     run_job(...)
105 | ```


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from cloud_tpu_diagnostics import configuration
16 | from cloud_tpu_diagnostics import diagnostic
17 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/configuration.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from cloud_tpu_diagnostics.src.config import debug_configuration
16 | from cloud_tpu_diagnostics.src.config import diagnostic_configuration
17 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
18 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/diagnostic.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from cloud_tpu_diagnostics.src.diagnose import diagnose
16 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/config/debug_configuration.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import dataclasses
16 | from typing import Optional
17 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
18 | 
19 | 
20 | @dataclasses.dataclass
21 | class DebugConfig:
22 |   """Configuration for debugging.
23 | 
24 |   Attributes:
25 |     stack_trace_config: config object for stack trace collection, default is
26 |       None
27 |   """
28 | 
29 |   stack_trace_config: Optional[stack_trace_configuration.StackTraceConfig] = (
30 |       None
31 |   )
32 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/config/diagnostic_configuration.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import dataclasses
16 | from typing import Optional
17 | from cloud_tpu_diagnostics.src.config import debug_configuration
18 | 
19 | 
20 | @dataclasses.dataclass
21 | class DiagnosticConfig:
22 |   """Configuration for diagnostic.
23 | 
24 |   Attributes:
25 |     debug_config: config object for debugging, default is None
26 |   """
27 | 
28 |   debug_config: Optional[debug_configuration.DebugConfig] = None
29 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/config/stack_trace_configuration.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import dataclasses
16 | from cloud_tpu_diagnostics.src.util import default
17 | 
18 | 
19 | @dataclasses.dataclass
20 | class StackTraceConfig:
21 |   """Configuration for stack trace collection.
22 | 
23 |   Attributes:
24 |     collect_stack_trace: enable/disable collection of stack trace in case fault
25 |       occurs in the program. Default is False, which means stack trace will not
26 |       be collected unless collect_stack_trace is set to True.
27 |     stack_trace_to_cloud: enable/disable upload of stack trace to cloud. Default
28 |       is False, which means stack trace will be displayed on the termial unless
29 |       stack_trace_to_cloud is set to True.
30 |     stack_trace_interval_seconds: time interval in seconds between collection of
31 |       stack trace event. Default is 600, that is 10 minutes.
32 |   """
33 | 
34 |   collect_stack_trace: bool = default.COLLECT_STACK_TRACE_DEFAULT
35 |   stack_trace_to_cloud: bool = default.STACK_TRACE_TO_CLOUD_DEFAULT
36 |   stack_trace_interval_seconds: int = default.STACK_TRACE_INTERVAL_SECONDS_DEFAULT
37 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/debug.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import logging
16 | import signal
17 | import threading
18 | import time
19 | 
20 | from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping
21 | from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping
22 | 
23 | # flag to signal daemon thread to exit gracefully
24 | _exit_flag = threading.Event()
25 | _exit_flag.clear()
26 | _daemon_thread = None
27 | logger = logging.getLogger(__name__)
28 | 
29 | 
30 | def start_debugging(debug_config):
31 |   """Context manager to debug and identify errors."""
32 |   global _daemon_thread
33 |   _exit_flag.clear()
34 |   if (
35 |       debug_config.stack_trace_config is not None
36 |       and debug_config.stack_trace_config.collect_stack_trace
37 |   ):
38 |     _daemon_thread = threading.Thread(
39 |         target=send_user_signal,
40 |         daemon=True,
41 |         args=(debug_config.stack_trace_config.stack_trace_interval_seconds,),
42 |     )
43 |     _daemon_thread.start()  # start a daemon thread
44 |     enable_stack_trace_dumping(debug_config.stack_trace_config)
45 | 
46 | 
47 | def stop_debugging(debug_config):
48 |   """Context manager to debug and identify errors."""
49 |   if (
50 |       debug_config.stack_trace_config is not None
51 |       and debug_config.stack_trace_config.collect_stack_trace
52 |   ):
53 |     _exit_flag.set()
54 |     # wait for daemon thread to complete
55 |     if _daemon_thread is not None:
56 |       logger.info(
57 |           "Waiting for completion of stack trace collection daemon thread."
58 |       )
59 |       _daemon_thread.join()
60 |       logger.info("Stack trace collection daemon thread completed.")
61 |     disable_stack_trace_dumping(debug_config.stack_trace_config)
62 |   _exit_flag.clear()
63 | 
64 | 
65 | def send_user_signal(stack_trace_interval_seconds):
66 |   """Send SIGUSR1 signal to main thread after every stack_trace_interval_seconds seconds."""
67 |   while not _exit_flag.is_set():
68 |     time.sleep(stack_trace_interval_seconds)
69 |     if not _exit_flag.is_set():
70 |       signal.pthread_kill(threading.main_thread().ident, signal.SIGUSR1)
71 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/diagnose.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import contextlib
16 | 
17 | from cloud_tpu_diagnostics.src.debug import start_debugging
18 | from cloud_tpu_diagnostics.src.debug import stop_debugging
19 | 
20 | 
21 | @contextlib.contextmanager
22 | def diagnose(config):
23 |   """Context manager to debug and identify errors."""
24 |   if config is not None and config.debug_config is not None:
25 |     start_debugging(config.debug_config)
26 |   try:
27 |     yield
28 |     if config is not None and config.debug_config is not None:
29 |       stop_debugging(config.debug_config)
30 |   except Exception as e:
31 |     raise e
32 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/stack_trace.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2023 Google LLC
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | # 
  7 | #      https://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import faulthandler
 16 | import logging
 17 | import os
 18 | import signal
 19 | import sys
 20 | import time
 21 | 
 22 | from cloud_tpu_diagnostics.src.util import default
 23 | 
 24 | _stack_trace_file_obj = None
 25 | logger = logging.getLogger(__name__)
 26 | 
 27 | 
 28 | def user_signal_handler_wrapper(file_descriptor, interval):
 29 |   def user_signal_handler(unused_signum, unused_frame):
 30 |     message = (
 31 |         "INFO: Not a crash. cloud-tpu-diagnostics emits a"
 32 |         f" stack trace snapshot every {interval} seconds.\n"
 33 |     )
 34 |     if file_descriptor is not sys.stderr:
 35 |       message = message.encode()
 36 |     file_descriptor.write(message)
 37 |     faulthandler.dump_traceback(file_descriptor, all_threads=False)
 38 | 
 39 |   return user_signal_handler
 40 | 
 41 | 
 42 | def enable_stack_trace_dumping(stack_trace_config):
 43 |   """Enables stack trace dumping.
 44 | 
 45 |   Enables faulthandler and register SIGSEGV, SIGFPE, SIGABRT,
 46 |   SIGBUS, SIGILL and SIGUSR1 to collect stack trace.
 47 | 
 48 |   Args:
 49 |     stack_trace_config: configuration object for stack trace collection
 50 |   """
 51 |   try:
 52 |     global _stack_trace_file_obj
 53 |     if stack_trace_config.stack_trace_to_cloud:
 54 |       stack_trace_file = _get_stack_trace_file()
 55 |       _stack_trace_file_obj = open(stack_trace_file, "wb")
 56 |       logger.info("Stack trace will be written in: %s", stack_trace_file)
 57 |     else:
 58 |       _stack_trace_file_obj = sys.stderr
 59 |       logger.info("Stack trace will be written to the console.")
 60 | 
 61 |     # Enables faulthandler for SIGSEGV, SIGFPE, SIGABRT, SIGBUS and SIGILL
 62 |     faulthandler.enable(file=_stack_trace_file_obj, all_threads=False)
 63 | 
 64 |     # Register SIGUSR1 signal to faulthandler
 65 |     faulthandler.register(
 66 |         signal.SIGUSR1, all_threads=False, file=_stack_trace_file_obj
 67 |     )
 68 | 
 69 |     # Register handler for SIGUSR1 to dump traces
 70 |     signal.signal(
 71 |         signal.SIGUSR1,
 72 |         user_signal_handler_wrapper(
 73 |             _stack_trace_file_obj,
 74 |             stack_trace_config.stack_trace_interval_seconds,
 75 |         ),
 76 |     )
 77 |   except Exception as e:  # pylint: disable=broad-exception-caught
 78 |     logger.error("Error in enabling dumping of stack trace.", e)
 79 | 
 80 | 
 81 | def disable_stack_trace_dumping(stack_trace_config):
 82 |   """Disable faulthandler and unregister user signals.
 83 | 
 84 |   Args:
 85 |     stack_trace_config: configuration object for stack trace collection
 86 |   """
 87 |   try:
 88 |     global _stack_trace_file_obj
 89 |     if (
 90 |         stack_trace_config.stack_trace_to_cloud
 91 |         and _stack_trace_file_obj is not None
 92 |     ):
 93 |       _stack_trace_file_obj.close()
 94 |       _stack_trace_file_obj = None
 95 | 
 96 |     faulthandler.unregister(signal.SIGUSR1)
 97 |     faulthandler.disable()
 98 |   except Exception as e:  # pylint: disable=broad-exception-caught
 99 |     logger.error("Error in disabling dumping of stack trace.", e)
100 | 
101 | 
102 | def _get_stack_trace_file():
103 |   """Prefix stack trace file.
104 | 
105 |   Create a file with prefix as stack_trace_ and current local time in
106 |   '%Y_%m_%d_%H_%M_%S' format inside default.STACK_TRACE_DIR_DEFAULT.
107 | 
108 |   Returns:
109 |     path of stack trace file
110 |   """
111 |   root_trace_folder = os.path.abspath(default.STACK_TRACE_DIR_DEFAULT)
112 |   if not os.path.exists(root_trace_folder):
113 |     os.makedirs(root_trace_folder)
114 | 
115 |   current_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
116 |   trace_file_name = "stack_trace_" + current_time + ".txt"
117 |   stack_trace_file = os.path.join(root_trace_folder, trace_file_name)
118 |   return stack_trace_file
119 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/util/default.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Stack trace default values
16 | COLLECT_STACK_TRACE_DEFAULT = False
17 | STACK_TRACE_TO_CLOUD_DEFAULT = False
18 | STACK_TRACE_DIR_DEFAULT = '/tmp/debugging/'
19 | STACK_TRACE_INTERVAL_SECONDS_DEFAULT = 600  # 10 minutes
20 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/util/stack_trace_test_util.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Script to raise different signals to test dumping of stack trace."""
16 | 
17 | import argparse
18 | import signal
19 | 
20 | from cloud_tpu_diagnostics import diagnostic
21 | from cloud_tpu_diagnostics.configuration import debug_configuration
22 | from cloud_tpu_diagnostics.configuration import diagnostic_configuration
23 | from cloud_tpu_diagnostics.configuration import stack_trace_configuration
24 | 
25 | 
26 | if __name__ == '__main__':
27 |   parser = argparse.ArgumentParser()
28 |   parser.add_argument('--signal', help='name of signal to raise')
29 |   parser.add_argument(
30 |       '--collect_stack_trace',
31 |       type=lambda x: (x.lower() == 'true'),
32 |       help='whether to collect stack trace or not',
33 |   )
34 |   parser.add_argument(
35 |       '--log_to_cloud',
36 |       type=lambda x: (x.lower() == 'true'),
37 |       help='whether to log to cloud or console',
38 |   )
39 |   args = parser.parse_args()
40 |   debug_config = debug_configuration.DebugConfig(
41 |       stack_trace_config=stack_trace_configuration.StackTraceConfig(
42 |           collect_stack_trace=args.collect_stack_trace,
43 |           stack_trace_to_cloud=args.log_to_cloud,
44 |           stack_trace_interval_seconds=1,
45 |       ),
46 |   )
47 |   diagnostic_config = diagnostic_configuration.DiagnosticConfig(
48 |       debug_config=debug_config
49 |   )
50 |   with diagnostic.diagnose(diagnostic_config):
51 |     if args.signal == 'SIGSEGV':
52 |       signal.raise_signal(signal.SIGSEGV)
53 | 
54 |     if args.signal == 'SIGABRT':
55 |       signal.raise_signal(signal.SIGABRT)
56 | 
57 |     if args.signal == 'SIGFPE':
58 |       signal.raise_signal(signal.SIGFPE)
59 | 
60 |     if args.signal == 'SIGILL':
61 |       signal.raise_signal(signal.SIGILL)
62 | 
63 |     if args.signal == 'SIGBUS':
64 |       signal.raise_signal(signal.SIGBUS)
65 | 
66 |     if args.signal == 'SIGUSR1':
67 |       signal.raise_signal(signal.SIGUSR1)
68 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/tests/debug_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2023 Google LLC
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | # 
  7 | #      https://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import signal
 16 | import threading
 17 | from unittest import mock
 18 | from absl.testing import absltest
 19 | from cloud_tpu_diagnostics.src.config import debug_configuration
 20 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
 21 | from cloud_tpu_diagnostics.src.debug import send_user_signal
 22 | from cloud_tpu_diagnostics.src.debug import start_debugging
 23 | from cloud_tpu_diagnostics.src.debug import stop_debugging
 24 | 
 25 | 
 26 | class DebugTest(absltest.TestCase):
 27 | 
 28 |   def testDaemonThreadRunningWhenCollectStackTraceTrue(self):
 29 |     debug_config = debug_configuration.DebugConfig(
 30 |         stack_trace_config=stack_trace_configuration.StackTraceConfig(
 31 |             collect_stack_trace=True,
 32 |             stack_trace_to_cloud=True,
 33 |             stack_trace_interval_seconds=1,
 34 |         ),
 35 |     )
 36 |     start_debugging(debug_config)
 37 |     self.assertEqual(threading.active_count(), 2)
 38 |     daemon_thread_list = list(
 39 |         filter(lambda thread: thread.daemon is True, threading.enumerate())
 40 |     )
 41 |     self.assertLen(daemon_thread_list, 1)
 42 |     stop_debugging(debug_config)
 43 |     self.assertEqual(threading.active_count(), 1)
 44 |     daemon_thread_list = list(
 45 |         filter(lambda thread: thread.daemon is True, threading.enumerate())
 46 |     )
 47 |     self.assertLen(daemon_thread_list, 0)
 48 | 
 49 |   def testDaemonThreadNotRunningWhenCollectStackTraceFalse(self):
 50 |     debug_config = debug_configuration.DebugConfig(
 51 |         stack_trace_config=stack_trace_configuration.StackTraceConfig(
 52 |             collect_stack_trace=False,
 53 |             stack_trace_to_cloud=True,
 54 |             stack_trace_interval_seconds=1,
 55 |         ),
 56 |     )
 57 |     start_debugging(debug_config)
 58 |     self.assertEqual(threading.active_count(), 1)
 59 |     daemon_thread_list = list(
 60 |         filter(lambda thread: thread.daemon is True, threading.enumerate())
 61 |     )
 62 |     self.assertLen(daemon_thread_list, 0)
 63 |     stop_debugging(debug_config)
 64 |     self.assertEqual(threading.active_count(), 1)
 65 |     daemon_thread_list = list(
 66 |         filter(lambda thread: thread.daemon is True, threading.enumerate())
 67 |     )
 68 |     self.assertLen(daemon_thread_list, 0)
 69 | 
 70 |   @mock.patch(
 71 |       'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.debug.disable_stack_trace_dumping'
 72 |   )
 73 |   def testStopDebuggingDisableStackTraceDumpingCalled(
 74 |       self, disable_stack_trace_dumping_mock
 75 |   ):
 76 |     debug_config = debug_configuration.DebugConfig(
 77 |         stack_trace_config=stack_trace_configuration.StackTraceConfig(
 78 |             collect_stack_trace=True,
 79 |             stack_trace_to_cloud=True,
 80 |             stack_trace_interval_seconds=1,
 81 |         ),
 82 |     )
 83 |     stop_debugging(debug_config)
 84 |     disable_stack_trace_dumping_mock.assert_called_once()
 85 |     self.assertEqual(threading.active_count(), 1)
 86 |     daemon_thread_list = list(
 87 |         filter(lambda thread: thread.daemon is True, threading.enumerate())
 88 |     )
 89 |     self.assertLen(daemon_thread_list, 0)
 90 | 
 91 |   def testSendUserSignalSIGUSR1SignalReceived(self):
 92 |     signal.signal(signal.SIGUSR1, user_signal_handler)
 93 |     stack_trace_interval_seconds = 1
 94 |     with self.assertRaises(Exception) as e:
 95 |       send_user_signal(stack_trace_interval_seconds)
 96 |     self.assertEqual(str(e.exception), 'SIGSUR1 signal received.')
 97 | 
 98 | 
 99 | def user_signal_handler(signum, _):
100 |   raise Exception('SIGSUR1 signal received.')  # pylint: disable=broad-exception-caught
101 | 
102 | 
103 | if __name__ == '__main__':
104 |   absltest.main()
105 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/tests/diagnose_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from unittest import mock
16 | from absl.testing import absltest
17 | from cloud_tpu_diagnostics.src.config import debug_configuration
18 | from cloud_tpu_diagnostics.src.config import diagnostic_configuration
19 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
20 | from cloud_tpu_diagnostics.src.diagnose import diagnose
21 | 
22 | 
23 | class DiagnoseTest(absltest.TestCase):
24 | 
25 |   @mock.patch(
26 |       'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.diagnose.start_debugging'
27 |   )
28 |   @mock.patch(
29 |       'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.diagnose.stop_debugging'
30 |   )
31 |   def testDiagnoseContextManager(
32 |       self, stop_debugging_mock, start_debugging_mock
33 |   ):
34 |     debug_config = debug_configuration.DebugConfig(
35 |         stack_trace_config=stack_trace_configuration.StackTraceConfig(
36 |             collect_stack_trace=True,
37 |             stack_trace_to_cloud=True,
38 |         ),
39 |     )
40 |     diagnostic_config = diagnostic_configuration.DiagnosticConfig(
41 |         debug_config=debug_config,
42 |     )
43 |     with diagnose(diagnostic_config):
44 |       pass
45 |     start_debugging_mock.assert_called_once_with(debug_config)
46 |     stop_debugging_mock.assert_called_once_with(debug_config)
47 | 
48 | 
49 | if __name__ == '__main__':
50 |   absltest.main()
51 | 


--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/tests/stack_trace_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2023 Google LLC
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | # 
  7 | #      https://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import faulthandler
 16 | import os
 17 | import shutil
 18 | import signal
 19 | import subprocess
 20 | import sys
 21 | import tempfile
 22 | import textwrap
 23 | import unittest
 24 | from absl.testing import absltest
 25 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
 26 | from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping
 27 | from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping
 28 | from cloud_tpu_diagnostics.src.stack_trace import user_signal_handler_wrapper
 29 | from cloud_tpu_diagnostics.src.util import default
 30 | 
 31 | class StackTraceTest(absltest.TestCase):
 32 | 
 33 |   def setUp(self):
 34 |     super().setUp()
 35 |     package_dir = '/'.join(os.path.dirname(__file__).split('/')[:-1])
 36 |     # Used to run test with blaze/bazel
 37 |     self.test_binary = os.path.join(package_dir, 'stack_trace_test_util')
 38 |     # Used to run test with unittest `python3 -m unittest stack_trace_test.py`
 39 |     self.test_file = os.path.join(
 40 |         package_dir, 'src/util/stack_trace_test_util.py'
 41 |     )
 42 |     self.stack_trace_module = os.path.join(package_dir, 'src/stack_trace.py')
 43 | 
 44 |   def tearDown(self):
 45 |     super().tearDown()
 46 |     if os.path.exists(default.STACK_TRACE_DIR_DEFAULT):
 47 |       shutil.rmtree(default.STACK_TRACE_DIR_DEFAULT)
 48 | 
 49 |   @unittest.skipIf(not hasattr(signal, 'SIGSEGV'), 'Missing signal.SIGSEGV')
 50 |   def testSigsegvCollectStackTraceTrueTraceCollectedOnCloud(self):
 51 |     error = 'Fatal Python error: Segmentation fault'
 52 |     self.check_fatal_error(52, error, 'SIGSEGV', True)
 53 | 
 54 |   @unittest.skipIf(not hasattr(signal, 'SIGABRT'), 'Missing signal.SIGABRT')
 55 |   def testSigabrtCollectStackTraceTrueTraceCollectedOnCloud(self):
 56 |     error = 'Fatal Python error: Aborted'
 57 |     self.check_fatal_error(55, error, 'SIGABRT', True)
 58 | 
 59 |   @unittest.skipIf(not hasattr(signal, 'SIGFPE'), 'Missing signal.SIGFPE')
 60 |   def testSigfpeCollectStackTraceTrueTraceCollectedOnCloud(self):
 61 |     error = 'Fatal Python error: Floating point exception'
 62 |     try:
 63 |       self.check_fatal_error(58, error, 'SIGFPE', True)
 64 |     except AssertionError:
 65 |       # error message is different for Python 3.12
 66 |       error = 'Fatal Python error: Floating-point exception'
 67 |       self.check_fatal_error(58, error, 'SIGFPE', True)
 68 | 
 69 |   @unittest.skipIf(not hasattr(signal, 'SIGILL'), 'Missing signal.SIGILL')
 70 |   def testSigillCollectStackTraceTrueTraceCollectedOnCloud(self):
 71 |     error = 'Fatal Python error: Illegal instruction'
 72 |     self.check_fatal_error(61, error, 'SIGILL', True)
 73 | 
 74 |   @unittest.skipIf(not hasattr(signal, 'SIGBUS'), 'Missing signal.SIGBUS')
 75 |   def testSigbusCollectStackTraceTrueTraceCollectedOnCloud(self):
 76 |     error = 'Fatal Python error: Bus error'
 77 |     self.check_fatal_error(64, error, 'SIGBUS', True)
 78 | 
 79 |   @unittest.skipIf(not hasattr(signal, 'SIGUSR1'), 'Missing signal.SIGUSR1')
 80 |   def testSigusrCollectStackTraceTrueTraceCollectedOnCloud(self):
 81 |     self.check_fatal_error(67, '', 'SIGUSR1', True)
 82 | 
 83 |   def testCollectStackTraceFalseNoTraceDirCreated(self):
 84 |     process = self.run_python_code('', False, True)
 85 |     _, stderr = process.communicate()
 86 |     self.assertFalse(os.path.exists(default.STACK_TRACE_DIR_DEFAULT))
 87 |     self.assertEmpty(stderr)
 88 | 
 89 |   @unittest.skipIf(not hasattr(signal, 'SIGUSR1'), 'Missing signal.SIGUSR1')
 90 |   def testCollectStackTraceToConsole(self):
 91 |     self.check_fatal_error(67, '', 'SIGUSR1', False)
 92 | 
 93 |   def testCollectStackTraceFalseNoTraceCollectedOnConsole(self):
 94 |     process = self.run_python_code('', False, False)
 95 |     _, stderr = process.communicate()
 96 |     self.assertEmpty(stderr)
 97 | 
 98 |   def testEnableStackTraceDumpingFaulthandlerEnabled(self):
 99 |     stack_trace_config = stack_trace_configuration.StackTraceConfig(
100 |         collect_stack_trace=True, stack_trace_to_cloud=True
101 |     )
102 |     with self.assertLogs(level='INFO') as log:
103 |       enable_stack_trace_dumping(stack_trace_config)
104 |     self.assertEqual(faulthandler.is_enabled(), True)
105 |     self.assertRegex(
106 |         log.output[0], 'Stack trace will be written in: /tmp/debugging/'
107 |     )
108 | 
109 |   def testDisableStackTraceDumpingFaulthandlerDisabled(self):
110 |     stack_trace_config = stack_trace_configuration.StackTraceConfig(
111 |         collect_stack_trace=True, stack_trace_to_cloud=True
112 |     )
113 |     enable_stack_trace_dumping(stack_trace_config)
114 |     disable_stack_trace_dumping(stack_trace_config)
115 |     self.assertEqual(faulthandler.is_enabled(), False)
116 | 
117 |   def testUserSignalHandlerForStderr(self):
118 |     file_obj = tempfile.NamedTemporaryFile('r+')
119 |     sys.stderr = file_obj
120 |     user_signal_handler = user_signal_handler_wrapper(sys.stderr, 30)
121 |     user_signal_handler(signal.SIGUSR1, None)
122 |     with open(file_obj.name, 'rb') as f:
123 |       data = f.readlines()
124 |       self.assertEqual(
125 |           data[0],
126 |           b'INFO: Not a crash. cloud-tpu-diagnostics emits a stack trace'
127 |           b' snapshot every 30 seconds.\n',
128 |       )
129 | 
130 |   def testUserSignalHandlerForFile(self):
131 |     file_obj = tempfile.NamedTemporaryFile('rb+')
132 |     user_signal_handler = user_signal_handler_wrapper(file_obj, 30)
133 |     user_signal_handler(signal.SIGUSR1, None)
134 |     with open(file_obj.name, 'rb') as f:
135 |       data = f.readlines()
136 |       self.assertEqual(
137 |           data[0],
138 |           b'INFO: Not a crash. cloud-tpu-diagnostics emits a stack trace'
139 |           b' snapshot every 30 seconds.\n',
140 |       )
141 | 
142 |   def check_fatal_error(self, line_number, error, signal_name, log_to_cloud):
143 |     if error:
144 |       header = r'Stack \(most recent call first\)'
145 |       regex = """
146 |           {error}
147 |           
148 |           {header}:
149 |             File "{filename}", line {line_number} in <module>
150 |           """
151 |     else:
152 |       header = (
153 |           r'INFO: Not a crash. cloud\-tpu\-diagnostics emits a stack trace'
154 |           r' snapshot every 1 seconds.\n'
155 |           r'Stack \(most recent call first\)'
156 |       )
157 |       regex = """
158 |           {header}:
159 |             File "{stack_trace_module}", line 23 in user_signal_handler
160 |             File "{filename}", line {line_number} in <module>
161 |           """
162 |     regex = (
163 |         textwrap.dedent(regex)
164 |         .format(
165 |             error=error,
166 |             header=header,
167 |             filename=self.test_file,
168 |             stack_trace_module=self.stack_trace_module,
169 |             line_number=line_number,
170 |         )
171 |         .strip()
172 |     )
173 | 
174 |     output, stderr = self.get_output(signal_name, True, log_to_cloud)
175 |     if log_to_cloud:
176 |       self.assertRegex(output, regex)
177 |       self.assertEmpty(stderr)
178 |     else:
179 |       self.assertRegex(stderr, regex)
180 |       self.assertEmpty(output)
181 | 
182 |   def get_output(self, signal_name, collect_stack_trace, log_to_cloud):
183 |     process = self.run_python_code(
184 |         signal_name, collect_stack_trace, log_to_cloud
185 |     )
186 |     _, stderr = process.communicate()
187 |     stderr = stderr.decode('ascii', 'backslashreplace')
188 |     output = ''
189 |     if log_to_cloud:
190 |       trace_file = os.listdir(default.STACK_TRACE_DIR_DEFAULT)
191 |       if trace_file:
192 |         stack_trace_file = default.STACK_TRACE_DIR_DEFAULT + trace_file[0]
193 |         with open(stack_trace_file, 'rb') as fp:
194 |           output = fp.read().decode('ascii', 'backslashreplace')
195 |     return output, stderr
196 | 
197 |   def run_python_code(self, signal_name, collect_stack_trace, log_to_cloud):
198 |     args = [
199 |         '--signal=' + signal_name,
200 |         '--collect_stack_trace=' + str(collect_stack_trace),
201 |         '--log_to_cloud=' + str(log_to_cloud),
202 |     ]
203 |     if sys.executable is not None:
204 |       code = [sys.executable, self.test_file]
205 |     else:
206 |       code = [self.test_binary]
207 |     return subprocess.Popen(
208 |         code + args,
209 |         stdout=subprocess.PIPE,
210 |         stderr=subprocess.PIPE,
211 |         env=os.environ.copy(),
212 |     )
213 | 
214 | 
215 | if __name__ == '__main__':
216 |   absltest.main()
217 | 


--------------------------------------------------------------------------------
/pip_package/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | # 
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | [project]
16 | name = "cloud-tpu-diagnostics"
17 | version = "0.1.5"
18 | authors = [
19 |   { name="Cloud TPU Team", email="cloud-tpu-eng@google.com" },
20 | ]
21 | description = "Monitor, debug and profile the jobs running on Cloud TPU."
22 | readme = "README.md"
23 | requires-python = ">=3.8"
24 | license = {text = "Apache-2.0"}
25 | classifiers = [
26 |     "Programming Language :: Python :: 3.8",
27 |     "Programming Language :: Python :: 3.9",
28 |     "Programming Language :: Python :: 3.10",
29 |     "Programming Language :: Python :: 3.11",
30 | ]
31 | keywords = []
32 | 
33 | # pip dependencies installed with `pip install -e .`
34 | dependencies = []
35 | 
36 | [project.urls]
37 | "Homepage" = "https://github.com/google/cloud-tpu-monitoring-debugging"
38 | "Bug Tracker" = "https://github.com/google/cloud-tpu-monitoring-debugging/issues"
39 | 
40 | [build-system]
41 | # Build system specify which backend is used to build/install the project
42 | requires = ["flit_core >=3.8,<4"]
43 | build-backend = "flit_core.buildapi"
44 | 
45 | [tool.flit.sdist]
46 | # Flit specific options (files to exclude from the PyPI package)
47 | exclude = [
48 |   # Do not release tests files on PyPI
49 |   "tests/*_test.py",
50 | ]
51 | 


--------------------------------------------------------------------------------