├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── gcp_resources
│   ├── gce
│   │   ├── input.tf
│   │   ├── main.tf
│   │   └── resources
│   │       ├── dashboard
│   │       │   ├── logging_dashboard
│   │       │   │   ├── dashboard.tf
│   │       │   │   ├── dashboard_json
│   │       │   │   │   ├── main.json
│   │       │   │   │   ├── stack-trace-counter-metric.json
│   │       │   │   │   └── stack-trace-log-panel.json
│   │       │   │   ├── input.tf
│   │       │   │   ├── log_metrics
│   │       │   │   │   ├── input.tf
│   │       │   │   │   └── stack_trace_counter.tf
│   │       │   │   └── main.tf
│   │       │   └── monitoring_dashboard
│   │       │       ├── dashboard.tf
│   │       │       ├── dashboard_json
│   │       │       │   ├── cpu-utilization.json
│   │       │       │   ├── dcn-transfer-latency.json
│   │       │       │   ├── device-to-host-transfer-latency.json
│   │       │       │   ├── host-to-device-transfer-latency.json
│   │       │       │   ├── main.json
│   │       │       │   ├── memory-usage.json
│   │       │       │   ├── network-bytes.json
│   │       │       │   └── tensorcore-idle-duration.json
│   │       │       ├── input.tf
│   │       │       └── main.tf
│   │       └── log_storage
│   │           ├── input.tf
│   │           ├── main.tf
│   │           └── stack-trace-bucket.tf
│   └── gke
│       ├── input.tf
│       ├── main.tf
│       └── resources
│           ├── dashboard
│           │   ├── logging_dashboard
│           │   │   ├── dashboard.tf
│           │   │   ├── dashboard_json
│           │   │   │   ├── main.json
│           │   │   │   ├── stack-trace-counter-metric.json
│           │   │   │   └── stack-trace-log-panel.json
│           │   │   ├── input.tf
│           │   │   ├── log_metrics
│           │   │   │   ├── input.tf
│           │   │   │   └── stack_trace_counter.tf
│           │   │   └── main.tf
│           │   └── monitoring_dashboard
│           │       ├── dashboard.tf
│           │       ├── dashboard_json
│           │       │   ├── accelerator-memory-used.json
│           │       │   ├── collectives-latency.json
│           │       │   ├── cpu-utilization.json
│           │       │   ├── dcn-transfer-latency.json
│           │       │   ├── device-to-host-transfer-latency.json
│           │       │   ├── duty-cycle.json
│           │       │   ├── host-to-device-transfer-latency.json
│           │       │   ├── main.json
│           │       │   ├── memory-usage.json
│           │       │   └── network-bytes.json
│           │       ├── input.tf
│           │       └── main.tf
│           └── log_storage
│               ├── input.tf
│               ├── main.tf
│               └── stack-trace-bucket.tf
└── pip_package
    ├── CHANGELOG.md
    ├── README.md
    ├── cloud_tpu_diagnostics
    │   ├── __init__.py
    │   ├── configuration.py
    │   ├── diagnostic.py
    │   ├── src
    │   │   ├── config
    │   │   │   ├── debug_configuration.py
    │   │   │   ├── diagnostic_configuration.py
    │   │   │   └── stack_trace_configuration.py
    │   │   ├── debug.py
    │   │   ├── diagnose.py
    │   │   ├── stack_trace.py
    │   │   └── util
    │   │       ├── default.py
    │   │       └── stack_trace_test_util.py
    │   └── tests
    │       ├── debug_test.py
    │       ├── diagnose_test.py
    │       └── stack_trace_test.py
    └── pyproject.toml
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 |
16 | # How to contribute
17 |
18 | We'd love to accept your patches and contributions to this project.
19 |
20 | ## Before you begin
21 |
22 | ### Sign our Contributor License Agreement
23 |
24 | Contributions to this project must be accompanied by a
25 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
26 | You (or your employer) retain the copyright to your contribution; this simply
27 | gives us permission to use and redistribute your contributions as part of the
28 | project.
29 |
30 | If you or your current employer have already signed the Google CLA (even if it
31 | was for a different project), you probably don't need to do it again.
32 |
33 | Visit <https://cla.developers.google.com/> to see your current agreements or to
34 | sign a new one.
35 |
36 | ### Review our community guidelines
37 |
38 | This project follows
39 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
40 |
41 | ## Contribution process
42 |
43 | ### Code reviews
44 |
45 | All submissions, including submissions by project members, require review. We
46 | use GitHub pull requests for this purpose. Consult
47 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
48 | information on using pull requests.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
16 | # Cloud TPU Monitoring Debugging
17 |
18 | ## Overview
19 |
20 | The Cloud TPU Monitoring Debugging repository contains all the infrastructure and logic required to monitor and debug jobs running on Cloud TPU.
21 |
22 | Terraform is used to deploy resources in a Google Cloud project.
23 | Terraform is an open-source tool for setting up and managing Google Cloud
24 | infrastructure from configuration files. This repository helps
25 | customers deploy various Google Cloud resources via scripts, without any
26 | manual effort.
27 |
28 | The [cloud-tpu-diagnostics PyPI package](https://pypi.org/project/cloud-tpu-diagnostics) contains all the logic to monitor, debug, and profile jobs running on Cloud TPU.
29 |
30 | ## Getting Started with Terraform
31 |
32 | - Follow [this link](https://developer.hashicorp.com/terraform/tutorials/gcp-get-started/install-cli) to install Terraform on your machine.
33 | - Run `terraform init` to
34 | initialize the Google Cloud Terraform provider. This command adds
35 | the necessary plugins and builds the `.terraform` directory.
36 | - If there is an update to the Terraform Google Cloud provider version, run
37 | `terraform init --upgrade` for the update to take effect.
38 | - You can also run `terraform plan` to validate resource declarations and
39 | identify any syntax errors or version mismatches before deploying the resources; see the example below.
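
For example, a typical first run against one of the configurations in this repository might look like this (a sketch of the workflow described above):

```sh
terraform init            # download the Google Cloud provider plugins into .terraform/
terraform init --upgrade  # re-run only if the provider version has been updated
terraform plan            # validate the configuration before deploying any resources
```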
40 |
41 | ### Configure Terraform to store state in Cloud Storage
42 |
43 | By default, Terraform stores [state](https://www.terraform.io/docs/state/) locally in a file named `terraform.tfstate`. This default configuration can make Terraform usage difficult for teams, especially when many users run Terraform at the same time and each machine has its own understanding of the current infrastructure. To help avoid such issues, this section configures a remote state that points to a Google Cloud Storage (GCS) bucket.
44 |
45 | 1. In Cloud Shell, create the GCS bucket:
46 |
47 | gsutil mb gs://${GCS_BUCKET_NAME}
48 |
49 | 2. Enable [Object Versioning](https://cloud.google.com/storage/docs/object-versioning) to keep the history of your deployments. Enabling Object Versioning increases [storage costs](https://cloud.google.com/storage/pricing), which you can mitigate by configuring
50 | [Object Lifecycle Management](https://cloud.google.com/storage/docs/lifecycle) to delete old state versions.
51 |
52 | gsutil versioning set on gs://${GCS_BUCKET_NAME}
53 |
54 | 3. Enter the name of the GCS bucket created above when you run `terraform init` to initialize Terraform.
55 |
56 | Initializing the backend...
57 | bucket
58 | The name of the Google Cloud Storage bucket
59 |
60 | Enter a value:
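
Alternatively, you can pass the bucket name non-interactively. A minimal sketch, assuming `GCS_BUCKET_NAME` is the bucket created above:

```sh
terraform init -backend-config="bucket=${GCS_BUCKET_NAME}"
```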
61 |
62 | ## Deploy GCP Resources
63 | The following resources are managed in this directory:
64 |
65 | 1. **Monitoring Dashboard**: This is an outlier dashboard that displays statistics and outlier mode for TPU metrics.
66 | 2. **Debugging Dashboard**: This dashboard displays the stack traces collected in Cloud Logging for the process running on TPU VMs.
67 | 3. **Logging Storage**: This is a user-defined log bucket to store stack traces. Creating a new log storage bucket is completely optional. If you choose not to create a separate log bucket, the stack traces will be collected in the [_Default log bucket](https://cloud.google.com/logging/docs/routing/overview#default-bucket).
68 |
69 | ### Deploy Resources for Workloads on GCE
70 |
71 | Run `terraform init && terraform apply` inside the `gcp_resources/gce` directory to deploy all the resources mentioned above for TPU workloads running on GCE. You will be prompted to provide values for some input variables. After you confirm the action, all the resources are deployed automatically in your GCP project; see the example below.
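
For example, starting from the repository root (the GKE flow below is identical with `gke` in place of `gce`):

```sh
cd gcp_resources/gce
terraform init    # prompts for the GCS bucket that stores the Terraform state
terraform apply   # prompts for the input variables, then asks for confirmation
```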
72 |
73 | ### Deploy Resources for Workloads on GKE
74 |
75 | Run `terraform init && terraform apply` inside the `gcp_resources/gke` directory to deploy all the resources mentioned above for TPU workloads running on GKE. You will be prompted to provide values for some input variables. After you confirm the action, all the resources are deployed automatically in your GCP project.
76 |
77 | > **_NOTE:_** See the guide below for more details about GCE/GKE-specific resources and prerequisites.
78 |
79 | Follow the guide below to deploy the resources individually:
80 | ### Monitoring Dashboard
81 | #### GCE
82 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/dashboard/monitoring_dashboard/` to deploy only the monitoring dashboard for GCE in your GCP project.
83 |
84 | If the `node_prefix` parameter is not specified in the input variable `var.monitoring_dashboard_config`, or is set to an empty string, the metrics on the dashboard will plot the data points for all TPU VMs in your GCP project.
85 |
86 | For instance, if you provide `{"node_prefix": "test"}` as the value for the input variable `var.monitoring_dashboard_config`, the metrics on the monitoring dashboard will only show the data points for TPU VMs whose node names start with `test`. Refer to this [doc](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/tpus/queued-resources/create#--node-prefix) for more information on the node prefix for TPUs in Multislice.
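
For example, the `node_prefix` filter can also be passed on the command line instead of at the interactive prompt (a sketch; the values are illustrative):

```sh
terraform apply -var='monitoring_dashboard_config={"node_prefix":"test","outlier_count":10}'
```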
87 |
88 | #### GKE
89 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/dashboard/monitoring_dashboard/` to deploy only the monitoring dashboard for GKE in your GCP project.
90 |
91 | ### Debugging Dashboard
92 | #### GCE
93 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/dashboard/logging_dashboard/` to deploy only the debugging dashboard for GCE in your GCP project.
94 |
95 | #### GKE
96 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/dashboard/logging_dashboard/` to deploy only the debugging dashboard for GKE in your GCP project.
97 |
98 | To view traces in the debugging dashboard, users need to add a sidecar container to their TPU workload running on GKE. The sidecar container must be named so that it matches the regex `[a-z-0-9]*stacktrace[a-z-0-9]*`. Here is an example of a sidecar container that can be added:
99 |
100 | ```
101 | containers:
102 | - name: stacktrace-log-collector
103 | image: busybox:1.28
104 | resources:
105 | limits:
106 | cpu: 100m
107 | memory: 200Mi
108 | args: [/bin/sh, -c, "while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*"]
109 | volumeMounts:
110 | - name: tpu-debug-logs
111 | readOnly: true
112 | mountPath: /tmp/debugging
113 | - name:
114 | .....
115 | .....
116 | volumes:
117 | - name: tpu-debug-logs
118 | ```
119 |
120 | ### Log Storage
121 | #### GCE
122 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/log_storage/` to deploy a separate log bucket to store stack traces for GCE. You will be prompted to provide the name of your GCP project and the bucket configuration. You can also set the retention period for the bucket.
123 |
124 | #### GKE
125 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/log_storage/` to deploy a separate log bucket to store stack traces for GKE. You will be prompted to provide the name of your GCP project and the bucket configuration. You can also set the retention period for the bucket. Make sure that you have the sidecar container running in your GKE workload, as described in the [Debugging Dashboard section for GKE](#debugging-dashboard).
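
For example, a bucket with a 30-day retention period could be configured like this (a sketch; the bucket name is illustrative):

```sh
cd gcp_resources/gke/resources/log_storage
terraform init
terraform apply -var='stack_trace_bucket_config={"bucket_name":"stack-trace-bucket","retention_days":30}'
```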
--------------------------------------------------------------------------------
/gcp_resources/gce/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
20 | variable "monitoring_dashboard_config" {
21 | type = object({
22 | node_prefix : optional(string),
23 | outlier_count : optional(number)
24 | })
25 | description = "Configuration of the monitoring dashboard: optional node_prefix and outlier_count"
26 | }
27 |
35 | // Examples of stack_trace_bucket_config:
36 | // 1. To create stack trace bucket: {"bucket_name":"<bucket_name>"}
37 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<bucket_name>", "retention_days":x}
38 | // 3. To not create stack trace bucket: {}
39 | variable "stack_trace_bucket_config" {
40 | type = object({
41 | bucket_name : optional(string)
42 | retention_days : optional(number)
43 | })
44 | validation {
45 | condition = (
46 | (var.stack_trace_bucket_config.bucket_name == null &&
47 | var.stack_trace_bucket_config.retention_days == null) ||
48 | (var.stack_trace_bucket_config.bucket_name != null)
49 | )
50 | error_message = "bucket_name is not defined for stack_trace_bucket_config."
51 | }
52 | description = "Configuration of the log bucket to store stack traces: optional bucket_name and retention_days"
53 | }
54 |
--------------------------------------------------------------------------------
/gcp_resources/gce/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | terraform {
16 | required_providers {
17 | google = {
18 | source = "hashicorp/google"
19 | version = ">= 4.57.0"
20 | }
21 | }
22 | /*
23 | Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 | The bucket must exist prior to configuring the backend.
25 | For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 | */
27 | backend "gcs" {
28 | # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 | prefix = "gce"
30 | }
31 | }
32 |
33 | module "monitoring_dashboard" {
34 | source = "./resources/dashboard/monitoring_dashboard"
35 | project_name = var.project_name
36 | monitoring_dashboard_config = var.monitoring_dashboard_config
37 | }
38 |
39 | module "logging_dashboard" {
40 | source = "./resources/dashboard/logging_dashboard"
41 | project_name = var.project_name
42 | }
43 |
44 | module "log_storage" {
45 | source = "./resources/log_storage"
46 | project_name = var.project_name
47 | stack_trace_bucket_config = var.stack_trace_bucket_config
48 | }
49 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | data "google_project" "project" {
16 | project_id = var.project_name
17 | }
18 |
19 | // Add a dependency on log_metrics module to deploy log-based metrics before deploying logging dashboard
20 | module "log_metrics" {
21 | source = "./log_metrics"
22 | project_name = var.project_name
23 | }
24 |
25 | locals {
26 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
27 | {
28 | TILE_1 = templatefile("${path.module}/dashboard_json/stack-trace-counter-metric.json",
29 | {
30 | METRIC_NAME = module.log_metrics.stack_trace_counter_metric_id
31 | }),
32 | TILE_2 = templatefile("${path.module}/dashboard_json/stack-trace-log-panel.json",
33 | {
34 | PROJECT_NUMBER = data.google_project.project.number
35 | })
36 | })
37 | }
38 |
39 | resource "google_monitoring_dashboard" "logging_dashboard" {
40 | project = var.project_name
41 | dashboard_json = local.dashboard_json
42 | depends_on = [module.log_metrics.stack_trace_counter_metric]
43 | }
44 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
1 | {
2 | "category": "CUSTOM",
3 | "displayName": "GCE - TPU Logging Dashboard",
4 | "dashboardFilters": [
5 | {
6 | "filterType": "RESOURCE_LABEL",
7 | "labelKey": "node_id"
8 | },
9 | {
10 | "filterType": "RESOURCE_LABEL",
11 | "labelKey": "worker_id"
12 | }
13 | ],
14 | "mosaicLayout": {
15 | "columns": 12,
16 | "tiles": [
17 | ${TILE_1},
18 | ${TILE_2},
19 | {
20 | "height": 10,
21 | "widget": {
22 | "collapsibleGroup": {
23 | "collapsed": false
24 | },
25 | "title": "TPU VM Process Debugging"
26 | },
27 | "width": 12
28 | }
29 | ]
30 | }
31 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-counter-metric.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "timeSeriesTable": {
5 | "columnSettings": [
6 | {
7 | "column": "node_id",
8 | "visible": true
9 | },
10 | {
11 | "column": "worker_id",
12 | "visible": true
13 | },
14 | {
15 | "column": "zone",
16 | "visible": true
17 | },
18 | {
19 | "column": "value",
20 | "visible": true
21 | }
22 | ],
23 | "dataSets": [
24 | {
25 | "minAlignmentPeriod": "600s",
26 | "timeSeriesQuery": {
27 | "outputFullDuration": true,
28 | "timeSeriesFilter": {
29 | "aggregation": {
30 | "alignmentPeriod": "600s",
31 | "perSeriesAligner": "ALIGN_RATE"
32 | },
33 | "filter": "metric.type=\"logging.googleapis.com/user/${METRIC_NAME}\" resource.type=\"tpu_worker\"",
34 | "pickTimeSeriesFilter": {
35 | "direction": "TOP",
36 | "numTimeSeries": 300,
37 | "rankingMethod": "METHOD_MEAN"
38 | },
39 | "secondaryAggregation": {
40 | "alignmentPeriod": "600s",
41 | "crossSeriesReducer": "REDUCE_MEAN",
42 | "groupByFields": [
43 | "metric.label.\"node_id\"",
44 | "metric.label.\"worker_id\"",
45 | "metric.label.\"zone\""
46 | ],
47 | "perSeriesAligner": "ALIGN_MEAN"
48 | }
49 | }
50 | }
51 | }
52 | ],
53 | "metricVisualization": "BAR"
54 | },
55 | "title": "Stack Trace Log Entry Count per Period [Sorted by MEAN]"
56 | },
57 | "width": 12
58 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-log-panel.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 6,
3 | "widget": {
4 | "logsPanel": {
5 | "filter": "resource.type=\"tpu_worker\" log_id(\"tpu.googleapis.com/runtime_monitor\") jsonPayload.verb=\"stacktraceanalyzer\"",
6 | "resourceNames": [
7 | "projects/${PROJECT_NUMBER}"
8 | ]
9 | },
10 | "title": "Stack Trace Logs"
11 | },
12 | "width": 12,
13 | "yPos": 4
14 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/log_metrics/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/log_metrics/stack_trace_counter.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | // Metric that counts the number of stack trace entries that match a specified filter within a specific period
16 | resource "google_logging_metric" "stack_trace_counter_metric" {
17 | name = "stack_trace_counter_gce"
18 | project = var.project_name
19 | description = "Counts the number of stack trace log entries within a specific period."
20 | filter = "resource.type=\"tpu_worker\" AND log_id(\"tpu.googleapis.com/runtime_monitor\") AND jsonPayload.verb=\"stacktraceanalyzer\""
21 | metric_descriptor {
22 | metric_kind = "DELTA"
23 | value_type = "INT64"
24 | labels {
25 | key = "zone"
26 | value_type = "STRING"
27 | }
28 | labels {
29 | key = "node_id"
30 | value_type = "STRING"
31 | }
32 | labels {
33 | key = "worker_id"
34 | value_type = "STRING"
35 | }
36 | }
37 | label_extractors = {
38 | "zone" = "EXTRACT(resource.labels.zone)",
39 | "node_id" = "EXTRACT(resource.labels.node_id)",
40 | "worker_id" = "EXTRACT(resource.labels.worker_id)",
41 | }
42 | }
43 |
44 | output "stack_trace_counter_metric_id" {
45 | value = google_logging_metric.stack_trace_counter_metric.id
46 | }
47 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/logging_dashboard/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | terraform {
16 | required_providers {
17 | google = {
18 | source = "hashicorp/google"
19 | version = ">= 4.57.0"
20 | }
21 | }
22 | /*
23 | Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 | The bucket must exist prior to configuring the backend.
25 | For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 | */
27 | backend "gcs" {
28 | # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 | prefix = "gce/dashboard/logging_dashboard"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | outlier_count = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count
17 | node_prefix_regex = var.monitoring_dashboard_config.node_prefix == null ? "[a-z0-9-_]*" : "${var.monitoring_dashboard_config.node_prefix}[a-z0-9-_]*"
18 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
19 | {
20 | TILE_1 = templatefile("${path.module}/dashboard_json/cpu-utilization.json",
21 | {
22 | OUTLIER_COUNT = local.outlier_count,
23 | NODE_PREFIX_REGEX = local.node_prefix_regex
24 | }),
25 | TILE_2 = templatefile("${path.module}/dashboard_json/tensorcore-idle-duration.json",
26 | {
27 | OUTLIER_COUNT = local.outlier_count,
28 | NODE_PREFIX_REGEX = local.node_prefix_regex
29 | }),
30 | TILE_3 = templatefile("${path.module}/dashboard_json/memory-usage.json",
31 | {
32 | OUTLIER_COUNT = local.outlier_count,
33 | NODE_PREFIX_REGEX = local.node_prefix_regex
34 | }),
35 | TILE_4 = templatefile("${path.module}/dashboard_json/network-bytes.json",
36 | {
37 | OUTLIER_COUNT = local.outlier_count,
38 | NODE_PREFIX_REGEX = local.node_prefix_regex
39 | }),
40 | TILE_5 = templatefile("${path.module}/dashboard_json/dcn-transfer-latency.json",
41 | {
42 | OUTLIER_COUNT = local.outlier_count,
43 | NODE_PREFIX_REGEX = local.node_prefix_regex
44 | }),
45 | TILE_6 = templatefile("${path.module}/dashboard_json/host-to-device-transfer-latency.json",
46 | {
47 | OUTLIER_COUNT = local.outlier_count,
48 | NODE_PREFIX_REGEX = local.node_prefix_regex
49 | }),
50 | TILE_7 = templatefile("${path.module}/dashboard_json/device-to-host-transfer-latency.json",
51 | {
52 | OUTLIER_COUNT = local.outlier_count,
53 | NODE_PREFIX_REGEX = local.node_prefix_regex
54 | })
55 | })
56 | }
57 |
58 | resource "google_monitoring_dashboard" "monitoring_dashboard" {
59 | project = var.project_name
60 | dashboard_json = local.dashboard_json
61 | }
62 |
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/cpu-utilization.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "TPU Worker - CPU Utilization Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_NONE"
19 | },
20 | "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 1
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "TPU Worker - CPU Utilization Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "crossSeriesReducer": "REDUCE_MEAN",
55 | "groupByFields": [
56 | "resource.label.\"node_id\"",
57 | "resource.label.\"worker_id\""
58 | ],
59 | "perSeriesAligner": "ALIGN_MEAN"
60 | },
61 | "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
62 | "pickTimeSeriesFilter": {
63 | "direction": "TOP",
64 | "numTimeSeries": ${OUTLIER_COUNT},
65 | "rankingMethod": "METHOD_MEAN"
66 | }
67 | }
68 | }
69 | }
70 | ],
71 | "thresholds": [],
72 | "timeshiftDuration": "0s",
73 | "yAxis": {
74 | "label": "",
75 | "scale": "LINEAR"
76 | }
77 | }
78 | },
79 | "width": 6,
80 | "yPos": 5
81 | },
82 | {
83 | "height": 4,
84 | "widget": {
85 | "title": "TPU Worker - CPU Utilization Outliers [MAX]",
86 | "xyChart": {
87 | "chartOptions": {
88 | "mode": "COLOR"
89 | },
90 | "dataSets": [
91 | {
92 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
93 | "minAlignmentPeriod": "60s",
94 | "plotType": "LINE",
95 | "targetAxis": "Y1",
96 | "timeSeriesQuery": {
97 | "timeSeriesFilter": {
98 | "aggregation": {
99 | "alignmentPeriod": "60s",
100 | "crossSeriesReducer": "REDUCE_MAX",
101 | "groupByFields": [
102 | "resource.label.\"node_id\"",
103 | "resource.label.\"worker_id\""
104 | ],
105 | "perSeriesAligner": "ALIGN_MAX"
106 | },
107 | "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
108 | "pickTimeSeriesFilter": {
109 | "direction": "TOP",
110 | "numTimeSeries": ${OUTLIER_COUNT},
111 | "rankingMethod": "METHOD_MAX"
112 | }
113 | }
114 | }
115 | }
116 | ],
117 | "thresholds": [],
118 | "timeshiftDuration": "0s",
119 | "yAxis": {
120 | "label": "",
121 | "scale": "LINEAR"
122 | }
123 | }
124 | },
125 | "width": 6,
126 | "xPos": 6,
127 | "yPos": 5
128 | },
129 | {
130 | "height": 8,
131 | "widget": {
132 | "collapsibleGroup": {
133 | "collapsed": false
134 | },
135 | "title": "CPU Utilization on TPU Worker"
136 | },
137 | "width": 12,
138 | "yPos": 1
139 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/dcn-transfer-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "GCE Instance - DCN Transfer Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
19 | "perSeriesAligner": "ALIGN_SUM"
20 | },
21 | "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 42
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "GCE Instance - DCN Transfer Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "metric.label.\"node_id\"",
58 | "metric.label.\"worker_id\""
59 | ],
60 | "perSeriesAligner": "ALIGN_SUM"
61 | },
62 | "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
63 | "pickTimeSeriesFilter": {
64 | "direction": "TOP",
65 | "numTimeSeries": ${OUTLIER_COUNT},
66 | "rankingMethod": "METHOD_MAX"
67 | }
68 | }
69 | }
70 | }
71 | ],
72 | "thresholds": [],
73 | "timeshiftDuration": "0s",
74 | "yAxis": {
75 | "label": "",
76 | "scale": "LINEAR"
77 | }
78 | }
79 | },
80 | "width": 6,
81 | "yPos": 46
82 | },
83 | {
84 | "height": 4,
85 | "widget": {
86 | "title": "GCE Instance - DCN Transfer Latency Outliers [p99]",
87 | "xyChart": {
88 | "chartOptions": {
89 | "mode": "COLOR"
90 | },
91 | "dataSets": [
92 | {
93 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
94 | "minAlignmentPeriod": "60s",
95 | "plotType": "LINE",
96 | "targetAxis": "Y1",
97 | "timeSeriesQuery": {
98 | "timeSeriesFilter": {
99 | "aggregation": {
100 | "alignmentPeriod": "60s",
101 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
102 | "groupByFields": [
103 | "metric.label.\"node_id\"",
104 | "metric.label.\"worker_id\""
105 | ],
106 | "perSeriesAligner": "ALIGN_SUM"
107 | },
108 | "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
109 | "pickTimeSeriesFilter": {
110 | "direction": "TOP",
111 | "numTimeSeries": ${OUTLIER_COUNT},
112 | "rankingMethod": "METHOD_MAX"
113 | }
114 | }
115 | }
116 | }
117 | ],
118 | "thresholds": [],
119 | "timeshiftDuration": "0s",
120 | "yAxis": {
121 | "label": "",
122 | "scale": "LINEAR"
123 | }
124 | }
125 | },
126 | "width": 6,
127 | "xPos": 6,
128 | "yPos": 46
129 | },
130 | {
131 | "height": 8,
132 | "widget": {
133 | "collapsibleGroup": {
134 | "collapsed": false
135 | },
136 | "title": "DCN Transfer Latency"
137 | },
138 | "width": 12,
139 | "yPos": 42
140 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/device-to-host-transfer-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "GCE Instance - Device to Host Transfer Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
19 | "perSeriesAligner": "ALIGN_SUM"
20 | },
21 | "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 58
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "GCE Instance - Device to Host Transfer Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "metric.label.\"node_id\"",
58 | "metric.label.\"worker_id\""
59 | ],
60 | "perSeriesAligner": "ALIGN_SUM"
61 | },
62 | "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
63 | "pickTimeSeriesFilter": {
64 | "direction": "TOP",
65 | "numTimeSeries": ${OUTLIER_COUNT},
66 | "rankingMethod": "METHOD_MAX"
67 | }
68 | }
69 | }
70 | }
71 | ],
72 | "thresholds": [],
73 | "timeshiftDuration": "0s",
74 | "yAxis": {
75 | "label": "",
76 | "scale": "LINEAR"
77 | }
78 | }
79 | },
80 | "width": 6,
81 | "yPos": 62
82 | },
83 | {
84 | "height": 4,
85 | "widget": {
86 | "title": "GCE Instance - Device to Host Transfer Latency Outliers [p99]",
87 | "xyChart": {
88 | "chartOptions": {
89 | "mode": "COLOR"
90 | },
91 | "dataSets": [
92 | {
93 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
94 | "minAlignmentPeriod": "60s",
95 | "plotType": "LINE",
96 | "targetAxis": "Y1",
97 | "timeSeriesQuery": {
98 | "timeSeriesFilter": {
99 | "aggregation": {
100 | "alignmentPeriod": "60s",
101 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
102 | "groupByFields": [
103 | "metric.label.\"node_id\"",
104 | "metric.label.\"worker_id\""
105 | ],
106 | "perSeriesAligner": "ALIGN_SUM"
107 | },
108 | "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
109 | "pickTimeSeriesFilter": {
110 | "direction": "TOP",
111 | "numTimeSeries": ${OUTLIER_COUNT},
112 | "rankingMethod": "METHOD_MAX"
113 | }
114 | }
115 | }
116 | }
117 | ],
118 | "thresholds": [],
119 | "timeshiftDuration": "0s",
120 | "yAxis": {
121 | "label": "",
122 | "scale": "LINEAR"
123 | }
124 | }
125 | },
126 | "width": 6,
127 | "xPos": 6,
128 | "yPos": 62
129 | },
130 | {
131 | "height": 8,
132 | "widget": {
133 | "collapsibleGroup": {
134 | "collapsed": false
135 | },
136 | "title": "Device to Host Transfer Latency"
137 | },
138 | "width": 12,
139 | "yPos": 58
140 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/host-to-device-transfer-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "GCE Instance - Host to Device Transfer Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
19 | "perSeriesAligner": "ALIGN_SUM"
20 | },
21 | "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 50
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "GCE Instance - Host to Device Transfer Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "metric.label.\"node_id\"",
58 | "metric.label.\"worker_id\""
59 | ],
60 | "perSeriesAligner": "ALIGN_SUM"
61 | },
62 | "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
63 | "pickTimeSeriesFilter": {
64 | "direction": "TOP",
65 | "numTimeSeries": ${OUTLIER_COUNT},
66 | "rankingMethod": "METHOD_MAX"
67 | }
68 | }
69 | }
70 | }
71 | ],
72 | "thresholds": [],
73 | "timeshiftDuration": "0s",
74 | "yAxis": {
75 | "label": "",
76 | "scale": "LINEAR"
77 | }
78 | }
79 | },
80 | "width": 6,
81 | "yPos": 54
82 | },
83 | {
84 | "height": 4,
85 | "widget": {
86 | "title": "GCE Instance - Host to Device Transfer Latency Outliers [p99]",
87 | "xyChart": {
88 | "chartOptions": {
89 | "mode": "COLOR"
90 | },
91 | "dataSets": [
92 | {
93 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}",
94 | "minAlignmentPeriod": "60s",
95 | "plotType": "LINE",
96 | "targetAxis": "Y1",
97 | "timeSeriesQuery": {
98 | "timeSeriesFilter": {
99 | "aggregation": {
100 | "alignmentPeriod": "60s",
101 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
102 | "groupByFields": [
103 | "metric.label.\"node_id\"",
104 | "metric.label.\"worker_id\""
105 | ],
106 | "perSeriesAligner": "ALIGN_SUM"
107 | },
108 | "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
109 | "pickTimeSeriesFilter": {
110 | "direction": "TOP",
111 | "numTimeSeries": ${OUTLIER_COUNT},
112 | "rankingMethod": "METHOD_MAX"
113 | }
114 | }
115 | }
116 | }
117 | ],
118 | "thresholds": [],
119 | "timeshiftDuration": "0s",
120 | "yAxis": {
121 | "label": "",
122 | "scale": "LINEAR"
123 | }
124 | }
125 | },
126 | "width": 6,
127 | "xPos": 6,
128 | "yPos": 54
129 | },
130 | {
131 | "height": 8,
132 | "widget": {
133 | "collapsibleGroup": {
134 | "collapsed": false
135 | },
136 | "title": "Host to Device Transfer Latency"
137 | },
138 | "width": 12,
139 | "yPos": 50
140 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
1 | {
2 | "category": "CUSTOM",
3 | "displayName": "GCE - TPU Monitoring Dashboard",
4 | "dashboardFilters": [
5 | {
6 | "filterType": "RESOURCE_LABEL",
7 | "labelKey": "worker_id"
8 | }
9 | ],
10 | "mosaicLayout": {
11 | "columns": 12,
12 | "tiles": [
13 | {
14 | "height": 1,
15 | "widget": {
16 | "title": "TPU Worker Metrics",
17 | "text": {
18 | "content": ""
19 | }
20 | },
21 | "width": 12,
22 | "yPos": 0
23 | },
24 | ${TILE_1},
25 | ${TILE_2},
26 | ${TILE_3},
27 | ${TILE_4},
28 | {
29 | "height": 1,
30 | "widget": {
31 | "title": "Megascale Metrics",
32 | "text": {
33 | "content": ""
34 | }
35 | },
36 | "width": 12,
37 | "yPos": 41
38 | },
39 | ${TILE_5},
40 | ${TILE_6},
41 | ${TILE_7}
42 | ]
43 | }
44 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/memory-usage.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "TPU VM - Memory Usage Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_NONE"
19 | },
20 | "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 9
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "TPU VM - Memory Usage Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "crossSeriesReducer": "REDUCE_MEAN",
55 | "groupByFields": [
56 | "resource.label.\"node_id\"",
57 | "resource.label.\"worker_id\""
58 | ],
59 | "perSeriesAligner": "ALIGN_MEAN"
60 | },
61 | "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
62 | "pickTimeSeriesFilter": {
63 | "direction": "TOP",
64 | "numTimeSeries": ${OUTLIER_COUNT},
65 | "rankingMethod": "METHOD_MEAN"
66 | }
67 | }
68 | }
69 | }
70 | ],
71 | "thresholds": [],
72 | "timeshiftDuration": "0s",
73 | "yAxis": {
74 | "label": "",
75 | "scale": "LINEAR"
76 | }
77 | }
78 | },
79 | "width": 6,
80 | "yPos": 13
81 | },
82 | {
83 | "height": 4,
84 | "widget": {
85 | "title": "TPU VM - Memory Usage Outliers [MAX]",
86 | "xyChart": {
87 | "chartOptions": {
88 | "mode": "COLOR"
89 | },
90 | "dataSets": [
91 | {
92 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
93 | "minAlignmentPeriod": "60s",
94 | "plotType": "LINE",
95 | "targetAxis": "Y1",
96 | "timeSeriesQuery": {
97 | "timeSeriesFilter": {
98 | "aggregation": {
99 | "alignmentPeriod": "60s",
100 | "crossSeriesReducer": "REDUCE_MAX",
101 | "groupByFields": [
102 | "resource.label.\"node_id\"",
103 | "resource.label.\"worker_id\""
104 | ],
105 | "perSeriesAligner": "ALIGN_MAX"
106 | },
107 | "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
108 | "pickTimeSeriesFilter": {
109 | "direction": "TOP",
110 | "numTimeSeries": ${OUTLIER_COUNT},
111 | "rankingMethod": "METHOD_MAX"
112 | }
113 | }
114 | }
115 | }
116 | ],
117 | "thresholds": [],
118 | "timeshiftDuration": "0s",
119 | "yAxis": {
120 | "label": "",
121 | "scale": "LINEAR"
122 | }
123 | }
124 | },
125 | "width": 6,
126 | "xPos": 6,
127 | "yPos": 13
128 | },
129 | {
130 | "height": 8,
131 | "widget": {
132 | "collapsibleGroup": {
133 | "collapsed": false
134 | },
135 | "title": "Memory Usage by TPU VM"
136 | },
137 | "width": 12,
138 | "yPos": 9
139 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/network-bytes.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "TPU VM - Network Bytes Sent Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_NONE"
19 | },
20 | "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 25
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "TPU VM - Network Bytes Sent Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "perSeriesAligner": "ALIGN_RATE"
55 | },
56 | "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
57 | "pickTimeSeriesFilter": {
58 | "direction": "TOP",
59 | "numTimeSeries": ${OUTLIER_COUNT},
60 | "rankingMethod": "METHOD_MEAN"
61 | }
62 | }
63 | }
64 | }
65 | ],
66 | "thresholds": [],
67 | "timeshiftDuration": "0s",
68 | "yAxis": {
69 | "label": "",
70 | "scale": "LINEAR"
71 | }
72 | }
73 | },
74 | "width": 6,
75 | "yPos": 29
76 | },
77 | {
78 | "height": 4,
79 | "widget": {
80 | "title": "TPU VM - Network Bytes Sent Outliers [MAX]",
81 | "xyChart": {
82 | "chartOptions": {
83 | "mode": "COLOR"
84 | },
85 | "dataSets": [
86 | {
87 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
88 | "minAlignmentPeriod": "60s",
89 | "plotType": "LINE",
90 | "targetAxis": "Y1",
91 | "timeSeriesQuery": {
92 | "timeSeriesFilter": {
93 | "aggregation": {
94 | "alignmentPeriod": "60s",
95 | "perSeriesAligner": "ALIGN_RATE"
96 | },
97 | "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
98 | "pickTimeSeriesFilter": {
99 | "direction": "TOP",
100 | "numTimeSeries": ${OUTLIER_COUNT},
101 | "rankingMethod": "METHOD_MAX"
102 | }
103 | }
104 | }
105 | }
106 | ],
107 | "thresholds": [],
108 | "timeshiftDuration": "0s",
109 | "yAxis": {
110 | "label": "",
111 | "scale": "LINEAR"
112 | }
113 | }
114 | },
115 | "width": 6,
116 | "xPos": 6,
117 | "yPos": 29
118 | },
119 | {
120 | "height": 4,
121 | "widget": {
122 | "title": "TPU VM - Network Bytes Received Stats",
123 | "xyChart": {
124 | "chartOptions": {
125 | "mode": "STATS"
126 | },
127 | "dataSets": [
128 | {
129 | "minAlignmentPeriod": "60s",
130 | "plotType": "LINE",
131 | "targetAxis": "Y1",
132 | "timeSeriesQuery": {
133 | "timeSeriesFilter": {
134 | "aggregation": {
135 | "alignmentPeriod": "60s",
136 | "perSeriesAligner": "ALIGN_NONE"
137 | },
138 | "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
139 | }
140 | }
141 | }
142 | ],
143 | "thresholds": [],
144 | "timeshiftDuration": "0s",
145 | "yAxis": {
146 | "label": "",
147 | "scale": "LINEAR"
148 | }
149 | }
150 | },
151 | "width": 12,
152 | "yPos": 33
153 | },
154 | {
155 | "height": 4,
156 | "widget": {
157 | "title": "TPU VM - Network Bytes Received Outliers [MEAN]",
158 | "xyChart": {
159 | "chartOptions": {
160 | "mode": "COLOR"
161 | },
162 | "dataSets": [
163 | {
164 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
165 | "minAlignmentPeriod": "60s",
166 | "plotType": "LINE",
167 | "targetAxis": "Y1",
168 | "timeSeriesQuery": {
169 | "timeSeriesFilter": {
170 | "aggregation": {
171 | "alignmentPeriod": "60s",
172 | "perSeriesAligner": "ALIGN_RATE"
173 | },
174 | "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
175 | "pickTimeSeriesFilter": {
176 | "direction": "TOP",
177 | "numTimeSeries": ${OUTLIER_COUNT},
178 | "rankingMethod": "METHOD_MEAN"
179 | }
180 | }
181 | }
182 | }
183 | ],
184 | "thresholds": [],
185 | "timeshiftDuration": "0s",
186 | "yAxis": {
187 | "label": "",
188 | "scale": "LINEAR"
189 | }
190 | }
191 | },
192 | "width": 6,
193 | "yPos": 37
194 | },
195 | {
196 | "height": 4,
197 | "widget": {
198 | "title": "TPU VM - Network Bytes Received Outliers [MAX]",
199 | "xyChart": {
200 | "chartOptions": {
201 | "mode": "COLOR"
202 | },
203 | "dataSets": [
204 | {
205 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
206 | "minAlignmentPeriod": "60s",
207 | "plotType": "LINE",
208 | "targetAxis": "Y1",
209 | "timeSeriesQuery": {
210 | "timeSeriesFilter": {
211 | "aggregation": {
212 | "alignmentPeriod": "60s",
213 | "perSeriesAligner": "ALIGN_RATE"
214 | },
215 | "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
216 | "pickTimeSeriesFilter": {
217 | "direction": "TOP",
218 | "numTimeSeries": ${OUTLIER_COUNT},
219 | "rankingMethod": "METHOD_MAX"
220 | }
221 | }
222 | }
223 | }
224 | ],
225 | "thresholds": [],
226 | "timeshiftDuration": "0s",
227 | "yAxis": {
228 | "label": "",
229 | "scale": "LINEAR"
230 | }
231 | }
232 | },
233 | "width": 6,
234 | "xPos": 6,
235 | "yPos": 37
236 | },
237 | {
238 | "height": 16,
239 | "widget": {
240 | "collapsibleGroup": {
241 | "collapsed": false
242 | },
243 | "title": "Network Bytes Received and Sent by TPU VM"
244 | },
245 | "width": 12,
246 | "yPos": 25
247 | }
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/tensorcore-idle-duration.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Tensorcore Idle Duration Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_NONE"
19 | },
20 | "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 17
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "Tensorcore Idle Duration Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "crossSeriesReducer": "REDUCE_MEAN",
55 | "groupByFields": [
56 | "resource.label.\"node_id\"",
57 | "resource.label.\"worker_id\""
58 | ],
59 | "perSeriesAligner": "ALIGN_MEAN"
60 | },
61 | "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
62 | "pickTimeSeriesFilter": {
63 | "direction": "TOP",
64 | "numTimeSeries": ${OUTLIER_COUNT},
65 | "rankingMethod": "METHOD_MEAN"
66 | }
67 | }
68 | }
69 | }
70 | ],
71 | "thresholds": [],
72 | "timeshiftDuration": "0s",
73 | "yAxis": {
74 | "label": "",
75 | "scale": "LINEAR"
76 | }
77 | }
78 | },
79 | "width": 6,
80 | "yPos": 21
81 | },
82 | {
83 | "height": 4,
84 | "widget": {
85 | "title": "Tensorcore Idle Duration Outliers [MAX]",
86 | "xyChart": {
87 | "chartOptions": {
88 | "mode": "COLOR"
89 | },
90 | "dataSets": [
91 | {
92 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}",
93 | "minAlignmentPeriod": "60s",
94 | "plotType": "LINE",
95 | "targetAxis": "Y1",
96 | "timeSeriesQuery": {
97 | "timeSeriesFilter": {
98 | "aggregation": {
99 | "alignmentPeriod": "60s",
100 | "crossSeriesReducer": "REDUCE_MAX",
101 | "groupByFields": [
102 | "resource.label.\"node_id\"",
103 | "resource.label.\"worker_id\""
104 | ],
105 | "perSeriesAligner": "ALIGN_MAX"
106 | },
107 | "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")",
108 | "pickTimeSeriesFilter": {
109 | "direction": "TOP",
110 | "numTimeSeries": ${OUTLIER_COUNT},
111 | "rankingMethod": "METHOD_MAX"
112 | }
113 | }
114 | }
115 | }
116 | ],
117 | "thresholds": [],
118 | "timeshiftDuration": "0s",
119 | "yAxis": {
120 | "label": "",
121 | "scale": "LINEAR"
122 | }
123 | }
124 | },
125 | "width": 6,
126 | "xPos": 6,
127 | "yPos": 21
128 | },
129 | {
130 | "height": 8,
131 | "widget": {
132 | "collapsibleGroup": {
133 | "collapsed": false
134 | },
135 | "title": "Tensorcore Idle Duration of TPU Chip"
136 | },
137 | "width": 12,
138 | "yPos": 17
139 | }
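
Every GCE tile above is parameterized by ${NODE_PREFIX_REGEX} and ${OUTLIER_COUNT}. One plausible way to derive those values from the monitoring_dashboard_config variable declared in the input.tf that follows is sketched here; the module's actual dashboard.tf is not shown in this listing, so treat these locals as an illustrative assumption:

locals {
  # Default to 10 outliers per chart when outlier_count is not set, mirroring the GKE module below.
  outlier_count = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count

  # Match every TPU node when node_prefix is not set, otherwise nodes whose node_id starts with the prefix.
  node_prefix_regex = var.monitoring_dashboard_config.node_prefix == null ? ".*" : format("%s.*", var.monitoring_dashboard_config.node_prefix)
}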
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
20 | variable "monitoring_dashboard_config" {
21 | type = object({
22 | node_prefix : optional(string),
23 | outlier_count : optional(number)
24 | })
25 |   description = <<-EOT
...
EOT
}
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/main.tf:
--------------------------------------------------------------------------------
...
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gce/dashboard/monitoring_dashboard"
30 |   }
31 | }
32 |
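
A minimal terraform.tfvars sketch for this GCE monitoring-dashboard module; the values are placeholders, not defaults taken from the repository:

project_name = "my-gcp-project"

monitoring_dashboard_config = {
  node_prefix   = "my-tpu-node" # charts match TPU nodes whose node_id starts with this prefix
  outlier_count = 10            # number of outlier series shown in the MEAN/MAX charts
}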
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/log_storage/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
20 | // Valid inputs:
21 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":"<bucket_name>"}
22 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<bucket_name>", "retention_days":x}
23 | // 3. To not create stack trace bucket: {}
24 | variable "stack_trace_bucket_config" {
25 | type = object({
26 | bucket_name : optional(string)
27 | retention_days : optional(number)
28 | })
29 | validation {
30 | condition = (
31 | (var.stack_trace_bucket_config.bucket_name == null &&
32 | var.stack_trace_bucket_config.retention_days == null) ||
33 | (var.stack_trace_bucket_config.bucket_name != null)
34 | )
35 | error_message = "bucket_name is not defined for stack_trace_bucket_config."
36 | }
37 |   description = <<-EOT
...
EOT
}
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/log_storage/main.tf:
--------------------------------------------------------------------------------
...
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gce/log_storage"
30 |   }
31 | }
32 |
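
The three valid shapes of stack_trace_bucket_config described in the comments above look like this in a terraform.tfvars file; the bucket name is a placeholder:

# 1. Create a stack trace bucket with the default 30-day retention.
stack_trace_bucket_config = { bucket_name = "my-stack-trace-bucket" }

# 2. Create a stack trace bucket with a custom retention period.
# stack_trace_bucket_config = { bucket_name = "my-stack-trace-bucket", retention_days = 14 }

# 3. Do not create a stack trace bucket at all.
# stack_trace_bucket_config = {}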
--------------------------------------------------------------------------------
/gcp_resources/gce/resources/log_storage/stack-trace-bucket.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | stack_trace_filter = "projects/${var.project_name}/logs/tpu.googleapis.com%2Fruntime_monitor AND jsonPayload.verb=stacktraceanalyzer"
17 | stack_trace_bucket_counter = var.stack_trace_bucket_config.bucket_name == null ? 0 : 1
18 | }
19 |
20 | resource "google_logging_project_bucket_config" "log_bucket" {
21 | count = local.stack_trace_bucket_counter
22 | project = var.project_name
23 | location = "global"
24 | // default retention period is 30 days
25 | retention_days = var.stack_trace_bucket_config.retention_days == null ? 30 : var.stack_trace_bucket_config.retention_days
26 | bucket_id = var.stack_trace_bucket_config.bucket_name
27 | }
28 |
29 | resource "google_logging_project_sink" "log_sink" {
30 | count = local.stack_trace_bucket_counter
31 | project = var.project_name
32 | name = "${var.stack_trace_bucket_config.bucket_name}_sink"
33 | destination = "logging.googleapis.com/projects/${var.project_name}/locations/global/buckets/${google_logging_project_bucket_config.log_bucket[count.index].bucket_id}"
34 | filter = local.stack_trace_filter
35 | }
36 |
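
Because both resources above are created conditionally through count, they may not exist when stack_trace_bucket_config is empty. A hedged sketch of exposing the created bucket id without failing in that case; this output is not part of the module and is shown only as one way a consumer might extend it:

output "stack_trace_bucket_id" {
  # one() returns the single created bucket id, or null when count = 0.
  value = one(google_logging_project_bucket_config.log_bucket[*].id)
}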
--------------------------------------------------------------------------------
/gcp_resources/gke/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
20 | variable "monitoring_dashboard_config" {
21 | type = object({
22 | outlier_count : optional(number)
23 | })
24 |   description = <<-EOT
...
EOT
}

// Valid inputs:
// 1. To create stack trace bucket for 30 retention days: {"bucket_name":"<bucket_name>"}
35 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"<bucket_name>", "retention_days":x}
36 | // 3. To not create stack trace bucket: {}
37 | variable "stack_trace_bucket_config" {
38 | type = object({
39 | bucket_name : optional(string)
40 | retention_days : optional(number)
41 | })
42 | validation {
43 | condition = (
44 | (var.stack_trace_bucket_config.bucket_name == null &&
45 | var.stack_trace_bucket_config.retention_days == null) ||
46 | (var.stack_trace_bucket_config.bucket_name != null)
47 | )
48 | error_message = "bucket_name is not defined for stack_trace_bucket_config."
49 | }
50 |   description = <<-EOT
...
EOT
}
--------------------------------------------------------------------------------
/gcp_resources/gke/main.tf:
--------------------------------------------------------------------------------
...
27 |   backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 |     prefix = "gke"
30 |   }
31 | }
32 |
33 | module "monitoring_dashboard" {
34 | source = "./resources/dashboard/monitoring_dashboard"
35 | project_name = var.project_name
36 | monitoring_dashboard_config = var.monitoring_dashboard_config
37 | }
38 |
39 | module "logging_dashboard" {
40 | source = "./resources/dashboard/logging_dashboard"
41 | project_name = var.project_name
42 | }
43 |
44 | module "log_storage" {
45 | source = "./resources/log_storage"
46 | project_name = var.project_name
47 | stack_trace_bucket_config = var.stack_trace_bucket_config
48 | }
49 |
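
A minimal terraform.tfvars sketch for this GKE root module, covering the three variables declared in input.tf; the values are placeholders:

project_name = "my-gcp-project"

monitoring_dashboard_config = {
  outlier_count = 10
}

# Leave empty to skip creating the stack trace log bucket and sink.
stack_trace_bucket_config = {}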
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | data "google_project" "project" {
16 | project_id = var.project_name
17 | }
18 |
19 | // Add a dependency on log_metrics module to deploy log-based metrics before deploying logging dashboard
20 | module "log_metrics" {
21 | source = "./log_metrics"
22 | project_name = var.project_name
23 | }
24 |
25 | locals {
26 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
27 | {
28 | TILE_1 = templatefile("${path.module}/dashboard_json/stack-trace-counter-metric.json",
29 | {
30 | METRIC_NAME = module.log_metrics.stack_trace_counter_metric_id
31 | }),
32 | TILE_2 = templatefile("${path.module}/dashboard_json/stack-trace-log-panel.json",
33 | {
34 | PROJECT_NUMBER = data.google_project.project.number
35 | })
36 | })
37 | }
38 |
39 | resource "google_monitoring_dashboard" "logging_dashboard" {
40 | project = var.project_name
41 | dashboard_json = local.dashboard_json
42 | depends_on = [module.log_metrics.stack_trace_counter_metric]
43 | }
44 |
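
If you want to inspect the JSON that templatefile() produces before it is sent to Cloud Monitoring, an output like the following can be added; it is not part of the module and is shown only as a debugging sketch:

output "logging_dashboard_json" {
  description = "Rendered dashboard JSON, useful for checking the template substitution."
  value       = local.dashboard_json
}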
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
1 | {
2 | "category": "CUSTOM",
3 | "displayName": "GKE - TPU Logging Dashboard",
4 | "dashboardFilters": [
5 | {
6 | "filterType": "RESOURCE_LABEL",
7 | "labelKey": "cluster_name",
8 | "templateVariable": "ClusterName"
9 | },
10 | {
11 | "filterType": "USER_METADATA_LABEL",
12 | "labelKey": "jobset.sigs.k8s.io/jobset-name",
13 | "templateVariable": "JobName"
14 | }
15 | ],
16 | "mosaicLayout": {
17 | "columns": 12,
18 | "tiles": [
19 | ${TILE_1},
20 | ${TILE_2},
21 | {
22 | "height": 10,
23 | "widget": {
24 | "collapsibleGroup": {
25 | "collapsed": false
26 | },
27 | "title": "TPU VM Process Debugging"
28 | },
29 | "width": 12
30 | }
31 | ]
32 | }
33 | }
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-counter-metric.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "timeSeriesTable": {
5 | "columnSettings": [
6 | {
7 | "column": "location",
8 | "visible": true
9 | },
10 | {
11 | "column": "pod",
12 | "visible": true
13 | },
14 | {
15 | "column": "cluster",
16 | "visible": true
17 | },
18 | {
19 | "column": "job_name",
20 | "visible": true
21 | },
22 | {
23 | "column": "value",
24 | "visible": true
25 | }
26 | ],
27 | "dataSets": [
28 | {
29 | "minAlignmentPeriod": "600s",
30 | "timeSeriesQuery": {
31 | "outputFullDuration": true,
32 | "timeSeriesFilter": {
33 | "aggregation": {
34 | "alignmentPeriod": "600s",
35 | "perSeriesAligner": "ALIGN_RATE"
36 | },
37 | "filter": "metric.type=\"logging.googleapis.com/user/${METRIC_NAME}\" resource.type=\"k8s_container\" $${ClusterName} $${JobName}",
38 | "secondaryAggregation": {
39 | "alignmentPeriod": "600s",
40 | "crossSeriesReducer": "REDUCE_MEAN",
41 | "groupByFields": [
42 | "metric.label.\"location\"",
43 | "metric.label.\"pod\"",
44 | "metric.label.\"cluster\"",
45 | "metric.label.\"job_name\""
46 | ],
47 | "perSeriesAligner": "ALIGN_MEAN"
48 | }
49 | }
50 | }
51 | }
52 | ],
53 | "metricVisualization": "BAR"
54 | },
55 | "title": "Stack Trace Log Entry Count per Period [Sorted by MEAN]"
56 | },
57 | "width": 12
58 | }
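
Note the two kinds of interpolation in this template: ${METRIC_NAME} is substituted by Terraform's templatefile(), while $${ClusterName} and $${JobName} are escaped so that a literal ${ClusterName} / ${JobName} reaches Cloud Monitoring, where the dashboard filters defined in main.json resolve them. A tiny sketch of that escaping behaviour, with a hypothetical template file:

# snippet.json.tpl (hypothetical) contains both ${METRIC_NAME} and $${ClusterName}.
locals {
  rendered = templatefile("${path.module}/snippet.json.tpl", {
    METRIC_NAME = "stack_trace_counter_gke"
  })
  # In the rendered string, METRIC_NAME is replaced with "stack_trace_counter_gke",
  # while $${ClusterName} becomes the literal text ${ClusterName} for Cloud Monitoring to resolve.
}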
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-log-panel.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 6,
3 | "widget": {
4 | "logsPanel": {
5 | "filter": "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\" AND $${ClusterName}",
6 | "resourceNames": [
7 | "projects/${PROJECT_NUMBER}"
8 | ]
9 | },
10 | "title": "Stack Trace Logs"
11 | },
12 | "width": 12,
13 | "yPos": 4
14 | }
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/log_metrics/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/log_metrics/stack_trace_counter.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | // Metric that counts the number of stack trace entries that match a specified filter within a specific period
16 | resource "google_logging_metric" "stack_trace_counter_metric" {
17 | name = "stack_trace_counter_gke"
18 | project = var.project_name
19 | description = "Counts the number of stack trace log entries within a specific period."
20 | filter = "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\""
21 | metric_descriptor {
22 | metric_kind = "DELTA"
23 | value_type = "INT64"
24 | labels {
25 | key = "location"
26 | value_type = "STRING"
27 | }
28 | labels {
29 | key = "cluster"
30 | value_type = "STRING"
31 | }
32 | labels {
33 | key = "pod"
34 | value_type = "STRING"
35 | }
36 | labels {
37 | key = "job_name"
38 | value_type = "STRING"
39 | }
40 | }
41 | label_extractors = {
42 | "location" = "EXTRACT(resource.labels.location)",
43 | "cluster" = "EXTRACT(resource.labels.cluster_name)",
44 | "pod" = "EXTRACT(resource.labels.pod_name)",
45 | "job_name" = "EXTRACT(labels.k8s-pod/job-name)",
46 | }
47 | }
48 |
49 | output "stack_trace_counter_metric_id" {
50 | value = google_logging_metric.stack_trace_counter_metric.id
51 | }
52 |
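
This module only exposes the metric id for the dashboard (dashboard.tf above consumes module.log_metrics.stack_trace_counter_metric_id). If you also wanted to be notified when stack traces start appearing, a hedged sketch of an alerting policy on the same log-based metric might look like the following; it is not part of this repository, and the arguments follow the google provider's google_monitoring_alert_policy schema:

resource "google_monitoring_alert_policy" "stack_trace_alert" {
  project      = var.project_name
  display_name = "Stack traces detected on TPU workloads"
  combiner     = "OR"

  conditions {
    display_name = "stack_trace_counter_gke above zero"
    condition_threshold {
      # Same log-based metric as above, surfaced under logging.googleapis.com/user/<name>.
      filter          = "metric.type=\"logging.googleapis.com/user/${google_logging_metric.stack_trace_counter_metric.name}\" AND resource.type=\"k8s_container\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s"
      aggregations {
        alignment_period   = "600s"
        per_series_aligner = "ALIGN_RATE"
      }
    }
  }
}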
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/logging_dashboard/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | terraform {
16 | required_providers {
17 | google = {
18 | source = "hashicorp/google"
19 | version = ">= 4.57.0"
20 | }
21 | }
22 | /*
23 | Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS).
24 | The bucket must exist prior to configuring the backend.
25 | For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs
26 | */
27 | backend "gcs" {
28 |     # GCS prefix inside the bucket. terraform states are stored in an object called <prefix>/default.tfstate
29 | prefix = "gke/dashboard/logging_dashboard"
30 | }
31 | }
32 |
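
The backend "gcs" block above only sets the prefix, so the state bucket itself has to be supplied at init time through partial backend configuration, for example with terraform init -backend-config=backend.hcl. A sketch of such a backend config file, with a placeholder bucket name:

# backend.hcl
bucket = "my-terraform-state-bucket"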
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | outlier_count = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count
17 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json",
18 | {
19 | TILE_1 = templatefile("${path.module}/dashboard_json/cpu-utilization.json",
20 | {
21 | OUTLIER_COUNT = local.outlier_count
22 | }),
23 | TILE_2 = templatefile("${path.module}/dashboard_json/memory-usage.json",
24 | {
25 | OUTLIER_COUNT = local.outlier_count
26 | }),
27 | TILE_3 = templatefile("${path.module}/dashboard_json/accelerator-memory-used.json",
28 | {
29 | OUTLIER_COUNT = local.outlier_count
30 | }),
31 | TILE_4 = templatefile("${path.module}/dashboard_json/duty-cycle.json",
32 | {
33 | OUTLIER_COUNT = local.outlier_count
34 | }),
35 | TILE_5 = templatefile("${path.module}/dashboard_json/network-bytes.json",
36 | {
37 | OUTLIER_COUNT = local.outlier_count
38 | }),
39 | TILE_6 = templatefile("${path.module}/dashboard_json/dcn-transfer-latency.json",
40 | {
41 | OUTLIER_COUNT = local.outlier_count
42 | }),
43 | TILE_7 = templatefile("${path.module}/dashboard_json/collectives-latency.json",
44 | {
45 | OUTLIER_COUNT = local.outlier_count
46 | }),
47 | TILE_8 = templatefile("${path.module}/dashboard_json/host-to-device-transfer-latency.json",
48 | {
49 | OUTLIER_COUNT = local.outlier_count
50 | }),
51 | TILE_9 = templatefile("${path.module}/dashboard_json/device-to-host-transfer-latency.json",
52 | {
53 | OUTLIER_COUNT = local.outlier_count
54 | })
55 | })
56 | }
57 |
58 | resource "google_monitoring_dashboard" "monitoring_dashboard" {
59 | project = var.project_name
60 | dashboard_json = local.dashboard_json
61 | }
62 |
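
The null check on outlier_count above is how the module defaults to 10 outlier series per chart. An equivalent, slightly terser formulation, shown only as an alternative and not what the module uses:

locals {
  # coalesce() returns the first non-null argument, so a missing outlier_count falls back to 10.
  outlier_count = coalesce(var.monitoring_dashboard_config.outlier_count, 10)
}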
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/accelerator-memory-used.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Accelerator Memory Used Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_MEAN"
19 | },
20 | "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 16
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "Accelerator Memory Used Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "perSeriesAligner": "ALIGN_MEAN"
55 | },
56 | "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
57 | "pickTimeSeriesFilter": {
58 | "direction": "TOP",
59 | "numTimeSeries": ${OUTLIER_COUNT},
60 | "rankingMethod": "METHOD_MEAN"
61 | },
62 | "secondaryAggregation": {
63 | "alignmentPeriod": "60s",
64 | "crossSeriesReducer": "REDUCE_MEAN",
65 | "groupByFields": [
66 | "resource.label.\"cluster_name\"",
67 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
68 | "resource.label.\"pod_name\""
69 | ],
70 | "perSeriesAligner": "ALIGN_NONE"
71 | }
72 | }
73 | }
74 | }
75 | ],
76 | "thresholds": [],
77 | "timeshiftDuration": "0s",
78 | "yAxis": {
79 | "label": "",
80 | "scale": "LINEAR"
81 | }
82 | }
83 | },
84 | "width": 6,
85 | "yPos": 20
86 | },
87 | {
88 | "height": 4,
89 | "widget": {
90 | "title": "Accelerator Memory Used Outliers [MAX]",
91 | "xyChart": {
92 | "chartOptions": {
93 | "mode": "COLOR"
94 | },
95 | "dataSets": [
96 | {
97 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
98 | "minAlignmentPeriod": "60s",
99 | "plotType": "LINE",
100 | "targetAxis": "Y1",
101 | "timeSeriesQuery": {
102 | "timeSeriesFilter": {
103 | "aggregation": {
104 | "alignmentPeriod": "60s",
105 | "perSeriesAligner": "ALIGN_MEAN"
106 | },
107 | "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
108 | "pickTimeSeriesFilter": {
109 | "direction": "TOP",
110 | "numTimeSeries": ${OUTLIER_COUNT},
111 | "rankingMethod": "METHOD_MAX"
112 | },
113 | "secondaryAggregation": {
114 | "alignmentPeriod": "60s",
115 | "crossSeriesReducer": "REDUCE_MAX",
116 | "groupByFields": [
117 | "resource.label.\"cluster_name\"",
118 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
119 | "resource.label.\"pod_name\""
120 | ],
121 | "perSeriesAligner": "ALIGN_NONE"
122 | }
123 | }
124 | }
125 | }
126 | ],
127 | "thresholds": [],
128 | "timeshiftDuration": "0s",
129 | "yAxis": {
130 | "label": "",
131 | "scale": "LINEAR"
132 | }
133 | }
134 | },
135 | "width": 6,
136 | "xPos": 6,
137 | "yPos": 20
138 | },
139 | {
140 | "height": 8,
141 | "widget": {
142 | "collapsibleGroup": {
143 | "collapsed": false
144 | },
145 | "title": "Accelerator Memory Used by TPU Slice"
146 | },
147 | "width": 12,
148 | "yPos": 16
149 | }
150 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/collectives-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Collectives Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "COLOR"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "HEATMAP",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_SUM",
19 | "perSeriesAligner": "ALIGN_DELTA"
20 | },
21 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 57
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "Collectives Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "resource.label.\"cluster_name\"",
58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
59 | "resource.label.\"pod_name\""
60 | ],
61 | "perSeriesAligner": "ALIGN_PERCENTILE_50"
62 | },
63 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
64 | "pickTimeSeriesFilter": {
65 | "direction": "TOP",
66 | "numTimeSeries": ${OUTLIER_COUNT},
67 | "rankingMethod": "METHOD_MAX"
68 | }
69 | }
70 | }
71 | }
72 | ],
73 | "thresholds": [],
74 | "timeshiftDuration": "0s",
75 | "yAxis": {
76 | "label": "",
77 | "scale": "LINEAR"
78 | }
79 | }
80 | },
81 | "width": 6,
82 | "yPos": 61
83 | },
84 | {
85 | "height": 4,
86 | "widget": {
87 | "title": "Collectives Latency Outliers [p99]",
88 | "xyChart": {
89 | "chartOptions": {
90 | "mode": "COLOR"
91 | },
92 | "dataSets": [
93 | {
94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
95 | "minAlignmentPeriod": "60s",
96 | "plotType": "LINE",
97 | "targetAxis": "Y1",
98 | "timeSeriesQuery": {
99 | "timeSeriesFilter": {
100 | "aggregation": {
101 | "alignmentPeriod": "60s",
102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 | "groupByFields": [
104 | "resource.label.\"cluster_name\"",
105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 | "resource.label.\"pod_name\""
107 | ],
108 | "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 | },
110 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 | "pickTimeSeriesFilter": {
112 | "direction": "TOP",
113 | "numTimeSeries": ${OUTLIER_COUNT},
114 | "rankingMethod": "METHOD_MAX"
115 | }
116 | }
117 | }
118 | }
119 | ],
120 | "thresholds": [],
121 | "timeshiftDuration": "0s",
122 | "yAxis": {
123 | "label": "",
124 | "scale": "LINEAR"
125 | }
126 | }
127 | },
128 | "width": 6,
129 | "xPos": 6,
130 | "yPos": 61
131 | },
132 | {
133 | "height": 8,
134 | "widget": {
135 | "collapsibleGroup": {
136 | "collapsed": false
137 | },
138 | "title": "Collectives Latency"
139 | },
140 | "width": 12,
141 | "yPos": 57
142 | }
143 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/cpu-utilization.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "CPU Utilization Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_RATE"
19 | },
20 | "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12
34 | },
35 | {
36 | "height": 4,
37 | "widget": {
38 | "title": "CPU Utilization Outliers [MEAN]",
39 | "xyChart": {
40 | "chartOptions": {
41 | "mode": "COLOR"
42 | },
43 | "dataSets": [
44 | {
45 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
46 | "minAlignmentPeriod": "60s",
47 | "plotType": "LINE",
48 | "targetAxis": "Y1",
49 | "timeSeriesQuery": {
50 | "timeSeriesFilter": {
51 | "aggregation": {
52 | "alignmentPeriod": "60s",
53 | "perSeriesAligner": "ALIGN_RATE"
54 | },
55 | "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
56 | "pickTimeSeriesFilter": {
57 | "direction": "TOP",
58 | "numTimeSeries": ${OUTLIER_COUNT},
59 | "rankingMethod": "METHOD_MEAN"
60 | },
61 | "secondaryAggregation": {
62 | "alignmentPeriod": "60s",
63 | "crossSeriesReducer": "REDUCE_MEAN",
64 | "groupByFields": [
65 | "resource.label.\"cluster_name\"",
66 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
67 | "resource.label.\"pod_name\""
68 | ],
69 | "perSeriesAligner": "ALIGN_NONE"
70 | }
71 | }
72 | }
73 | }
74 | ],
75 | "thresholds": [],
76 | "timeshiftDuration": "0s",
77 | "yAxis": {
78 | "label": "",
79 | "scale": "LINEAR"
80 | }
81 | }
82 | },
83 | "width": 6,
84 | "yPos": 4
85 | },
86 | {
87 | "height": 4,
88 | "widget": {
89 | "title": "CPU Utilization Outliers [MAX]",
90 | "xyChart": {
91 | "chartOptions": {
92 | "mode": "COLOR"
93 | },
94 | "dataSets": [
95 | {
96 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
97 | "minAlignmentPeriod": "60s",
98 | "plotType": "LINE",
99 | "targetAxis": "Y1",
100 | "timeSeriesQuery": {
101 | "timeSeriesFilter": {
102 | "aggregation": {
103 | "alignmentPeriod": "60s",
104 | "perSeriesAligner": "ALIGN_RATE"
105 | },
106 | "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
107 | "pickTimeSeriesFilter": {
108 | "direction": "TOP",
109 | "numTimeSeries": ${OUTLIER_COUNT},
110 | "rankingMethod": "METHOD_MAX"
111 | },
112 | "secondaryAggregation": {
113 | "alignmentPeriod": "60s",
114 | "crossSeriesReducer": "REDUCE_MAX",
115 | "groupByFields": [
116 | "resource.label.\"cluster_name\"",
117 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
118 | "resource.label.\"pod_name\""
119 | ],
120 | "perSeriesAligner": "ALIGN_NONE"
121 | }
122 | }
123 | }
124 | }
125 | ],
126 | "thresholds": [],
127 | "timeshiftDuration": "0s",
128 | "yAxis": {
129 | "label": "",
130 | "scale": "LINEAR"
131 | }
132 | }
133 | },
134 | "width": 6,
135 | "xPos": 6,
136 | "yPos": 4
137 | },
138 | {
139 | "height": 8,
140 | "widget": {
141 | "collapsibleGroup": {
142 | "collapsed": false
143 | },
144 | "title": "CPU Utilization by TPU Slice"
145 | },
146 | "width": 12
147 | }
148 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/dcn-transfer-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "DCN Transfer Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "COLOR"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "HEATMAP",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_SUM",
19 | "perSeriesAligner": "ALIGN_DELTA"
20 | },
21 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 49
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "DCN Transfer Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "resource.label.\"cluster_name\"",
58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
59 | "resource.label.\"pod_name\""
60 | ],
61 | "perSeriesAligner": "ALIGN_PERCENTILE_50"
62 | },
63 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
64 | "pickTimeSeriesFilter": {
65 | "direction": "TOP",
66 | "numTimeSeries": ${OUTLIER_COUNT},
67 | "rankingMethod": "METHOD_MAX"
68 | }
69 | }
70 | }
71 | }
72 | ],
73 | "thresholds": [],
74 | "timeshiftDuration": "0s",
75 | "yAxis": {
76 | "label": "",
77 | "scale": "LINEAR"
78 | }
79 | }
80 | },
81 | "width": 6,
82 | "yPos": 53
83 | },
84 | {
85 | "height": 4,
86 | "widget": {
87 | "title": "DCN Transfer Latency Outliers [p99]",
88 | "xyChart": {
89 | "chartOptions": {
90 | "mode": "COLOR"
91 | },
92 | "dataSets": [
93 | {
94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
95 | "minAlignmentPeriod": "60s",
96 | "plotType": "LINE",
97 | "targetAxis": "Y1",
98 | "timeSeriesQuery": {
99 | "timeSeriesFilter": {
100 | "aggregation": {
101 | "alignmentPeriod": "60s",
102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 | "groupByFields": [
104 | "resource.label.\"cluster_name\"",
105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 | "resource.label.\"pod_name\""
107 | ],
108 | "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 | },
110 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 | "pickTimeSeriesFilter": {
112 | "direction": "TOP",
113 | "numTimeSeries": ${OUTLIER_COUNT},
114 | "rankingMethod": "METHOD_MAX"
115 | }
116 | }
117 | }
118 | }
119 | ],
120 | "thresholds": [],
121 | "timeshiftDuration": "0s",
122 | "yAxis": {
123 | "label": "",
124 | "scale": "LINEAR"
125 | }
126 | }
127 | },
128 | "width": 6,
129 | "xPos": 6,
130 | "yPos": 53
131 | },
132 | {
133 | "height": 8,
134 | "widget": {
135 | "collapsibleGroup": {
136 | "collapsed": false
137 | },
138 | "title": "DCN Transfer Latency"
139 | },
140 | "width": 12,
141 | "yPos": 49
142 | }
143 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/device-to-host-transfer-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Device To Host Transfer Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "COLOR"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "HEATMAP",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_SUM",
19 | "perSeriesAligner": "ALIGN_DELTA"
20 | },
21 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 73
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "Device To Host Transfer Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "resource.label.\"cluster_name\"",
58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
59 | "resource.label.\"pod_name\""
60 | ],
61 | "perSeriesAligner": "ALIGN_PERCENTILE_50"
62 | },
63 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
64 | "pickTimeSeriesFilter": {
65 | "direction": "TOP",
66 | "numTimeSeries": ${OUTLIER_COUNT},
67 | "rankingMethod": "METHOD_MAX"
68 | }
69 | }
70 | }
71 | }
72 | ],
73 | "thresholds": [],
74 | "timeshiftDuration": "0s",
75 | "yAxis": {
76 | "label": "",
77 | "scale": "LINEAR"
78 | }
79 | }
80 | },
81 | "width": 6,
82 | "yPos": 77
83 | },
84 | {
85 | "height": 4,
86 | "widget": {
87 | "title": "Device To Host Transfer Latency Outliers [p99]",
88 | "xyChart": {
89 | "chartOptions": {
90 | "mode": "COLOR"
91 | },
92 | "dataSets": [
93 | {
94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
95 | "minAlignmentPeriod": "60s",
96 | "plotType": "LINE",
97 | "targetAxis": "Y1",
98 | "timeSeriesQuery": {
99 | "timeSeriesFilter": {
100 | "aggregation": {
101 | "alignmentPeriod": "60s",
102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 | "groupByFields": [
104 | "resource.label.\"cluster_name\"",
105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 | "resource.label.\"pod_name\""
107 | ],
108 | "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 | },
110 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 | "pickTimeSeriesFilter": {
112 | "direction": "TOP",
113 | "numTimeSeries": ${OUTLIER_COUNT},
114 | "rankingMethod": "METHOD_MAX"
115 | }
116 | }
117 | }
118 | }
119 | ],
120 | "thresholds": [],
121 | "timeshiftDuration": "0s",
122 | "yAxis": {
123 | "label": "",
124 | "scale": "LINEAR"
125 | }
126 | }
127 | },
128 | "width": 6,
129 | "xPos": 6,
130 | "yPos": 77
131 | },
132 | {
133 | "height": 8,
134 | "widget": {
135 | "collapsibleGroup": {
136 | "collapsed": false
137 | },
138 | "title": "Device To Host Transfer Latency"
139 | },
140 | "width": 12,
141 | "yPos": 73
142 | }
143 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/duty-cycle.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Duty Cycle Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_MEAN"
19 | },
20 | "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 24
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "Duty Cycle Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "perSeriesAligner": "ALIGN_MEAN"
55 | },
56 | "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
57 | "pickTimeSeriesFilter": {
58 | "direction": "TOP",
59 | "numTimeSeries": ${OUTLIER_COUNT},
60 | "rankingMethod": "METHOD_MEAN"
61 | },
62 | "secondaryAggregation": {
63 | "alignmentPeriod": "60s",
64 | "crossSeriesReducer": "REDUCE_MEAN",
65 | "groupByFields": [
66 | "resource.label.\"cluster_name\"",
67 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
68 | "resource.label.\"pod_name\""
69 | ],
70 | "perSeriesAligner": "ALIGN_NONE"
71 | }
72 | }
73 | }
74 | }
75 | ],
76 | "thresholds": [],
77 | "timeshiftDuration": "0s",
78 | "yAxis": {
79 | "label": "",
80 | "scale": "LINEAR"
81 | }
82 | }
83 | },
84 | "width": 6,
85 | "yPos": 28
86 | },
87 | {
88 | "height": 4,
89 | "widget": {
90 | "title": "Duty Cycle Outliers [MAX]",
91 | "xyChart": {
92 | "chartOptions": {
93 | "mode": "COLOR"
94 | },
95 | "dataSets": [
96 | {
97 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
98 | "minAlignmentPeriod": "60s",
99 | "plotType": "LINE",
100 | "targetAxis": "Y1",
101 | "timeSeriesQuery": {
102 | "timeSeriesFilter": {
103 | "aggregation": {
104 | "alignmentPeriod": "60s",
105 | "perSeriesAligner": "ALIGN_MEAN"
106 | },
107 | "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
108 | "pickTimeSeriesFilter": {
109 | "direction": "TOP",
110 | "numTimeSeries": ${OUTLIER_COUNT},
111 | "rankingMethod": "METHOD_MAX"
112 | },
113 | "secondaryAggregation": {
114 | "alignmentPeriod": "60s",
115 | "crossSeriesReducer": "REDUCE_MAX",
116 | "groupByFields": [
117 | "resource.label.\"cluster_name\"",
118 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
119 | "resource.label.\"pod_name\""
120 | ],
121 | "perSeriesAligner": "ALIGN_NONE"
122 | }
123 | }
124 | }
125 | }
126 | ],
127 | "thresholds": [],
128 | "timeshiftDuration": "0s",
129 | "yAxis": {
130 | "label": "",
131 | "scale": "LINEAR"
132 | }
133 | }
134 | },
135 | "width": 6,
136 | "xPos": 6,
137 | "yPos": 28
138 | },
139 | {
140 | "height": 8,
141 | "widget": {
142 | "collapsibleGroup": {
143 | "collapsed": false
144 | },
145 | "title": "Duty Cycle by TPU Slice"
146 | },
147 | "width": 12,
148 | "yPos": 24
149 | }
150 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/host-to-device-transfer-latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Host To Device Transfer Latency Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "COLOR"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "HEATMAP",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "crossSeriesReducer": "REDUCE_SUM",
19 | "perSeriesAligner": "ALIGN_DELTA"
20 | },
21 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
22 | }
23 | }
24 | }
25 | ],
26 | "thresholds": [],
27 | "timeshiftDuration": "0s",
28 | "yAxis": {
29 | "label": "",
30 | "scale": "LINEAR"
31 | }
32 | }
33 | },
34 | "width": 12,
35 | "yPos": 65
36 | },
37 | {
38 | "height": 4,
39 | "widget": {
40 | "title": "Host To Device Transfer Latency Outliers [p50]",
41 | "xyChart": {
42 | "chartOptions": {
43 | "mode": "COLOR"
44 | },
45 | "dataSets": [
46 | {
47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
48 | "minAlignmentPeriod": "60s",
49 | "plotType": "LINE",
50 | "targetAxis": "Y1",
51 | "timeSeriesQuery": {
52 | "timeSeriesFilter": {
53 | "aggregation": {
54 | "alignmentPeriod": "60s",
55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50",
56 | "groupByFields": [
57 | "resource.label.\"cluster_name\"",
58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
59 | "resource.label.\"pod_name\""
60 | ],
61 | "perSeriesAligner": "ALIGN_PERCENTILE_50"
62 | },
63 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
64 | "pickTimeSeriesFilter": {
65 | "direction": "TOP",
66 | "numTimeSeries": ${OUTLIER_COUNT},
67 | "rankingMethod": "METHOD_MAX"
68 | }
69 | }
70 | }
71 | }
72 | ],
73 | "thresholds": [],
74 | "timeshiftDuration": "0s",
75 | "yAxis": {
76 | "label": "",
77 | "scale": "LINEAR"
78 | }
79 | }
80 | },
81 | "width": 6,
82 | "yPos": 69
83 | },
84 | {
85 | "height": 4,
86 | "widget": {
87 | "title": "Host To Device Transfer Latency Outliers [p99]",
88 | "xyChart": {
89 | "chartOptions": {
90 | "mode": "COLOR"
91 | },
92 | "dataSets": [
93 | {
94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
95 | "minAlignmentPeriod": "60s",
96 | "plotType": "LINE",
97 | "targetAxis": "Y1",
98 | "timeSeriesQuery": {
99 | "timeSeriesFilter": {
100 | "aggregation": {
101 | "alignmentPeriod": "60s",
102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99",
103 | "groupByFields": [
104 | "resource.label.\"cluster_name\"",
105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
106 | "resource.label.\"pod_name\""
107 | ],
108 | "perSeriesAligner": "ALIGN_PERCENTILE_99"
109 | },
110 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
111 | "pickTimeSeriesFilter": {
112 | "direction": "TOP",
113 | "numTimeSeries": ${OUTLIER_COUNT},
114 | "rankingMethod": "METHOD_MAX"
115 | }
116 | }
117 | }
118 | }
119 | ],
120 | "thresholds": [],
121 | "timeshiftDuration": "0s",
122 | "yAxis": {
123 | "label": "",
124 | "scale": "LINEAR"
125 | }
126 | }
127 | },
128 | "width": 6,
129 | "xPos": 6,
130 | "yPos": 69
131 | },
132 | {
133 | "height": 8,
134 | "widget": {
135 | "collapsibleGroup": {
136 | "collapsed": false
137 | },
138 | "title": "Host To Device Transfer Latency"
139 | },
140 | "width": 12,
141 | "yPos": 65
142 | }
143 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/main.json:
--------------------------------------------------------------------------------
1 | {
2 | "category": "CUSTOM",
3 | "displayName": "GKE - TPU Monitoring Dashboard",
4 | "dashboardFilters": [
5 | {
6 | "filterType": "RESOURCE_LABEL",
7 | "labelKey": "cluster_name",
8 | "templateVariable": "ClusterName"
9 | },
10 | {
11 | "filterType": "USER_METADATA_LABEL",
12 | "labelKey": "jobset.sigs.k8s.io/jobset-name",
13 | "templateVariable": "JobName"
14 | },
15 | {
16 | "filterType": "RESOURCE_LABEL",
17 | "labelKey": "pod_name",
18 | "templateVariable": "PodName"
19 | }
20 | ],
21 | "mosaicLayout": {
22 | "columns": 12,
23 | "tiles": [
24 | ${TILE_1},
25 | ${TILE_2},
26 | ${TILE_3},
27 | ${TILE_4},
28 | ${TILE_5},
29 | {
30 | "height": 1,
31 | "widget": {
32 | "title": "Megascale Metrics",
33 | "sectionHeader": {
34 | "subtitle": "These metrics are available in GKE version 1.29.1-gke.1016000 or later. TPU workload must use JAX version 0.4.24.",
35 | "dividerBelow": false
36 | }
37 | },
38 | "width": 12,
39 | "yPos": 48
40 | },
41 | ${TILE_6},
42 | ${TILE_7},
43 | ${TILE_8},
44 | ${TILE_9}
45 | ]
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/memory-usage.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Memory Usage Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_SUM"
19 | },
20 | "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 8
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "Memory Usage Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "crossSeriesReducer": "REDUCE_MEAN",
55 | "groupByFields": [
56 | "resource.label.\"cluster_name\"",
57 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
58 | "resource.label.\"pod_name\""
59 | ],
60 | "perSeriesAligner": "ALIGN_MEAN"
61 | },
62 | "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
63 | "pickTimeSeriesFilter": {
64 | "direction": "TOP",
65 | "numTimeSeries": ${OUTLIER_COUNT},
66 | "rankingMethod": "METHOD_MEAN"
67 | }
68 | }
69 | }
70 | }
71 | ],
72 | "thresholds": [],
73 | "timeshiftDuration": "0s",
74 | "yAxis": {
75 | "label": "",
76 | "scale": "LINEAR"
77 | }
78 | }
79 | },
80 | "width": 6,
81 | "yPos": 12
82 | },
83 | {
84 | "height": 4,
85 | "widget": {
86 | "title": "Memory Usage Outliers [MAX]",
87 | "xyChart": {
88 | "chartOptions": {
89 | "mode": "COLOR"
90 | },
91 | "dataSets": [
92 | {
93 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
94 | "minAlignmentPeriod": "60s",
95 | "plotType": "LINE",
96 | "targetAxis": "Y1",
97 | "timeSeriesQuery": {
98 | "timeSeriesFilter": {
99 | "aggregation": {
100 | "alignmentPeriod": "60s",
101 | "crossSeriesReducer": "REDUCE_MAX",
102 | "groupByFields": [
103 | "resource.label.\"cluster_name\"",
104 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
105 | "resource.label.\"pod_name\""
106 | ],
107 | "perSeriesAligner": "ALIGN_MAX"
108 | },
109 | "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}",
110 | "pickTimeSeriesFilter": {
111 | "direction": "TOP",
112 | "numTimeSeries": ${OUTLIER_COUNT},
113 | "rankingMethod": "METHOD_MAX"
114 | }
115 | }
116 | }
117 | }
118 | ],
119 | "thresholds": [],
120 | "timeshiftDuration": "0s",
121 | "yAxis": {
122 | "label": "",
123 | "scale": "LINEAR"
124 | }
125 | }
126 | },
127 | "width": 6,
128 | "xPos": 6,
129 | "yPos": 12
130 | },
131 | {
132 | "height": 8,
133 | "widget": {
134 | "collapsibleGroup": {
135 | "collapsed": false
136 | },
137 | "title": "Memory Usage by TPU Slice"
138 | },
139 | "width": 12,
140 | "yPos": 8
141 | }
142 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/network-bytes.json:
--------------------------------------------------------------------------------
1 | {
2 | "height": 4,
3 | "widget": {
4 | "title": "Network Bytes Sent Stats",
5 | "xyChart": {
6 | "chartOptions": {
7 | "mode": "STATS"
8 | },
9 | "dataSets": [
10 | {
11 | "minAlignmentPeriod": "60s",
12 | "plotType": "LINE",
13 | "targetAxis": "Y1",
14 | "timeSeriesQuery": {
15 | "timeSeriesFilter": {
16 | "aggregation": {
17 | "alignmentPeriod": "60s",
18 | "perSeriesAligner": "ALIGN_RATE"
19 | },
20 | "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}"
21 | }
22 | }
23 | }
24 | ],
25 | "thresholds": [],
26 | "timeshiftDuration": "0s",
27 | "yAxis": {
28 | "label": "",
29 | "scale": "LINEAR"
30 | }
31 | }
32 | },
33 | "width": 12,
34 | "yPos": 32
35 | },
36 | {
37 | "height": 4,
38 | "widget": {
39 | "title": "Network Bytes Sent Outliers [MEAN]",
40 | "xyChart": {
41 | "chartOptions": {
42 | "mode": "COLOR"
43 | },
44 | "dataSets": [
45 | {
46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
47 | "minAlignmentPeriod": "60s",
48 | "plotType": "LINE",
49 | "targetAxis": "Y1",
50 | "timeSeriesQuery": {
51 | "timeSeriesFilter": {
52 | "aggregation": {
53 | "alignmentPeriod": "60s",
54 | "perSeriesAligner": "ALIGN_RATE"
55 | },
56 | "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
57 | "pickTimeSeriesFilter": {
58 | "direction": "TOP",
59 | "numTimeSeries": ${OUTLIER_COUNT},
60 | "rankingMethod": "METHOD_MEAN"
61 | },
62 | "secondaryAggregation": {
63 | "alignmentPeriod": "60s",
64 | "crossSeriesReducer": "REDUCE_MEAN",
65 | "groupByFields": [
66 | "resource.label.\"cluster_name\"",
67 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
68 | "resource.label.\"pod_name\""
69 | ],
70 | "perSeriesAligner": "ALIGN_MEAN"
71 | }
72 | }
73 | }
74 | }
75 | ],
76 | "thresholds": [],
77 | "timeshiftDuration": "0s",
78 | "yAxis": {
79 | "label": "",
80 | "scale": "LINEAR"
81 | }
82 | }
83 | },
84 | "width": 6,
85 | "yPos": 36
86 | },
87 | {
88 | "height": 4,
89 | "widget": {
90 | "title": "Network Bytes Sent Outliers [MAX]",
91 | "xyChart": {
92 | "chartOptions": {
93 | "mode": "COLOR"
94 | },
95 | "dataSets": [
96 | {
97 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
98 | "minAlignmentPeriod": "60s",
99 | "plotType": "LINE",
100 | "targetAxis": "Y1",
101 | "timeSeriesQuery": {
102 | "timeSeriesFilter": {
103 | "aggregation": {
104 | "alignmentPeriod": "60s",
105 | "perSeriesAligner": "ALIGN_RATE"
106 | },
107 | "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
108 | "pickTimeSeriesFilter": {
109 | "direction": "TOP",
110 | "numTimeSeries": ${OUTLIER_COUNT},
111 | "rankingMethod": "METHOD_MAX"
112 | },
113 | "secondaryAggregation": {
114 | "alignmentPeriod": "60s",
115 | "crossSeriesReducer": "REDUCE_MAX",
116 | "groupByFields": [
117 | "resource.label.\"cluster_name\"",
118 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
119 | "resource.label.\"pod_name\""
120 | ],
121 | "perSeriesAligner": "ALIGN_MAX"
122 | }
123 | }
124 | }
125 | }
126 | ],
127 | "thresholds": [],
128 | "timeshiftDuration": "0s",
129 | "yAxis": {
130 | "label": "",
131 | "scale": "LINEAR"
132 | }
133 | }
134 | },
135 | "width": 6,
136 | "xPos": 6,
137 | "yPos": 36
138 | },
139 | {
140 | "height": 4,
141 | "widget": {
142 | "title": "Network Bytes Received Stats",
143 | "xyChart": {
144 | "chartOptions": {
145 | "mode": "STATS"
146 | },
147 | "dataSets": [
148 | {
149 | "minAlignmentPeriod": "60s",
150 | "plotType": "LINE",
151 | "targetAxis": "Y1",
152 | "timeSeriesQuery": {
153 | "timeSeriesFilter": {
154 | "aggregation": {
155 | "alignmentPeriod": "60s",
156 | "perSeriesAligner": "ALIGN_RATE"
157 | },
158 | "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}"
159 | }
160 | }
161 | }
162 | ],
163 | "thresholds": [],
164 | "timeshiftDuration": "0s",
165 | "yAxis": {
166 | "label": "",
167 | "scale": "LINEAR"
168 | }
169 | }
170 | },
171 | "width": 12,
172 | "yPos": 40
173 | },
174 | {
175 | "height": 4,
176 | "widget": {
177 | "title": "Network Bytes Received Outliers [MEAN]",
178 | "xyChart": {
179 | "chartOptions": {
180 | "mode": "COLOR"
181 | },
182 | "dataSets": [
183 | {
184 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
185 | "minAlignmentPeriod": "60s",
186 | "plotType": "LINE",
187 | "targetAxis": "Y1",
188 | "timeSeriesQuery": {
189 | "timeSeriesFilter": {
190 | "aggregation": {
191 | "alignmentPeriod": "60s",
192 | "perSeriesAligner": "ALIGN_RATE"
193 | },
194 | "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
195 | "pickTimeSeriesFilter": {
196 | "direction": "TOP",
197 | "numTimeSeries": ${OUTLIER_COUNT},
198 | "rankingMethod": "METHOD_MEAN"
199 | },
200 | "secondaryAggregation": {
201 | "alignmentPeriod": "60s",
202 | "crossSeriesReducer": "REDUCE_MEAN",
203 | "groupByFields": [
204 | "resource.label.\"cluster_name\"",
205 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
206 | "resource.label.\"pod_name\""
207 | ],
208 | "perSeriesAligner": "ALIGN_MEAN"
209 | }
210 | }
211 | }
212 | }
213 | ],
214 | "thresholds": [],
215 | "timeshiftDuration": "0s",
216 | "yAxis": {
217 | "label": "",
218 | "scale": "LINEAR"
219 | }
220 | }
221 | },
222 | "width": 6,
223 | "yPos": 44
224 | },
225 | {
226 | "height": 4,
227 | "widget": {
228 | "title": "Network Bytes Received Outliers [MAX]",
229 | "xyChart": {
230 | "chartOptions": {
231 | "mode": "COLOR"
232 | },
233 | "dataSets": [
234 | {
235 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}",
236 | "minAlignmentPeriod": "60s",
237 | "plotType": "LINE",
238 | "targetAxis": "Y1",
239 | "timeSeriesQuery": {
240 | "timeSeriesFilter": {
241 | "aggregation": {
242 | "alignmentPeriod": "60s",
243 | "perSeriesAligner": "ALIGN_RATE"
244 | },
245 | "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}",
246 | "pickTimeSeriesFilter": {
247 | "direction": "TOP",
248 | "numTimeSeries": ${OUTLIER_COUNT},
249 | "rankingMethod": "METHOD_MAX"
250 | },
251 | "secondaryAggregation": {
252 | "alignmentPeriod": "60s",
253 | "crossSeriesReducer": "REDUCE_MAX",
254 | "groupByFields": [
255 | "resource.label.\"cluster_name\"",
256 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"",
257 | "resource.label.\"pod_name\""
258 | ],
259 | "perSeriesAligner": "ALIGN_MAX"
260 | }
261 | }
262 | }
263 | }
264 | ],
265 | "thresholds": [],
266 | "timeshiftDuration": "0s",
267 | "yAxis": {
268 | "label": "",
269 | "scale": "LINEAR"
270 | }
271 | }
272 | },
273 | "width": 6,
274 | "xPos": 6,
275 | "yPos": 44
276 | },
277 | {
278 | "height": 16,
279 | "widget": {
280 | "collapsibleGroup": {
281 | "collapsed": false
282 | },
283 | "title": "Network Bytes Sent and Received by TPU Slice"
284 | },
285 | "width": 12,
286 | "yPos": 32
287 | }
288 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/dashboard/monitoring_dashboard/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
20 | variable "monitoring_dashboard_config" {
21 | type = object({
22 | outlier_count : optional(number)
23 | })
24 | description = </default.tfstate
29 | prefix = "gke/dashboard/monitoring_dashboard"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/log_storage/input.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_name" {
16 | type = string
17 | description = "Name of gcp project"
18 | }
19 |
20 | // Valid inputs:
21 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":""}
22 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"", "retention_days":x}
23 | // 3. To not create stack trace bucket: {}
24 | variable "stack_trace_bucket_config" {
25 | type = object({
26 | bucket_name : optional(string)
27 | retention_days : optional(number)
28 | })
29 | validation {
30 | condition = (
31 | (var.stack_trace_bucket_config.bucket_name == null &&
32 | var.stack_trace_bucket_config.retention_days == null) ||
33 | (var.stack_trace_bucket_config.bucket_name != null)
34 | )
35 | error_message = "bucket_name is not defined for stack_trace_bucket_config."
36 | }
37 | description = </default.tfstate
29 | prefix = "gke/log_storage"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/gcp_resources/gke/resources/log_storage/stack-trace-bucket.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | stack_trace_filter = "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\""
17 | stack_trace_bucket_counter = var.stack_trace_bucket_config.bucket_name == null ? 0 : 1
18 | }
19 |
20 | resource "google_logging_project_bucket_config" "log_bucket" {
21 | count = local.stack_trace_bucket_counter
22 | project = var.project_name
23 | location = "global"
24 | // default retention period is 30 days
25 | retention_days = var.stack_trace_bucket_config.retention_days == null ? 30 : var.stack_trace_bucket_config.retention_days
26 | bucket_id = var.stack_trace_bucket_config.bucket_name
27 | }
28 |
29 | resource "google_logging_project_sink" "log_sink" {
30 | count = local.stack_trace_bucket_counter
31 | project = var.project_name
32 | name = "${var.stack_trace_bucket_config.bucket_name}_sink"
33 | destination = "logging.googleapis.com/projects/${var.project_name}/locations/global/buckets/${google_logging_project_bucket_config.log_bucket[count.index].bucket_id}"
34 | filter = local.stack_trace_filter
35 | }
36 |
--------------------------------------------------------------------------------
/pip_package/CHANGELOG.md:
--------------------------------------------------------------------------------
1 |
16 | # Changelog
17 |
18 |
36 |
37 | ## [0.1.5] - 2023-12-08
38 | * Raise exception without waiting for the daemon thread to terminate
39 | * Remove sending user signal in `stop_debugging()` to avoid unnecessary stack traces related to `cloud-tpu-diagnostics` package
40 |
41 | ## [0.1.4] - 2023-11-07
42 | * Gracefully exiting daemon threads
43 | * Fixed the URL for PyPI package in README
44 |
45 | ## [0.1.3] - 2023-11-01
46 | * Fixing issue with using signals and threads together in a program
47 |
48 | ## [0.1.2] - 2023-09-20
49 | * Improved stack trace readability and clarity by adding a message for more information
50 |
51 | ## [0.1.1] - 2023-06-21
52 | * Bug Fixes
53 | * Fixes dumping of stack traces on the console when exceptions like `AssertionError`, `tensorflow.python.framework.errors_impl.NotFoundError` are thrown when `collect_stack_trace=True` and `stack_trace_to_cloud=False`.
54 | * Updated README
55 |
56 | ## [0.1.0] - 2023-06-08
57 | * Initial release of cloud-tpu-diagnostics PyPI package
58 | * FEATURE: Contains debug module to collect stack traces on faults
59 |
60 | [0.1.5]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.4...v0.1.5
61 | [0.1.4]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.3...v0.1.4
62 | [0.1.3]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.2...v0.1.3
63 | [0.1.2]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.1...v0.1.2
64 | [0.1.1]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.0...v0.1.1
65 | [0.1.0]: https://github.com/google/cloud-tpu-monitoring-debugging/releases/tag/v0.1.0
66 |
--------------------------------------------------------------------------------
/pip_package/README.md:
--------------------------------------------------------------------------------
1 |
16 | # Cloud TPU Diagnostics
17 |
18 | This is a comprehensive library to monitor, debug, and profile jobs running on Cloud TPU.
19 | To learn about Cloud TPU, refer to the [full documentation](https://cloud.google.com/tpu/docs/intro-to-tpu).
20 |
21 | ## Features
22 | ### 1. Debugging
23 | #### 1.1 Collect Stack Traces
24 | This module dumps Python stack traces when a fault such as a segmentation fault, floating-point exception, or illegal operation occurs in the program. It also periodically collects stack traces to help debug cases where a program running on Cloud TPU is stuck or hung.
25 |
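At a high level, you wrap your workload in the package's `diagnose` context manager. The Usage section below explains each configuration object step by step; the following is only a minimal sketch, where `run_job(...)` is a placeholder for your own TPU workload:

```
# Minimal sketch (see the Usage section below for the full walkthrough).
from cloud_tpu_diagnostics import diagnostic
from cloud_tpu_diagnostics.configuration import debug_configuration
from cloud_tpu_diagnostics.configuration import diagnostic_configuration
from cloud_tpu_diagnostics.configuration import stack_trace_configuration

diagnostic_config = diagnostic_configuration.DiagnosticConfig(
    debug_config=debug_configuration.DebugConfig(
        stack_trace_config=stack_trace_configuration.StackTraceConfig(
            collect_stack_trace=True
        )
    )
)

with diagnostic.diagnose(diagnostic_config):
    run_job(...)  # placeholder for your workload
```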
26 | ## Installation
27 | To install the package, run the following command on the TPU VM:
28 |
29 | ```
30 | pip install cloud-tpu-diagnostics
31 | ```
32 |
33 | ## Usage
34 | To use this package, first import the required modules:
35 |
36 | ```
37 | from cloud_tpu_diagnostics import diagnostic
38 | from cloud_tpu_diagnostics.configuration import debug_configuration
39 | from cloud_tpu_diagnostics.configuration import diagnostic_configuration
40 | from cloud_tpu_diagnostics.configuration import stack_trace_configuration
41 | ```
42 |
43 | Then, create a configuration object for stack traces. The module only collects stack traces when the `collect_stack_trace` parameter is set to `True`. The following scenarios are currently supported:
44 |
45 | ##### Scenario 1: Do not collect stack traces on faults
46 |
47 | ```
48 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
49 | collect_stack_trace=False)
50 | ```
51 | With this configuration, no stack traces will be collected in the event of a fault or process hang.
52 |
53 | ##### Scenario 2: Collect stack traces on faults and display on console
54 |
55 | ```
56 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
57 | collect_stack_trace=True,
58 | stack_trace_to_cloud=False)
59 | ```
60 | If there is a fault or process hang, this configuration will show the stack traces on the console (stderr).
61 |
62 | ##### Scenario 3: Collect stack traces on faults and upload on cloud
63 |
64 | ```
65 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
66 | collect_stack_trace=True,
67 | stack_trace_to_cloud=True)
68 | ```
69 | If there is a fault or process hang, this configuration temporarily collects stack traces inside the `/tmp/debugging` directory on the TPU host. Additionally, the traces collected in TPU host memory are uploaded to Google Cloud Logging, which makes it easier to troubleshoot and fix problems. You can view the traces in [Logs Explorer](https://cloud.google.com/logging/docs/view/logs-explorer-interface) using the following query:
70 |
71 | ```
72 | logName="projects//logs/tpu.googleapis.com%2Fruntime_monitor"
73 | jsonPayload.verb="stacktraceanalyzer"
74 | ```
75 |
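If you prefer to fetch these entries programmatically instead of through the Logs Explorer UI, the same filter can be passed to the Cloud Logging client library. This is only a sketch and assumes the `google-cloud-logging` package is installed; `my-project` is a placeholder project ID:

```
# Sketch: read stack trace entries with the Cloud Logging client library.
# Assumes `pip install google-cloud-logging`; "my-project" is a placeholder.
from google.cloud import logging as cloud_logging

client = cloud_logging.Client(project="my-project")
log_filter = (
    'logName="projects/my-project/logs/tpu.googleapis.com%2Fruntime_monitor" '
    'AND jsonPayload.verb="stacktraceanalyzer"'
)
for entry in client.list_entries(filter_=log_filter, page_size=10):
    print(entry.timestamp, entry.payload)
```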
76 | By default, stack traces are collected every 10 minutes. To change the interval between two stack trace collection events, add the following configuration:
77 |
78 | ```
79 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
80 | collect_stack_trace=True,
81 | stack_trace_to_cloud=True,
82 | stack_trace_interval_seconds=300)
83 | ```
84 | This configuration will collect stack traces on cloud every 5 minutes.
85 |
86 | Then, create a configuration object for debugging.
87 |
88 | ```
89 | debug_config = debug_configuration.DebugConfig(
90 | stack_trace_config=stack_trace_config)
91 | ```
92 |
93 | Then, create a configuration object for diagnostics.
94 |
95 | ```
96 | diagnostic_config = diagnostic_configuration.DiagnosticConfig(
97 | debug_config=debug_config)
98 | ```
99 |
100 | Finally, call the `diagnose()` method in a `with` statement and wrap the code for which you want to collect stack traces inside the context manager.
101 |
102 | ```
103 | with diagnostic.diagnose(diagnostic_config):
104 | run_job(...)
105 | ```
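To quickly verify that stack trace collection is wired up, you can raise a signal inside the context manager, mirroring what the package's own `stack_trace_test_util.py` does, and confirm that a stack trace snapshot appears on the console. A minimal sketch:

```
# Verification sketch: SIGUSR1 triggers a stack trace dump to stderr because
# stack_trace_to_cloud=False writes traces to the console.
import signal

from cloud_tpu_diagnostics import diagnostic
from cloud_tpu_diagnostics.configuration import debug_configuration
from cloud_tpu_diagnostics.configuration import diagnostic_configuration
from cloud_tpu_diagnostics.configuration import stack_trace_configuration

config = diagnostic_configuration.DiagnosticConfig(
    debug_config=debug_configuration.DebugConfig(
        stack_trace_config=stack_trace_configuration.StackTraceConfig(
            collect_stack_trace=True,
            stack_trace_to_cloud=False,
            stack_trace_interval_seconds=1,
        )
    )
)

with diagnostic.diagnose(config):
    signal.raise_signal(signal.SIGUSR1)  # dumps the current stack trace
```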
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from cloud_tpu_diagnostics import configuration
16 | from cloud_tpu_diagnostics import diagnostic
17 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from cloud_tpu_diagnostics.src.config import debug_configuration
16 | from cloud_tpu_diagnostics.src.config import diagnostic_configuration
17 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
18 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/diagnostic.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from cloud_tpu_diagnostics.src.diagnose import diagnose
16 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/config/debug_configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import dataclasses
16 | from typing import Optional
17 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
18 |
19 |
20 | @dataclasses.dataclass
21 | class DebugConfig:
22 | """Configuration for debugging.
23 |
24 | Attributes:
25 | stack_trace_config: config object for stack trace collection, default is
26 | None
27 | """
28 |
29 | stack_trace_config: Optional[stack_trace_configuration.StackTraceConfig] = (
30 | None
31 | )
32 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/config/diagnostic_configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import dataclasses
16 | from typing import Optional
17 | from cloud_tpu_diagnostics.src.config import debug_configuration
18 |
19 |
20 | @dataclasses.dataclass
21 | class DiagnosticConfig:
22 | """Configuration for diagnostic.
23 |
24 | Attributes:
25 | debug_config: config object for debugging, default is None
26 | """
27 |
28 | debug_config: Optional[debug_configuration.DebugConfig] = None
29 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/config/stack_trace_configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import dataclasses
16 | from cloud_tpu_diagnostics.src.util import default
17 |
18 |
19 | @dataclasses.dataclass
20 | class StackTraceConfig:
21 | """Configuration for stack trace collection.
22 |
23 | Attributes:
24 | collect_stack_trace: enable/disable collection of stack trace in case fault
25 | occurs in the program. Default is False, which means stack trace will not
26 | be collected unless collect_stack_trace is set to True.
27 | stack_trace_to_cloud: enable/disable upload of stack trace to cloud. Default
28 |     is False, which means stack trace will be displayed on the terminal unless
29 | stack_trace_to_cloud is set to True.
30 |     stack_trace_interval_seconds: time interval in seconds between stack trace
31 |       collection events. Default is 600 seconds, that is, 10 minutes.
32 | """
33 |
34 | collect_stack_trace: bool = default.COLLECT_STACK_TRACE_DEFAULT
35 | stack_trace_to_cloud: bool = default.STACK_TRACE_TO_CLOUD_DEFAULT
36 | stack_trace_interval_seconds: int = default.STACK_TRACE_INTERVAL_SECONDS_DEFAULT
37 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/debug.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | import signal
17 | import threading
18 | import time
19 |
20 | from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping
21 | from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping
22 |
23 | # flag to signal daemon thread to exit gracefully
24 | _exit_flag = threading.Event()
25 | _exit_flag.clear()
26 | _daemon_thread = None
27 | logger = logging.getLogger(__name__)
28 |
29 |
30 | def start_debugging(debug_config):
31 | """Context manager to debug and identify errors."""
32 | global _daemon_thread
33 | _exit_flag.clear()
34 | if (
35 | debug_config.stack_trace_config is not None
36 | and debug_config.stack_trace_config.collect_stack_trace
37 | ):
38 | _daemon_thread = threading.Thread(
39 | target=send_user_signal,
40 | daemon=True,
41 | args=(debug_config.stack_trace_config.stack_trace_interval_seconds,),
42 | )
43 | _daemon_thread.start() # start a daemon thread
44 | enable_stack_trace_dumping(debug_config.stack_trace_config)
45 |
46 |
47 | def stop_debugging(debug_config):
48 | """Context manager to debug and identify errors."""
49 | if (
50 | debug_config.stack_trace_config is not None
51 | and debug_config.stack_trace_config.collect_stack_trace
52 | ):
53 | _exit_flag.set()
54 | # wait for daemon thread to complete
55 | if _daemon_thread is not None:
56 | logger.info(
57 | "Waiting for completion of stack trace collection daemon thread."
58 | )
59 | _daemon_thread.join()
60 | logger.info("Stack trace collection daemon thread completed.")
61 | disable_stack_trace_dumping(debug_config.stack_trace_config)
62 | _exit_flag.clear()
63 |
64 |
65 | def send_user_signal(stack_trace_interval_seconds):
66 | """Send SIGUSR1 signal to main thread after every stack_trace_interval_seconds seconds."""
67 | while not _exit_flag.is_set():
68 | time.sleep(stack_trace_interval_seconds)
69 | if not _exit_flag.is_set():
70 | signal.pthread_kill(threading.main_thread().ident, signal.SIGUSR1)
71 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/diagnose.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import contextlib
16 |
17 | from cloud_tpu_diagnostics.src.debug import start_debugging
18 | from cloud_tpu_diagnostics.src.debug import stop_debugging
19 |
20 |
21 | @contextlib.contextmanager
22 | def diagnose(config):
23 | """Context manager to debug and identify errors."""
24 | if config is not None and config.debug_config is not None:
25 | start_debugging(config.debug_config)
26 | try:
27 | yield
28 | if config is not None and config.debug_config is not None:
29 | stop_debugging(config.debug_config)
30 | except Exception as e:
31 | raise e
32 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/stack_trace.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import faulthandler
16 | import logging
17 | import os
18 | import signal
19 | import sys
20 | import time
21 |
22 | from cloud_tpu_diagnostics.src.util import default
23 |
24 | _stack_trace_file_obj = None
25 | logger = logging.getLogger(__name__)
26 |
27 |
28 | def user_signal_handler_wrapper(file_descriptor, interval):
29 | def user_signal_handler(unused_signum, unused_frame):
30 | message = (
31 | "INFO: Not a crash. cloud-tpu-diagnostics emits a"
32 | f" stack trace snapshot every {interval} seconds.\n"
33 | )
34 | if file_descriptor is not sys.stderr:
35 | message = message.encode()
36 | file_descriptor.write(message)
37 | faulthandler.dump_traceback(file_descriptor, all_threads=False)
38 |
39 | return user_signal_handler
40 |
41 |
42 | def enable_stack_trace_dumping(stack_trace_config):
43 | """Enables stack trace dumping.
44 |
45 |   Enables faulthandler and registers handlers for SIGSEGV, SIGFPE, SIGABRT,
46 |   SIGBUS, SIGILL and SIGUSR1 to collect stack traces.
47 |
48 | Args:
49 | stack_trace_config: configuration object for stack trace collection
50 | """
51 | try:
52 | global _stack_trace_file_obj
53 | if stack_trace_config.stack_trace_to_cloud:
54 | stack_trace_file = _get_stack_trace_file()
55 | _stack_trace_file_obj = open(stack_trace_file, "wb")
56 | logger.info("Stack trace will be written in: %s", stack_trace_file)
57 | else:
58 | _stack_trace_file_obj = sys.stderr
59 | logger.info("Stack trace will be written to the console.")
60 |
61 | # Enables faulthandler for SIGSEGV, SIGFPE, SIGABRT, SIGBUS and SIGILL
62 | faulthandler.enable(file=_stack_trace_file_obj, all_threads=False)
63 |
64 | # Register SIGUSR1 signal to faulthandler
65 | faulthandler.register(
66 | signal.SIGUSR1, all_threads=False, file=_stack_trace_file_obj
67 | )
68 |
69 | # Register handler for SIGUSR1 to dump traces
70 | signal.signal(
71 | signal.SIGUSR1,
72 | user_signal_handler_wrapper(
73 | _stack_trace_file_obj,
74 | stack_trace_config.stack_trace_interval_seconds,
75 | ),
76 | )
77 | except Exception as e: # pylint: disable=broad-exception-caught
78 | logger.error("Error in enabling dumping of stack trace.", e)
79 |
80 |
81 | def disable_stack_trace_dumping(stack_trace_config):
82 | """Disable faulthandler and unregister user signals.
83 |
84 | Args:
85 | stack_trace_config: configuration object for stack trace collection
86 | """
87 | try:
88 | global _stack_trace_file_obj
89 | if (
90 | stack_trace_config.stack_trace_to_cloud
91 | and _stack_trace_file_obj is not None
92 | ):
93 | _stack_trace_file_obj.close()
94 | _stack_trace_file_obj = None
95 |
96 | faulthandler.unregister(signal.SIGUSR1)
97 | faulthandler.disable()
98 | except Exception as e: # pylint: disable=broad-exception-caught
99 | logger.error("Error in disabling dumping of stack trace.", e)
100 |
101 |
102 | def _get_stack_trace_file():
103 | """Prefix stack trace file.
104 |
105 | Create a file with prefix as stack_trace_ and current local time in
106 | '%Y_%m_%d_%H_%M_%S' format inside default.STACK_TRACE_DIR_DEFAULT.
107 |
108 | Returns:
109 | path of stack trace file
110 | """
111 | root_trace_folder = os.path.abspath(default.STACK_TRACE_DIR_DEFAULT)
112 | if not os.path.exists(root_trace_folder):
113 | os.makedirs(root_trace_folder)
114 |
115 | current_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
116 | trace_file_name = "stack_trace_" + current_time + ".txt"
117 | stack_trace_file = os.path.join(root_trace_folder, trace_file_name)
118 | return stack_trace_file
119 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/util/default.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Stack trace default values
16 | COLLECT_STACK_TRACE_DEFAULT = False
17 | STACK_TRACE_TO_CLOUD_DEFAULT = False
18 | STACK_TRACE_DIR_DEFAULT = '/tmp/debugging/'
19 | STACK_TRACE_INTERVAL_SECONDS_DEFAULT = 600 # 10 minutes
20 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/src/util/stack_trace_test_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Script to raise different signals to test dumping of stack trace."""
16 |
17 | import argparse
18 | import signal
19 |
20 | from cloud_tpu_diagnostics import diagnostic
21 | from cloud_tpu_diagnostics.configuration import debug_configuration
22 | from cloud_tpu_diagnostics.configuration import diagnostic_configuration
23 | from cloud_tpu_diagnostics.configuration import stack_trace_configuration
24 |
25 |
26 | if __name__ == '__main__':
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument('--signal', help='name of signal to raise')
29 | parser.add_argument(
30 | '--collect_stack_trace',
31 | type=lambda x: (x.lower() == 'true'),
32 | help='whether to collect stack trace or not',
33 | )
34 | parser.add_argument(
35 | '--log_to_cloud',
36 | type=lambda x: (x.lower() == 'true'),
37 | help='whether to log to cloud or console',
38 | )
39 | args = parser.parse_args()
40 | debug_config = debug_configuration.DebugConfig(
41 | stack_trace_config=stack_trace_configuration.StackTraceConfig(
42 | collect_stack_trace=args.collect_stack_trace,
43 | stack_trace_to_cloud=args.log_to_cloud,
44 | stack_trace_interval_seconds=1,
45 | ),
46 | )
47 | diagnostic_config = diagnostic_configuration.DiagnosticConfig(
48 | debug_config=debug_config
49 | )
50 | with diagnostic.diagnose(diagnostic_config):
51 | if args.signal == 'SIGSEGV':
52 | signal.raise_signal(signal.SIGSEGV)
53 |
54 | if args.signal == 'SIGABRT':
55 | signal.raise_signal(signal.SIGABRT)
56 |
57 | if args.signal == 'SIGFPE':
58 | signal.raise_signal(signal.SIGFPE)
59 |
60 | if args.signal == 'SIGILL':
61 | signal.raise_signal(signal.SIGILL)
62 |
63 | if args.signal == 'SIGBUS':
64 | signal.raise_signal(signal.SIGBUS)
65 |
66 | if args.signal == 'SIGUSR1':
67 | signal.raise_signal(signal.SIGUSR1)
68 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/tests/debug_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import signal
16 | import threading
17 | from unittest import mock
18 | from absl.testing import absltest
19 | from cloud_tpu_diagnostics.src.config import debug_configuration
20 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
21 | from cloud_tpu_diagnostics.src.debug import send_user_signal
22 | from cloud_tpu_diagnostics.src.debug import start_debugging
23 | from cloud_tpu_diagnostics.src.debug import stop_debugging
24 |
25 |
26 | class DebugTest(absltest.TestCase):
27 |
28 | def testDaemonThreadRunningWhenCollectStackTraceTrue(self):
29 | debug_config = debug_configuration.DebugConfig(
30 | stack_trace_config=stack_trace_configuration.StackTraceConfig(
31 | collect_stack_trace=True,
32 | stack_trace_to_cloud=True,
33 | stack_trace_interval_seconds=1,
34 | ),
35 | )
36 | start_debugging(debug_config)
37 | self.assertEqual(threading.active_count(), 2)
38 | daemon_thread_list = list(
39 | filter(lambda thread: thread.daemon is True, threading.enumerate())
40 | )
41 | self.assertLen(daemon_thread_list, 1)
42 | stop_debugging(debug_config)
43 | self.assertEqual(threading.active_count(), 1)
44 | daemon_thread_list = list(
45 | filter(lambda thread: thread.daemon is True, threading.enumerate())
46 | )
47 | self.assertLen(daemon_thread_list, 0)
48 |
49 | def testDaemonThreadNotRunningWhenCollectStackTraceFalse(self):
50 | debug_config = debug_configuration.DebugConfig(
51 | stack_trace_config=stack_trace_configuration.StackTraceConfig(
52 | collect_stack_trace=False,
53 | stack_trace_to_cloud=True,
54 | stack_trace_interval_seconds=1,
55 | ),
56 | )
57 | start_debugging(debug_config)
58 | self.assertEqual(threading.active_count(), 1)
59 | daemon_thread_list = list(
60 | filter(lambda thread: thread.daemon is True, threading.enumerate())
61 | )
62 | self.assertLen(daemon_thread_list, 0)
63 | stop_debugging(debug_config)
64 | self.assertEqual(threading.active_count(), 1)
65 | daemon_thread_list = list(
66 | filter(lambda thread: thread.daemon is True, threading.enumerate())
67 | )
68 | self.assertLen(daemon_thread_list, 0)
69 |
70 | @mock.patch(
71 | 'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.debug.disable_stack_trace_dumping'
72 | )
73 | def testStopDebuggingDisableStackTraceDumpingCalled(
74 | self, disable_stack_trace_dumping_mock
75 | ):
76 | debug_config = debug_configuration.DebugConfig(
77 | stack_trace_config=stack_trace_configuration.StackTraceConfig(
78 | collect_stack_trace=True,
79 | stack_trace_to_cloud=True,
80 | stack_trace_interval_seconds=1,
81 | ),
82 | )
83 | stop_debugging(debug_config)
84 | disable_stack_trace_dumping_mock.assert_called_once()
85 | self.assertEqual(threading.active_count(), 1)
86 | daemon_thread_list = list(
87 | filter(lambda thread: thread.daemon is True, threading.enumerate())
88 | )
89 | self.assertLen(daemon_thread_list, 0)
90 |
91 | def testSendUserSignalSIGUSR1SignalReceived(self):
92 | signal.signal(signal.SIGUSR1, user_signal_handler)
93 | stack_trace_interval_seconds = 1
94 | with self.assertRaises(Exception) as e:
95 | send_user_signal(stack_trace_interval_seconds)
96 |     self.assertEqual(str(e.exception), 'SIGUSR1 signal received.')
97 |
98 |
99 | def user_signal_handler(signum, _):
100 |   raise Exception('SIGUSR1 signal received.')  # pylint: disable=broad-exception-caught
101 |
102 |
103 | if __name__ == '__main__':
104 | absltest.main()
105 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/tests/diagnose_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from unittest import mock
16 | from absl.testing import absltest
17 | from cloud_tpu_diagnostics.src.config import debug_configuration
18 | from cloud_tpu_diagnostics.src.config import diagnostic_configuration
19 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
20 | from cloud_tpu_diagnostics.src.diagnose import diagnose
21 |
22 |
23 | class DiagnoseTest(absltest.TestCase):
24 |
25 | @mock.patch(
26 | 'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.diagnose.start_debugging'
27 | )
28 | @mock.patch(
29 | 'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.diagnose.stop_debugging'
30 | )
31 | def testDiagnoseContextManager(
32 | self, stop_debugging_mock, start_debugging_mock
33 | ):
34 | debug_config = debug_configuration.DebugConfig(
35 | stack_trace_config=stack_trace_configuration.StackTraceConfig(
36 | collect_stack_trace=True,
37 | stack_trace_to_cloud=True,
38 | ),
39 | )
40 | diagnostic_config = diagnostic_configuration.DiagnosticConfig(
41 | debug_config=debug_config,
42 | )
43 | with diagnose(diagnostic_config):
44 | pass
45 | start_debugging_mock.assert_called_once_with(debug_config)
46 | stop_debugging_mock.assert_called_once_with(debug_config)
47 |
48 |
49 | if __name__ == '__main__':
50 | absltest.main()
51 |
--------------------------------------------------------------------------------
/pip_package/cloud_tpu_diagnostics/tests/stack_trace_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import faulthandler
16 | import os
17 | import shutil
18 | import signal
19 | import subprocess
20 | import sys
21 | import tempfile
22 | import textwrap
23 | import unittest
24 | from absl.testing import absltest
25 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration
26 | from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping
27 | from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping
28 | from cloud_tpu_diagnostics.src.stack_trace import user_signal_handler_wrapper
29 | from cloud_tpu_diagnostics.src.util import default
30 |
31 | class StackTraceTest(absltest.TestCase):
32 |
33 | def setUp(self):
34 | super().setUp()
35 | package_dir = '/'.join(os.path.dirname(__file__).split('/')[:-1])
36 | # Used to run test with blaze/bazel
37 | self.test_binary = os.path.join(package_dir, 'stack_trace_test_util')
38 | # Used to run test with unittest `python3 -m unittest stack_trace_test.py`
39 | self.test_file = os.path.join(
40 | package_dir, 'src/util/stack_trace_test_util.py'
41 | )
42 | self.stack_trace_module = os.path.join(package_dir, 'src/stack_trace.py')
43 |
44 | def tearDown(self):
45 | super().tearDown()
46 | if os.path.exists(default.STACK_TRACE_DIR_DEFAULT):
47 | shutil.rmtree(default.STACK_TRACE_DIR_DEFAULT)
48 |
49 | @unittest.skipIf(not hasattr(signal, 'SIGSEGV'), 'Missing signal.SIGSEGV')
50 | def testSigsegvCollectStackTraceTrueTraceCollectedOnCloud(self):
51 | error = 'Fatal Python error: Segmentation fault'
52 | self.check_fatal_error(52, error, 'SIGSEGV', True)
53 |
54 | @unittest.skipIf(not hasattr(signal, 'SIGABRT'), 'Missing signal.SIGABRT')
55 | def testSigabrtCollectStackTraceTrueTraceCollectedOnCloud(self):
56 | error = 'Fatal Python error: Aborted'
57 | self.check_fatal_error(55, error, 'SIGABRT', True)
58 |
59 | @unittest.skipIf(not hasattr(signal, 'SIGFPE'), 'Missing signal.SIGFPE')
60 | def testSigfpeCollectStackTraceTrueTraceCollectedOnCloud(self):
61 | error = 'Fatal Python error: Floating point exception'
62 | try:
63 | self.check_fatal_error(58, error, 'SIGFPE', True)
64 | except AssertionError:
65 | # error message is different for Python 3.12
66 | error = 'Fatal Python error: Floating-point exception'
67 | self.check_fatal_error(58, error, 'SIGFPE', True)
68 |
69 | @unittest.skipIf(not hasattr(signal, 'SIGILL'), 'Missing signal.SIGILL')
70 | def testSigillCollectStackTraceTrueTraceCollectedOnCloud(self):
71 | error = 'Fatal Python error: Illegal instruction'
72 | self.check_fatal_error(61, error, 'SIGILL', True)
73 |
74 | @unittest.skipIf(not hasattr(signal, 'SIGBUS'), 'Missing signal.SIGBUS')
75 | def testSigbusCollectStackTraceTrueTraceCollectedOnCloud(self):
76 | error = 'Fatal Python error: Bus error'
77 | self.check_fatal_error(64, error, 'SIGBUS', True)
78 |
79 | @unittest.skipIf(not hasattr(signal, 'SIGUSR1'), 'Missing signal.SIGUSR1')
80 | def testSigusrCollectStackTraceTrueTraceCollectedOnCloud(self):
81 | self.check_fatal_error(67, '', 'SIGUSR1', True)
82 |
83 | def testCollectStackTraceFalseNoTraceDirCreated(self):
84 | process = self.run_python_code('', False, True)
85 | _, stderr = process.communicate()
86 | self.assertFalse(os.path.exists(default.STACK_TRACE_DIR_DEFAULT))
87 | self.assertEmpty(stderr)
88 |
89 | @unittest.skipIf(not hasattr(signal, 'SIGUSR1'), 'Missing signal.SIGUSR1')
90 | def testCollectStackTraceToConsole(self):
91 | self.check_fatal_error(67, '', 'SIGUSR1', False)
92 |
93 | def testCollectStackTraceFalseNoTraceCollectedOnConsole(self):
94 | process = self.run_python_code('', False, False)
95 | _, stderr = process.communicate()
96 | self.assertEmpty(stderr)
97 |
98 | def testEnableStackTraceDumpingFaulthandlerEnabled(self):
99 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
100 | collect_stack_trace=True, stack_trace_to_cloud=True
101 | )
102 | with self.assertLogs(level='INFO') as log:
103 | enable_stack_trace_dumping(stack_trace_config)
104 | self.assertEqual(faulthandler.is_enabled(), True)
105 | self.assertRegex(
106 | log.output[0], 'Stack trace will be written in: /tmp/debugging/'
107 | )
108 |
109 | def testDisableStackTraceDumpingFaulthandlerDisabled(self):
110 | stack_trace_config = stack_trace_configuration.StackTraceConfig(
111 | collect_stack_trace=True, stack_trace_to_cloud=True
112 | )
113 | enable_stack_trace_dumping(stack_trace_config)
114 | disable_stack_trace_dumping(stack_trace_config)
115 | self.assertEqual(faulthandler.is_enabled(), False)
116 |
117 | def testUserSignalHandlerForStderr(self):
118 | file_obj = tempfile.NamedTemporaryFile('r+')
119 | sys.stderr = file_obj
120 | user_signal_handler = user_signal_handler_wrapper(sys.stderr, 30)
121 | user_signal_handler(signal.SIGUSR1, None)
122 | with open(file_obj.name, 'rb') as f:
123 | data = f.readlines()
124 | self.assertEqual(
125 | data[0],
126 | b'INFO: Not a crash. cloud-tpu-diagnostics emits a stack trace'
127 | b' snapshot every 30 seconds.\n',
128 | )
129 |
130 | def testUserSignalHandlerForFile(self):
131 | file_obj = tempfile.NamedTemporaryFile('rb+')
132 | user_signal_handler = user_signal_handler_wrapper(file_obj, 30)
133 | user_signal_handler(signal.SIGUSR1, None)
134 | with open(file_obj.name, 'rb') as f:
135 | data = f.readlines()
136 | self.assertEqual(
137 | data[0],
138 | b'INFO: Not a crash. cloud-tpu-diagnostics emits a stack trace'
139 | b' snapshot every 30 seconds.\n',
140 | )
141 |
142 | def check_fatal_error(self, line_number, error, signal_name, log_to_cloud):
143 | if error:
144 | header = r'Stack \(most recent call first\)'
145 | regex = """
146 | {error}
147 |
148 | {header}:
149 |             File "{filename}", line {line_number} in <module>
150 | """
151 | else:
152 | header = (
153 | r'INFO: Not a crash. cloud\-tpu\-diagnostics emits a stack trace'
154 | r' snapshot every 1 seconds.\n'
155 | r'Stack \(most recent call first\)'
156 | )
157 | regex = """
158 | {header}:
159 | File "{stack_trace_module}", line 23 in user_signal_handler
160 |             File "{filename}", line {line_number} in <module>
161 | """
162 | regex = (
163 | textwrap.dedent(regex)
164 | .format(
165 | error=error,
166 | header=header,
167 | filename=self.test_file,
168 | stack_trace_module=self.stack_trace_module,
169 | line_number=line_number,
170 | )
171 | .strip()
172 | )
173 |
174 | output, stderr = self.get_output(signal_name, True, log_to_cloud)
175 | if log_to_cloud:
176 | self.assertRegex(output, regex)
177 | self.assertEmpty(stderr)
178 | else:
179 | self.assertRegex(stderr, regex)
180 | self.assertEmpty(output)
181 |
182 | def get_output(self, signal_name, collect_stack_trace, log_to_cloud):
183 | process = self.run_python_code(
184 | signal_name, collect_stack_trace, log_to_cloud
185 | )
186 | _, stderr = process.communicate()
187 | stderr = stderr.decode('ascii', 'backslashreplace')
188 | output = ''
189 | if log_to_cloud:
190 | trace_file = os.listdir(default.STACK_TRACE_DIR_DEFAULT)
191 | if trace_file:
192 | stack_trace_file = default.STACK_TRACE_DIR_DEFAULT + trace_file[0]
193 | with open(stack_trace_file, 'rb') as fp:
194 | output = fp.read().decode('ascii', 'backslashreplace')
195 | return output, stderr
196 |
197 | def run_python_code(self, signal_name, collect_stack_trace, log_to_cloud):
198 | args = [
199 | '--signal=' + signal_name,
200 | '--collect_stack_trace=' + str(collect_stack_trace),
201 | '--log_to_cloud=' + str(log_to_cloud),
202 | ]
203 | if sys.executable is not None:
204 | code = [sys.executable, self.test_file]
205 | else:
206 | code = [self.test_binary]
207 | return subprocess.Popen(
208 | code + args,
209 | stdout=subprocess.PIPE,
210 | stderr=subprocess.PIPE,
211 | env=os.environ.copy(),
212 | )
213 |
214 |
215 | if __name__ == '__main__':
216 | absltest.main()
217 |
--------------------------------------------------------------------------------
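
The tests above exercise the public surface of the stack trace module end to end. As a quick orientation, the following is a minimal usage sketch assembled only from the names imported and asserted in stack_trace_test.py; the 30-second interval passed to user_signal_handler_wrapper and the direct handler invocation are illustrative, not prescriptive.

    # Minimal sketch based on the imports and assertions in stack_trace_test.py.
    import faulthandler
    import signal
    import sys

    from cloud_tpu_diagnostics.src.config import stack_trace_configuration
    from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping
    from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping
    from cloud_tpu_diagnostics.src.stack_trace import user_signal_handler_wrapper

    # With stack_trace_to_cloud=True, traces land under /tmp/debugging/
    # (the directory asserted in testEnableStackTraceDumpingFaulthandlerEnabled).
    config = stack_trace_configuration.StackTraceConfig(
        collect_stack_trace=True, stack_trace_to_cloud=True
    )
    enable_stack_trace_dumping(config)
    assert faulthandler.is_enabled()

    # Build and invoke the SIGUSR1 handler directly, as the handler tests do;
    # it writes the "Not a crash" snapshot header plus the current stack.
    handler = user_signal_handler_wrapper(sys.stderr, 30)
    handler(signal.SIGUSR1, None)

    disable_stack_trace_dumping(config)
    assert not faulthandler.is_enabled()
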
/pip_package/pyproject.toml:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | [project]
16 | name = "cloud-tpu-diagnostics"
17 | version = "0.1.5"
18 | authors = [
19 | { name="Cloud TPU Team", email="cloud-tpu-eng@google.com" },
20 | ]
21 | description = "Monitor, debug and profile the jobs running on Cloud TPU."
22 | readme = "README.md"
23 | requires-python = ">=3.8"
24 | license = {text = "Apache-2.0"}
25 | classifiers = [
26 | "Programming Language :: Python :: 3.8",
27 | "Programming Language :: Python :: 3.9",
28 | "Programming Language :: Python :: 3.10",
29 | "Programming Language :: Python :: 3.11",
30 | ]
31 | keywords = []
32 |
33 | # pip dependencies installed with `pip install -e .`
34 | dependencies = []
35 |
36 | [project.urls]
37 | "Homepage" = "https://github.com/google/cloud-tpu-monitoring-debugging"
38 | "Bug Tracker" = "https://github.com/google/cloud-tpu-monitoring-debugging/issues"
39 |
40 | [build-system]
41 | # The build system specifies which backend is used to build/install the project
42 | requires = ["flit_core >=3.8,<4"]
43 | build-backend = "flit_core.buildapi"
44 |
45 | [tool.flit.sdist]
46 | # Flit-specific options (files to exclude from the PyPI package)
47 | exclude = [
48 | # Do not release test files on PyPI
49 | "tests/*_test.py",
50 | ]
51 |
--------------------------------------------------------------------------------
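
For completeness, the [project] metadata above is what an installed environment reports back. A minimal sketch, assuming the distribution has been installed (for example with the editable install mentioned in the dependency comment):

    # Minimal sketch: read back the metadata declared in pyproject.toml.
    # Assumes the distribution is installed in the active environment.
    from importlib import metadata

    dist = metadata.distribution("cloud-tpu-diagnostics")
    print(dist.version)                      # "0.1.5" for the file above
    print(dist.metadata["Requires-Python"])  # ">=3.8"
    print(dist.metadata["Summary"])          # the description string above
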