├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── gcp_resources ├── gce │ ├── input.tf │ ├── main.tf │ └── resources │ │ ├── dashboard │ │ ├── logging_dashboard │ │ │ ├── dashboard.tf │ │ │ ├── dashboard_json │ │ │ │ ├── main.json │ │ │ │ ├── stack-trace-counter-metric.json │ │ │ │ └── stack-trace-log-panel.json │ │ │ ├── input.tf │ │ │ ├── log_metrics │ │ │ │ ├── input.tf │ │ │ │ └── stack_trace_counter.tf │ │ │ └── main.tf │ │ └── monitoring_dashboard │ │ │ ├── dashboard.tf │ │ │ ├── dashboard_json │ │ │ ├── cpu-utilization.json │ │ │ ├── dcn-transfer-latency.json │ │ │ ├── device-to-host-transfer-latency.json │ │ │ ├── host-to-device-transfer-latency.json │ │ │ ├── main.json │ │ │ ├── memory-usage.json │ │ │ ├── network-bytes.json │ │ │ └── tensorcore-idle-duration.json │ │ │ ├── input.tf │ │ │ └── main.tf │ │ └── log_storage │ │ ├── input.tf │ │ ├── main.tf │ │ └── stack-trace-bucket.tf └── gke │ ├── input.tf │ ├── main.tf │ └── resources │ ├── dashboard │ ├── logging_dashboard │ │ ├── dashboard.tf │ │ ├── dashboard_json │ │ │ ├── main.json │ │ │ ├── stack-trace-counter-metric.json │ │ │ └── stack-trace-log-panel.json │ │ ├── input.tf │ │ ├── log_metrics │ │ │ ├── input.tf │ │ │ └── stack_trace_counter.tf │ │ └── main.tf │ └── monitoring_dashboard │ │ ├── dashboard.tf │ │ ├── dashboard_json │ │ ├── accelerator-memory-used.json │ │ ├── collectives-latency.json │ │ ├── cpu-utilization.json │ │ ├── dcn-transfer-latency.json │ │ ├── device-to-host-transfer-latency.json │ │ ├── duty-cycle.json │ │ ├── host-to-device-transfer-latency.json │ │ ├── main.json │ │ ├── memory-usage.json │ │ └── network-bytes.json │ │ ├── input.tf │ │ └── main.tf │ └── log_storage │ ├── input.tf │ ├── main.tf │ └── stack-trace-bucket.tf └── pip_package ├── CHANGELOG.md ├── README.md ├── cloud_tpu_diagnostics ├── __init__.py ├── configuration.py ├── diagnostic.py ├── src │ ├── config │ │ ├── debug_configuration.py │ │ ├── diagnostic_configuration.py │ │ └── stack_trace_configuration.py │ ├── debug.py │ ├── diagnose.py │ ├── stack_trace.py │ └── util │ │ ├── default.py │ │ └── stack_trace_test_util.py └── tests │ ├── debug_test.py │ ├── diagnose_test.py │ └── stack_trace_test.py └── pyproject.toml /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 16 | # How to contribute 17 | 18 | We'd love to accept your patches and contributions to this project. 19 | 20 | ## Before you begin 21 | 22 | ### Sign our Contributor License Agreement 23 | 24 | Contributions to this project must be accompanied by a 25 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 26 | You (or your employer) retain the copyright to your contribution; this simply 27 | gives us permission to use and redistribute your contributions as part of the 28 | project. 29 | 30 | If you or your current employer have already signed the Google CLA (even if it 31 | was for a different project), you probably don't need to do it again. 32 | 33 | Visit to see your current agreements or to 34 | sign a new one. 35 | 36 | ### Review our community guidelines 37 | 38 | This project follows 39 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 40 | 41 | ## Contribution process 42 | 43 | ### Code reviews 44 | 45 | All submissions, including submissions by project members, require review. We 46 | use GitHub pull requests for this purpose. 
Consult 47 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 48 | information on using pull requests. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 16 | # Cloud TPU Monitoring Debugging 17 | 18 | ## Overview 19 | 20 | The Cloud TPU Monitoring Debugging repository contains all the infrastructure and logic required to monitor and debug jobs running on Cloud TPU. 21 | 22 | Terraform is used to deploy resources in a Google Cloud project. 23 | Terraform is an open-source tool to set up and manage Google Cloud 24 | infrastructure based on configuration files. This repository helps 25 | customers deploy various Google Cloud resources via scripts, without any 26 | manual effort. 27 | 28 | The [cloud-tpu-diagnostics PyPI package](https://pypi.org/project/cloud-tpu-diagnostics) contains all the logic to monitor, debug and profile the jobs running on Cloud TPU. 29 | 30 | ## Getting Started with Terraform 31 | 32 | - Follow [this link](https://developer.hashicorp.com/terraform/tutorials/gcp-get-started/install-cli) to install Terraform on your desktop. 33 | - Run `terraform init` to 34 | initialize the Google Cloud Terraform provider. This command adds 35 | the necessary plugins and builds the `.terraform` directory. 36 | - If the Terraform Google Cloud provider version has been updated, run 37 | `terraform init --upgrade` for the update to take effect. 38 | - You can also run `terraform plan` to validate resource declarations and 39 | identify any syntax errors or version mismatches before deploying the resources. 40 | 41 | ### Configure Terraform to store state in Cloud Storage 42 | 43 | By default, Terraform stores [state](https://www.terraform.io/docs/state/) locally in a file named `terraform.tfstate`. This default configuration can make Terraform usage difficult for teams, especially when many users run Terraform at the same time and each machine has its own understanding of the current infrastructure. To help avoid such issues, this section configures a remote state that points to a Google Cloud Storage (GCS) bucket. 44 | 45 | 1. In Cloud Shell, create the GCS bucket: 46 | 47 | gsutil mb gs://${GCS_BUCKET_NAME} 48 | 49 | 2. Enable [Object Versioning](https://cloud.google.com/storage/docs/object-versioning) to keep the history of your deployments.
Enabling Object Versioning increases [storage costs](https://cloud.google.com/storage/pricing), which you can mitigate by configuring 50 | [Object Lifecycle Management](https://cloud.google.com/storage/docs/lifecycle) to delete old state versions. 51 | 52 | gsutil versioning set on gs://${GCS_BUCKET_NAME} 53 | 54 | 3. Enter the name of the GCS bucket created above when you run `terraform init` to initialize Terraform. 55 | 56 | Initializing the backend... 57 | bucket 58 | The name of the Google Cloud Storage bucket 59 | 60 | Enter a value: 61 | 62 | ## Deploy GCP Resources 63 | The following resources are managed in this directory: 64 | 65 | 1. **Monitoring Dashboard**: This is an outlier dashboard that displays statistics and outlier mode for TPU metrics. 66 | 2. **Debugging Dashboard**: This dashboard displays the stack traces collected in Cloud Logging for the processes running on TPU VMs. 67 | 3. **Logging Storage**: This is a user-defined log bucket to store stack traces. Creating new log storage is completely optional. If you choose not to create a separate log bucket, the stack traces will be collected in the [_Default log bucket](https://cloud.google.com/logging/docs/routing/overview#default-bucket). 68 | 69 | ### Deploy Resources for Workloads on GCE 70 | 71 | Run `terraform init && terraform apply` inside the `gcp_resources/gce` directory to deploy all the resources mentioned above for TPU workloads running on GCE. You will be prompted to provide values for some input variables (a sample `terraform.tfvars` covering these variables is sketched further below). After confirming the action, all the resources will be deployed automatically in your GCP project. 72 | 73 | ### Deploy Resources for Workloads on GKE 74 | 75 | Run `terraform init && terraform apply` inside the `gcp_resources/gke` directory to deploy all the resources mentioned above for TPU workloads running on GKE. You will be prompted to provide values for some input variables. After confirming the action, all the resources will be deployed automatically in your GCP project. 76 | 77 | > **_NOTE:_** Please check the guide below for more details about GCE/GKE-specific resources and prerequisites. 78 | 79 | Follow the guide below to deploy the resources individually: 80 | ### Monitoring Dashboard 81 | #### GCE 82 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/dashboard/monitoring_dashboard/` to deploy only the monitoring dashboard for GCE in your GCP project. 83 | 84 | If the `node_prefix` parameter is not specified in the input variable `var.monitoring_dashboard_config` or is set to an empty string, the metrics on the dashboard will plot the data points for all TPU VMs in your GCP project. 85 | 86 | For instance, if you provide `{"node_prefix": "test"}` as the input value for the input variable `var.monitoring_dashboard_config`, then the metrics on the monitoring dashboard will only show the data points for the TPU VMs with node names that start with `test`. Refer to this [doc](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/tpus/queued-resources/create#--node-prefix) for more information on node prefix for TPUs in multislice. 87 | 88 | #### GKE 89 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/dashboard/monitoring_dashboard/` to deploy only the monitoring dashboard for GKE in your GCP project. 90 | 91 | ### Debugging Dashboard 92 | #### GCE 93 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/dashboard/logging_dashboard/` to deploy only the debugging dashboard for GCE in your GCP project.
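Instead of answering the prompts interactively, the same inputs can be supplied in a `terraform.tfvars` file placed in the module directory. The sketch below targets the GCE root module (`gcp_resources/gce/`) and uses placeholder values; the variable names come from that module's `input.tf`, and the individual dashboard and log-storage modules accept the relevant subset of them.

```hcl
# Sample terraform.tfvars for gcp_resources/gce/ -- all values are placeholders.
project_name = "my-gcp-project"   # your GCP project ID

# Plot only TPU VMs whose node names start with "test" and show the top 10
# outliers; omit node_prefix to include all TPU VMs in the project.
monitoring_dashboard_config = {
  node_prefix   = "test"
  outlier_count = 10
}

# Optional user-defined log bucket for stack traces; drop retention_days for
# infinite retention, or pass {} to skip creating a separate bucket.
stack_trace_bucket_config = {
  bucket_name    = "my-stack-trace-bucket"
  retention_days = 30
}
```

At the interactive prompt, the equivalent object values can be entered in the `{"node_prefix": "test"}` form shown earlier.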
94 | 95 | #### GKE 96 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/dashboard/logging_dashboard/` to deploy only debugging dashboard for GKE in your gcp project. 97 | 98 | Users need to add a sidecar container to their TPU workload running on GKE to view traces in the debugging dashboard. The sidecar container must be named in a specific way, matching the regex `[a-z-0-9]*stacktrace[a-z-0-9]*`. Here is an example of the sidecar container that should be added: 99 | 100 | ``` 101 | containers: 102 | - name: stacktrace-log-collector 103 | image: busybox:1.28 104 | resources: 105 | limits: 106 | cpu: 100m 107 | memory: 200Mi 108 | args: [/bin/sh, -c, "while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*"] 109 | volumeMounts: 110 | - name: tpu-debug-logs 111 | readOnly: true 112 | mountPath: /tmp/debugging 113 | - name: 114 | ..... 115 | ..... 116 | volumes: 117 | - name: tpu-debug-logs 118 | ``` 119 | 120 | ### Log Storage 121 | #### GCE 122 | Run `terraform init && terraform apply` inside `gcp_resources/gce/resources/log_storage/` to deploy a separate log bucket to store stack traces for GCE. You will be prompted to provide name of your gcp project and also the bucket configuration. You can also set the retention period for the bucket. 123 | 124 | #### GKE 125 | Run `terraform init && terraform apply` inside `gcp_resources/gke/resources/log_storage/` to deploy a separate log bucket to store stack traces for GKE. You will be prompted to provide name of your gcp project and also the bucket configuration. You can also set the retention period for the bucket. Make sure that you have the sidecar container running in your GKE cluster as mentioned in [Debugging Dashboard section for GKE](#debugging-dashboard). -------------------------------------------------------------------------------- /gcp_resources/gce/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | 20 | variable "monitoring_dashboard_config" { 21 | type = object({ 22 | node_prefix : optional(string), 23 | outlier_count : optional(number) 24 | }) 25 | description = <"} 37 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"", "retention_days":x} 38 | // 3. To not create stack trace bucket: {} 39 | variable "stack_trace_bucket_config" { 40 | type = object({ 41 | bucket_name : optional(string) 42 | retention_days : optional(number) 43 | }) 44 | validation { 45 | condition = ( 46 | (var.stack_trace_bucket_config.bucket_name == null && 47 | var.stack_trace_bucket_config.retention_days == null) || 48 | (var.stack_trace_bucket_config.bucket_name != null) 49 | ) 50 | error_message = "bucket_name is not defined for stack_trace_bucket_config." 
51 | } 52 | description = </default.tfstate 29 | prefix = "gce" 30 | } 31 | } 32 | 33 | module "monitoring_dashboard" { 34 | source = "./resources/dashboard/monitoring_dashboard" 35 | project_name = var.project_name 36 | monitoring_dashboard_config = var.monitoring_dashboard_config 37 | } 38 | 39 | module "logging_dashboard" { 40 | source = "./resources/dashboard/logging_dashboard" 41 | project_name = var.project_name 42 | } 43 | 44 | module "log_storage" { 45 | source = "./resources/log_storage" 46 | project_name = var.project_name 47 | stack_trace_bucket_config = var.stack_trace_bucket_config 48 | } 49 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | data "google_project" "project" { 16 | project_id = var.project_name 17 | } 18 | 19 | // Add a dependency on log_metrics module to deploy log-based metrics before deploying logging dashboard 20 | module "log_metrics" { 21 | source = "./log_metrics" 22 | project_name = var.project_name 23 | } 24 | 25 | locals { 26 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json", 27 | { 28 | TILE_1 = templatefile("${path.module}/dashboard_json/stack-trace-counter-metric.json", 29 | { 30 | METRIC_NAME = module.log_metrics.stack_trace_counter_metric_id 31 | }), 32 | TILE_2 = templatefile("${path.module}/dashboard_json/stack-trace-log-panel.json", 33 | { 34 | PROJECT_NUMBER = data.google_project.project.number 35 | }) 36 | }) 37 | } 38 | 39 | resource "google_monitoring_dashboard" "logging_dashboard" { 40 | project = var.project_name 41 | dashboard_json = local.dashboard_json 42 | depends_on = [module.log_metrics.stack_trace_counter_metric] 43 | } 44 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/main.json: -------------------------------------------------------------------------------- 1 | { 2 | "category": "CUSTOM", 3 | "displayName": "GCE - TPU Logging Dashboard", 4 | "dashboardFilters": [ 5 | { 6 | "filterType": "RESOURCE_LABEL", 7 | "labelKey": "node_id" 8 | }, 9 | { 10 | "filterType": "RESOURCE_LABEL", 11 | "labelKey": "worker_id" 12 | } 13 | ], 14 | "mosaicLayout": { 15 | "columns": 12, 16 | "tiles": [ 17 | ${TILE_1}, 18 | ${TILE_2}, 19 | { 20 | "height": 10, 21 | "widget": { 22 | "collapsibleGroup": { 23 | "collapsed": false 24 | }, 25 | "title": "TPU VM Process Debugging" 26 | }, 27 | "width": 12 28 | } 29 | ] 30 | } 31 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-counter-metric.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | 
"widget": { 4 | "timeSeriesTable": { 5 | "columnSettings": [ 6 | { 7 | "column": "node_id", 8 | "visible": true 9 | }, 10 | { 11 | "column": "worker_id", 12 | "visible": true 13 | }, 14 | { 15 | "column": "zone", 16 | "visible": true 17 | }, 18 | { 19 | "column": "value", 20 | "visible": true 21 | } 22 | ], 23 | "dataSets": [ 24 | { 25 | "minAlignmentPeriod": "600s", 26 | "timeSeriesQuery": { 27 | "outputFullDuration": true, 28 | "timeSeriesFilter": { 29 | "aggregation": { 30 | "alignmentPeriod": "600s", 31 | "perSeriesAligner": "ALIGN_RATE" 32 | }, 33 | "filter": "metric.type=\"logging.googleapis.com/user/${METRIC_NAME}\" resource.type=\"tpu_worker\"", 34 | "pickTimeSeriesFilter": { 35 | "direction": "TOP", 36 | "numTimeSeries": 300, 37 | "rankingMethod": "METHOD_MEAN" 38 | }, 39 | "secondaryAggregation": { 40 | "alignmentPeriod": "600s", 41 | "crossSeriesReducer": "REDUCE_MEAN", 42 | "groupByFields": [ 43 | "metric.label.\"node_id\"", 44 | "metric.label.\"worker_id\"", 45 | "metric.label.\"zone\"" 46 | ], 47 | "perSeriesAligner": "ALIGN_MEAN" 48 | } 49 | } 50 | } 51 | } 52 | ], 53 | "metricVisualization": "BAR" 54 | }, 55 | "title": "Stack Trace Log Entry Count per Period [Sorted by MEAN]" 56 | }, 57 | "width": 12 58 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-log-panel.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 6, 3 | "widget": { 4 | "logsPanel": { 5 | "filter": "resource.type=\"tpu_worker\" log_id(\"tpu.googleapis.com/runtime_monitor\") jsonPayload.verb=\"stacktraceanalyzer\"", 6 | "resourceNames": [ 7 | "projects/${PROJECT_NUMBER}" 8 | ] 9 | }, 10 | "title": "Stack Trace Logs" 11 | }, 12 | "width": 12, 13 | "yPos": 4 14 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/log_metrics/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/log_metrics/stack_trace_counter.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | // Metric that counts the number of stack trace entries that match a specified filter within a specific period 16 | resource "google_logging_metric" "stack_trace_counter_metric" { 17 | name = "stack_trace_counter_gce" 18 | project = var.project_name 19 | description = "Counts the number of stack trace log entries within a specific period." 20 | filter = "resource.type=\"tpu_worker\" AND log_id(\"tpu.googleapis.com/runtime_monitor\") AND jsonPayload.verb=\"stacktraceanalyzer\"" 21 | metric_descriptor { 22 | metric_kind = "DELTA" 23 | value_type = "INT64" 24 | labels { 25 | key = "zone" 26 | value_type = "STRING" 27 | } 28 | labels { 29 | key = "node_id" 30 | value_type = "STRING" 31 | } 32 | labels { 33 | key = "worker_id" 34 | value_type = "STRING" 35 | } 36 | } 37 | label_extractors = { 38 | "zone" = "EXTRACT(resource.labels.zone)", 39 | "node_id" = "EXTRACT(resource.labels.node_id)", 40 | "worker_id" = "EXTRACT(resource.labels.worker_id)", 41 | } 42 | } 43 | 44 | output "stack_trace_counter_metric_id" { 45 | value = google_logging_metric.stack_trace_counter_metric.id 46 | } 47 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/logging_dashboard/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | version = ">= 4.57.0" 20 | } 21 | } 22 | /* 23 | Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS). 24 | The bucket must exist prior to configuring the backend. 25 | For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs 26 | */ 27 | backend "gcs" { 28 | # GCS prefix inside the bucket. terraform states are stored in an object called /default.tfstate 29 | prefix = "gce/dashboard/logging_dashboard" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | outlier_count = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count 17 | node_prefix_regex = var.monitoring_dashboard_config.node_prefix == null ? "[a-z0-9-_]*" : "${var.monitoring_dashboard_config.node_prefix}[a-z0-9-_]*" 18 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json", 19 | { 20 | TILE_1 = templatefile("${path.module}/dashboard_json/cpu-utilization.json", 21 | { 22 | OUTLIER_COUNT = local.outlier_count, 23 | NODE_PREFIX_REGEX = local.node_prefix_regex 24 | }), 25 | TILE_2 = templatefile("${path.module}/dashboard_json/tensorcore-idle-duration.json", 26 | { 27 | OUTLIER_COUNT = local.outlier_count, 28 | NODE_PREFIX_REGEX = local.node_prefix_regex 29 | }), 30 | TILE_3 = templatefile("${path.module}/dashboard_json/memory-usage.json", 31 | { 32 | OUTLIER_COUNT = local.outlier_count, 33 | NODE_PREFIX_REGEX = local.node_prefix_regex 34 | }), 35 | TILE_4 = templatefile("${path.module}/dashboard_json/network-bytes.json", 36 | { 37 | OUTLIER_COUNT = local.outlier_count, 38 | NODE_PREFIX_REGEX = local.node_prefix_regex 39 | }), 40 | TILE_5 = templatefile("${path.module}/dashboard_json/dcn-transfer-latency.json", 41 | { 42 | OUTLIER_COUNT = local.outlier_count, 43 | NODE_PREFIX_REGEX = local.node_prefix_regex 44 | }), 45 | TILE_6 = templatefile("${path.module}/dashboard_json/host-to-device-transfer-latency.json", 46 | { 47 | OUTLIER_COUNT = local.outlier_count, 48 | NODE_PREFIX_REGEX = local.node_prefix_regex 49 | }), 50 | TILE_7 = templatefile("${path.module}/dashboard_json/device-to-host-transfer-latency.json", 51 | { 52 | OUTLIER_COUNT = local.outlier_count, 53 | NODE_PREFIX_REGEX = local.node_prefix_regex 54 | }) 55 | }) 56 | } 57 | 58 | resource "google_monitoring_dashboard" "monitoring_dashboard" { 59 | project = var.project_name 60 | dashboard_json = local.dashboard_json 61 | } 62 | -------------------------------------------------------------------------------- 
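For reference, with the README's example input of `{"node_prefix": "test"}` and `outlier_count` left unset, the locals in `dashboard.tf` above resolve roughly as sketched below; the resulting regex is what each tile's `monitoring.regex.full_match()` filter receives.

```hcl
# Illustrative sketch only: what dashboard.tf's locals evaluate to when
# monitoring_dashboard_config = { node_prefix = "test" } is supplied.
locals {
  outlier_count     = 10                 # falls back to the default of 10
  node_prefix_regex = "test[a-z0-9-_]*"  # interpolated into each tile's
                                         # monitoring.regex.full_match("...") filter
}
```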
/gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/cpu-utilization.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "TPU Worker - CPU Utilization Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_NONE" 19 | }, 20 | "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 1 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "TPU Worker - CPU Utilization Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 47 | "minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "crossSeriesReducer": "REDUCE_MEAN", 55 | "groupByFields": [ 56 | "resource.label.\"node_id\"", 57 | "resource.label.\"worker_id\"" 58 | ], 59 | "perSeriesAligner": "ALIGN_MEAN" 60 | }, 61 | "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 62 | "pickTimeSeriesFilter": { 63 | "direction": "TOP", 64 | "numTimeSeries": ${OUTLIER_COUNT}, 65 | "rankingMethod": "METHOD_MEAN" 66 | } 67 | } 68 | } 69 | } 70 | ], 71 | "thresholds": [], 72 | "timeshiftDuration": "0s", 73 | "yAxis": { 74 | "label": "", 75 | "scale": "LINEAR" 76 | } 77 | } 78 | }, 79 | "width": 6, 80 | "yPos": 5 81 | }, 82 | { 83 | "height": 4, 84 | "widget": { 85 | "title": "TPU Worker - CPU Utilization Outliers [MAX]", 86 | "xyChart": { 87 | "chartOptions": { 88 | "mode": "COLOR" 89 | }, 90 | "dataSets": [ 91 | { 92 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 93 | "minAlignmentPeriod": "60s", 94 | "plotType": "LINE", 95 | "targetAxis": "Y1", 96 | "timeSeriesQuery": { 97 | "timeSeriesFilter": { 98 | "aggregation": { 99 | "alignmentPeriod": "60s", 100 | "crossSeriesReducer": "REDUCE_MAX", 101 | "groupByFields": [ 102 | "resource.label.\"node_id\"", 103 | "resource.label.\"worker_id\"" 104 | ], 105 | "perSeriesAligner": "ALIGN_MAX" 106 | }, 107 | "filter": "metric.type=\"tpu.googleapis.com/cpu/utilization\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 108 | "pickTimeSeriesFilter": { 109 | "direction": "TOP", 110 | "numTimeSeries": ${OUTLIER_COUNT}, 111 | "rankingMethod": "METHOD_MAX" 112 | } 113 | } 114 | } 115 | } 116 | ], 117 | "thresholds": [], 118 | "timeshiftDuration": "0s", 119 | "yAxis": { 120 | "label": "", 121 | "scale": "LINEAR" 122 | } 123 | } 124 | }, 125 | "width": 6, 126 | "xPos": 6, 127 | "yPos": 5 128 | }, 129 | { 130 | "height": 8, 131 | "widget": { 132 | "collapsibleGroup": { 133 | "collapsed": false 134 | }, 135 | "title": 
"CPU Utilization on TPU Worker" 136 | }, 137 | "width": 12, 138 | "yPos": 1 139 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/dcn-transfer-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "GCE Instance - DCN Transfer Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 19 | "perSeriesAligner": "ALIGN_SUM" 20 | }, 21 | "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 42 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "GCE Instance - DCN Transfer Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | "timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "metric.label.\"node_id\"", 58 | "metric.label.\"worker_id\"" 59 | ], 60 | "perSeriesAligner": "ALIGN_SUM" 61 | }, 62 | "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 63 | "pickTimeSeriesFilter": { 64 | "direction": "TOP", 65 | "numTimeSeries": ${OUTLIER_COUNT}, 66 | "rankingMethod": "METHOD_MAX" 67 | } 68 | } 69 | } 70 | } 71 | ], 72 | "thresholds": [], 73 | "timeshiftDuration": "0s", 74 | "yAxis": { 75 | "label": "", 76 | "scale": "LINEAR" 77 | } 78 | } 79 | }, 80 | "width": 6, 81 | "yPos": 46 82 | }, 83 | { 84 | "height": 4, 85 | "widget": { 86 | "title": "GCE Instance - DCN Transfer Latency Outliers [p99]", 87 | "xyChart": { 88 | "chartOptions": { 89 | "mode": "COLOR" 90 | }, 91 | "dataSets": [ 92 | { 93 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}", 94 | "minAlignmentPeriod": "60s", 95 | "plotType": "LINE", 96 | "targetAxis": "Y1", 97 | "timeSeriesQuery": { 98 | "timeSeriesFilter": { 99 | "aggregation": { 100 | "alignmentPeriod": "60s", 101 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 102 | "groupByFields": [ 103 | "metric.label.\"node_id\"", 104 | "metric.label.\"worker_id\"" 105 | ], 106 | "perSeriesAligner": "ALIGN_SUM" 107 | }, 108 | "filter": "metric.type=\"custom.googleapis.com/dcn_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 109 | "pickTimeSeriesFilter": { 110 | "direction": "TOP", 111 | "numTimeSeries": ${OUTLIER_COUNT}, 112 | "rankingMethod": "METHOD_MAX" 113 | } 114 | } 115 | } 116 | } 117 | ], 118 | "thresholds": [], 119 | "timeshiftDuration": "0s", 
120 | "yAxis": { 121 | "label": "", 122 | "scale": "LINEAR" 123 | } 124 | } 125 | }, 126 | "width": 6, 127 | "xPos": 6, 128 | "yPos": 46 129 | }, 130 | { 131 | "height": 8, 132 | "widget": { 133 | "collapsibleGroup": { 134 | "collapsed": false 135 | }, 136 | "title": "DCN Transfer Latency" 137 | }, 138 | "width": 12, 139 | "yPos": 42 140 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/device-to-host-transfer-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "GCE Instance - Device to Host Transfer Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 19 | "perSeriesAligner": "ALIGN_SUM" 20 | }, 21 | "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 58 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "GCE Instance - Device to Host Transfer Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | "timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "metric.label.\"node_id\"", 58 | "metric.label.\"worker_id\"" 59 | ], 60 | "perSeriesAligner": "ALIGN_SUM" 61 | }, 62 | "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 63 | "pickTimeSeriesFilter": { 64 | "direction": "TOP", 65 | "numTimeSeries": ${OUTLIER_COUNT}, 66 | "rankingMethod": "METHOD_MAX" 67 | } 68 | } 69 | } 70 | } 71 | ], 72 | "thresholds": [], 73 | "timeshiftDuration": "0s", 74 | "yAxis": { 75 | "label": "", 76 | "scale": "LINEAR" 77 | } 78 | } 79 | }, 80 | "width": 6, 81 | "yPos": 62 82 | }, 83 | { 84 | "height": 4, 85 | "widget": { 86 | "title": "GCE Instance - Device to Host Transfer Latency Outliers [p99]", 87 | "xyChart": { 88 | "chartOptions": { 89 | "mode": "COLOR" 90 | }, 91 | "dataSets": [ 92 | { 93 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}", 94 | "minAlignmentPeriod": "60s", 95 | "plotType": "LINE", 96 | "targetAxis": "Y1", 97 | "timeSeriesQuery": { 98 | "timeSeriesFilter": { 99 | "aggregation": { 100 | "alignmentPeriod": "60s", 101 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 102 | "groupByFields": [ 103 | "metric.label.\"node_id\"", 104 | "metric.label.\"worker_id\"" 105 | ], 106 | "perSeriesAligner": "ALIGN_SUM" 107 | }, 108 | "filter": "metric.type=\"custom.googleapis.com/device_to_host_transfer_latency\" 
resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 109 | "pickTimeSeriesFilter": { 110 | "direction": "TOP", 111 | "numTimeSeries": ${OUTLIER_COUNT}, 112 | "rankingMethod": "METHOD_MAX" 113 | } 114 | } 115 | } 116 | } 117 | ], 118 | "thresholds": [], 119 | "timeshiftDuration": "0s", 120 | "yAxis": { 121 | "label": "", 122 | "scale": "LINEAR" 123 | } 124 | } 125 | }, 126 | "width": 6, 127 | "xPos": 6, 128 | "yPos": 62 129 | }, 130 | { 131 | "height": 8, 132 | "widget": { 133 | "collapsibleGroup": { 134 | "collapsed": false 135 | }, 136 | "title": "Device to Host Transfer Latency" 137 | }, 138 | "width": 12, 139 | "yPos": 58 140 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/host-to-device-transfer-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "GCE Instance - Host to Device Transfer Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 19 | "perSeriesAligner": "ALIGN_SUM" 20 | }, 21 | "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 50 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "GCE Instance - Host to Device Transfer Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | "timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "metric.label.\"node_id\"", 58 | "metric.label.\"worker_id\"" 59 | ], 60 | "perSeriesAligner": "ALIGN_SUM" 61 | }, 62 | "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 63 | "pickTimeSeriesFilter": { 64 | "direction": "TOP", 65 | "numTimeSeries": ${OUTLIER_COUNT}, 66 | "rankingMethod": "METHOD_MAX" 67 | } 68 | } 69 | } 70 | } 71 | ], 72 | "thresholds": [], 73 | "timeshiftDuration": "0s", 74 | "yAxis": { 75 | "label": "", 76 | "scale": "LINEAR" 77 | } 78 | } 79 | }, 80 | "width": 6, 81 | "yPos": 54 82 | }, 83 | { 84 | "height": 4, 85 | "widget": { 86 | "title": "GCE Instance - Host to Device Transfer Latency Outliers [p99]", 87 | "xyChart": { 88 | "chartOptions": { 89 | "mode": "COLOR" 90 | }, 91 | "dataSets": [ 92 | { 93 | "legendTemplate": "TPU Node: $${metric.labels.node_id} Worker ID: $${metric.labels.worker_id}", 94 | "minAlignmentPeriod": "60s", 95 | "plotType": "LINE", 96 | "targetAxis": "Y1", 97 | "timeSeriesQuery": { 98 | 
"timeSeriesFilter": { 99 | "aggregation": { 100 | "alignmentPeriod": "60s", 101 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 102 | "groupByFields": [ 103 | "metric.label.\"node_id\"", 104 | "metric.label.\"worker_id\"" 105 | ], 106 | "perSeriesAligner": "ALIGN_SUM" 107 | }, 108 | "filter": "metric.type=\"custom.googleapis.com/host_to_device_transfer_latency\" resource.type=\"gce_instance\" metric.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 109 | "pickTimeSeriesFilter": { 110 | "direction": "TOP", 111 | "numTimeSeries": ${OUTLIER_COUNT}, 112 | "rankingMethod": "METHOD_MAX" 113 | } 114 | } 115 | } 116 | } 117 | ], 118 | "thresholds": [], 119 | "timeshiftDuration": "0s", 120 | "yAxis": { 121 | "label": "", 122 | "scale": "LINEAR" 123 | } 124 | } 125 | }, 126 | "width": 6, 127 | "xPos": 6, 128 | "yPos": 54 129 | }, 130 | { 131 | "height": 8, 132 | "widget": { 133 | "collapsibleGroup": { 134 | "collapsed": false 135 | }, 136 | "title": "Host to Device Transfer Latency" 137 | }, 138 | "width": 12, 139 | "yPos": 50 140 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/main.json: -------------------------------------------------------------------------------- 1 | { 2 | "category": "CUSTOM", 3 | "displayName": "GCE - TPU Monitoring Dashboard", 4 | "dashboardFilters": [ 5 | { 6 | "filterType": "RESOURCE_LABEL", 7 | "labelKey": "worker_id" 8 | } 9 | ], 10 | "mosaicLayout": { 11 | "columns": 12, 12 | "tiles": [ 13 | { 14 | "height": 1, 15 | "widget": { 16 | "title": "TPU Worker Metrics", 17 | "text": { 18 | "content": "" 19 | } 20 | }, 21 | "width": 12, 22 | "yPos": 0 23 | }, 24 | ${TILE_1}, 25 | ${TILE_2}, 26 | ${TILE_3}, 27 | ${TILE_4}, 28 | { 29 | "height": 1, 30 | "widget": { 31 | "title": "Megascale Metrics", 32 | "text": { 33 | "content": "" 34 | } 35 | }, 36 | "width": 12, 37 | "yPos": 41 38 | }, 39 | ${TILE_5}, 40 | ${TILE_6}, 41 | ${TILE_7} 42 | ] 43 | } 44 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/memory-usage.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "TPU VM - Memory Usage Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_NONE" 19 | }, 20 | "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 9 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "TPU VM - Memory Usage Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 47 | "minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | 
"alignmentPeriod": "60s", 54 | "crossSeriesReducer": "REDUCE_MEAN", 55 | "groupByFields": [ 56 | "resource.label.\"node_id\"", 57 | "resource.label.\"worker_id\"" 58 | ], 59 | "perSeriesAligner": "ALIGN_MEAN" 60 | }, 61 | "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 62 | "pickTimeSeriesFilter": { 63 | "direction": "TOP", 64 | "numTimeSeries": ${OUTLIER_COUNT}, 65 | "rankingMethod": "METHOD_MEAN" 66 | } 67 | } 68 | } 69 | } 70 | ], 71 | "thresholds": [], 72 | "timeshiftDuration": "0s", 73 | "yAxis": { 74 | "label": "", 75 | "scale": "LINEAR" 76 | } 77 | } 78 | }, 79 | "width": 6, 80 | "yPos": 13 81 | }, 82 | { 83 | "height": 4, 84 | "widget": { 85 | "title": "TPU VM - Memory Usage Outliers [MAX]", 86 | "xyChart": { 87 | "chartOptions": { 88 | "mode": "COLOR" 89 | }, 90 | "dataSets": [ 91 | { 92 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 93 | "minAlignmentPeriod": "60s", 94 | "plotType": "LINE", 95 | "targetAxis": "Y1", 96 | "timeSeriesQuery": { 97 | "timeSeriesFilter": { 98 | "aggregation": { 99 | "alignmentPeriod": "60s", 100 | "crossSeriesReducer": "REDUCE_MAX", 101 | "groupByFields": [ 102 | "resource.label.\"node_id\"", 103 | "resource.label.\"worker_id\"" 104 | ], 105 | "perSeriesAligner": "ALIGN_MAX" 106 | }, 107 | "filter": "metric.type=\"tpu.googleapis.com/memory/usage\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 108 | "pickTimeSeriesFilter": { 109 | "direction": "TOP", 110 | "numTimeSeries": ${OUTLIER_COUNT}, 111 | "rankingMethod": "METHOD_MAX" 112 | } 113 | } 114 | } 115 | } 116 | ], 117 | "thresholds": [], 118 | "timeshiftDuration": "0s", 119 | "yAxis": { 120 | "label": "", 121 | "scale": "LINEAR" 122 | } 123 | } 124 | }, 125 | "width": 6, 126 | "xPos": 6, 127 | "yPos": 13 128 | }, 129 | { 130 | "height": 8, 131 | "widget": { 132 | "collapsibleGroup": { 133 | "collapsed": false 134 | }, 135 | "title": "Memory Usage by TPU VM" 136 | }, 137 | "width": 12, 138 | "yPos": 9 139 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/network-bytes.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "TPU VM - Network Bytes Sent Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_NONE" 19 | }, 20 | "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 25 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "TPU VM - Network Bytes Sent Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 47 | 
"minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "perSeriesAligner": "ALIGN_RATE" 55 | }, 56 | "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 57 | "pickTimeSeriesFilter": { 58 | "direction": "TOP", 59 | "numTimeSeries": ${OUTLIER_COUNT}, 60 | "rankingMethod": "METHOD_MEAN" 61 | } 62 | } 63 | } 64 | } 65 | ], 66 | "thresholds": [], 67 | "timeshiftDuration": "0s", 68 | "yAxis": { 69 | "label": "", 70 | "scale": "LINEAR" 71 | } 72 | } 73 | }, 74 | "width": 6, 75 | "yPos": 29 76 | }, 77 | { 78 | "height": 4, 79 | "widget": { 80 | "title": "TPU VM - Network Bytes Sent Outliers [MAX]", 81 | "xyChart": { 82 | "chartOptions": { 83 | "mode": "COLOR" 84 | }, 85 | "dataSets": [ 86 | { 87 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 88 | "minAlignmentPeriod": "60s", 89 | "plotType": "LINE", 90 | "targetAxis": "Y1", 91 | "timeSeriesQuery": { 92 | "timeSeriesFilter": { 93 | "aggregation": { 94 | "alignmentPeriod": "60s", 95 | "perSeriesAligner": "ALIGN_RATE" 96 | }, 97 | "filter": "metric.type=\"tpu.googleapis.com/network/sent_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 98 | "pickTimeSeriesFilter": { 99 | "direction": "TOP", 100 | "numTimeSeries": ${OUTLIER_COUNT}, 101 | "rankingMethod": "METHOD_MAX" 102 | } 103 | } 104 | } 105 | } 106 | ], 107 | "thresholds": [], 108 | "timeshiftDuration": "0s", 109 | "yAxis": { 110 | "label": "", 111 | "scale": "LINEAR" 112 | } 113 | } 114 | }, 115 | "width": 6, 116 | "xPos": 6, 117 | "yPos": 29 118 | }, 119 | { 120 | "height": 4, 121 | "widget": { 122 | "title": "TPU VM - Network Bytes Received Stats", 123 | "xyChart": { 124 | "chartOptions": { 125 | "mode": "STATS" 126 | }, 127 | "dataSets": [ 128 | { 129 | "minAlignmentPeriod": "60s", 130 | "plotType": "LINE", 131 | "targetAxis": "Y1", 132 | "timeSeriesQuery": { 133 | "timeSeriesFilter": { 134 | "aggregation": { 135 | "alignmentPeriod": "60s", 136 | "perSeriesAligner": "ALIGN_NONE" 137 | }, 138 | "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 139 | } 140 | } 141 | } 142 | ], 143 | "thresholds": [], 144 | "timeshiftDuration": "0s", 145 | "yAxis": { 146 | "label": "", 147 | "scale": "LINEAR" 148 | } 149 | } 150 | }, 151 | "width": 12, 152 | "yPos": 33 153 | }, 154 | { 155 | "height": 4, 156 | "widget": { 157 | "title": "TPU VM - Network Bytes Received Outliers [MEAN]", 158 | "xyChart": { 159 | "chartOptions": { 160 | "mode": "COLOR" 161 | }, 162 | "dataSets": [ 163 | { 164 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 165 | "minAlignmentPeriod": "60s", 166 | "plotType": "LINE", 167 | "targetAxis": "Y1", 168 | "timeSeriesQuery": { 169 | "timeSeriesFilter": { 170 | "aggregation": { 171 | "alignmentPeriod": "60s", 172 | "perSeriesAligner": "ALIGN_RATE" 173 | }, 174 | "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 175 | "pickTimeSeriesFilter": { 176 | "direction": "TOP", 177 | 
"numTimeSeries": ${OUTLIER_COUNT}, 178 | "rankingMethod": "METHOD_MEAN" 179 | } 180 | } 181 | } 182 | } 183 | ], 184 | "thresholds": [], 185 | "timeshiftDuration": "0s", 186 | "yAxis": { 187 | "label": "", 188 | "scale": "LINEAR" 189 | } 190 | } 191 | }, 192 | "width": 6, 193 | "yPos": 37 194 | }, 195 | { 196 | "height": 4, 197 | "widget": { 198 | "title": "TPU VM - Network Bytes Received Outliers [MAX]", 199 | "xyChart": { 200 | "chartOptions": { 201 | "mode": "COLOR" 202 | }, 203 | "dataSets": [ 204 | { 205 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 206 | "minAlignmentPeriod": "60s", 207 | "plotType": "LINE", 208 | "targetAxis": "Y1", 209 | "timeSeriesQuery": { 210 | "timeSeriesFilter": { 211 | "aggregation": { 212 | "alignmentPeriod": "60s", 213 | "perSeriesAligner": "ALIGN_RATE" 214 | }, 215 | "filter": "metric.type=\"tpu.googleapis.com/network/received_bytes_count\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 216 | "pickTimeSeriesFilter": { 217 | "direction": "TOP", 218 | "numTimeSeries": ${OUTLIER_COUNT}, 219 | "rankingMethod": "METHOD_MAX" 220 | } 221 | } 222 | } 223 | } 224 | ], 225 | "thresholds": [], 226 | "timeshiftDuration": "0s", 227 | "yAxis": { 228 | "label": "", 229 | "scale": "LINEAR" 230 | } 231 | } 232 | }, 233 | "width": 6, 234 | "xPos": 6, 235 | "yPos": 37 236 | }, 237 | { 238 | "height": 16, 239 | "widget": { 240 | "collapsibleGroup": { 241 | "collapsed": false 242 | }, 243 | "title": "Network Bytes Received and Sent by TPU VM" 244 | }, 245 | "width": 12, 246 | "yPos": 25 247 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/dashboard_json/tensorcore-idle-duration.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Tensorcore Idle Duration Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_NONE" 19 | }, 20 | "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 17 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "Tensorcore Idle Duration Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 47 | "minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "crossSeriesReducer": "REDUCE_MEAN", 55 | "groupByFields": [ 56 | "resource.label.\"node_id\"", 57 | "resource.label.\"worker_id\"" 58 | ], 59 | "perSeriesAligner": "ALIGN_MEAN" 60 | }, 61 | "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" 
resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 62 | "pickTimeSeriesFilter": { 63 | "direction": "TOP", 64 | "numTimeSeries": ${OUTLIER_COUNT}, 65 | "rankingMethod": "METHOD_MEAN" 66 | } 67 | } 68 | } 69 | } 70 | ], 71 | "thresholds": [], 72 | "timeshiftDuration": "0s", 73 | "yAxis": { 74 | "label": "", 75 | "scale": "LINEAR" 76 | } 77 | } 78 | }, 79 | "width": 6, 80 | "yPos": 21 81 | }, 82 | { 83 | "height": 4, 84 | "widget": { 85 | "title": "Tensorcore Idle Duration Outliers [MAX]", 86 | "xyChart": { 87 | "chartOptions": { 88 | "mode": "COLOR" 89 | }, 90 | "dataSets": [ 91 | { 92 | "legendTemplate": "TPU Node: $${resource.labels.node_id} Worker ID: $${resource.labels.worker_id}", 93 | "minAlignmentPeriod": "60s", 94 | "plotType": "LINE", 95 | "targetAxis": "Y1", 96 | "timeSeriesQuery": { 97 | "timeSeriesFilter": { 98 | "aggregation": { 99 | "alignmentPeriod": "60s", 100 | "crossSeriesReducer": "REDUCE_MAX", 101 | "groupByFields": [ 102 | "resource.label.\"node_id\"", 103 | "resource.label.\"worker_id\"" 104 | ], 105 | "perSeriesAligner": "ALIGN_MAX" 106 | }, 107 | "filter": "metric.type=\"tpu.googleapis.com/tpu/tensorcore/idle_duration\" resource.type=\"tpu_worker\" resource.label.\"node_id\"=monitoring.regex.full_match(\"${NODE_PREFIX_REGEX}\")", 108 | "pickTimeSeriesFilter": { 109 | "direction": "TOP", 110 | "numTimeSeries": ${OUTLIER_COUNT}, 111 | "rankingMethod": "METHOD_MAX" 112 | } 113 | } 114 | } 115 | } 116 | ], 117 | "thresholds": [], 118 | "timeshiftDuration": "0s", 119 | "yAxis": { 120 | "label": "", 121 | "scale": "LINEAR" 122 | } 123 | } 124 | }, 125 | "width": 6, 126 | "xPos": 6, 127 | "yPos": 21 128 | }, 129 | { 130 | "height": 8, 131 | "widget": { 132 | "collapsibleGroup": { 133 | "collapsed": false 134 | }, 135 | "title": "Tensorcore Idle Duration of TPU Chip" 136 | }, 137 | "width": 12, 138 | "yPos": 17 139 | } -------------------------------------------------------------------------------- /gcp_resources/gce/resources/dashboard/monitoring_dashboard/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | 20 | variable "monitoring_dashboard_config" { 21 | type = object({ 22 | node_prefix : optional(string), 23 | outlier_count : optional(number) 24 | }) 25 | description = </default.tfstate 29 | prefix = "gce/dashboard/monitoring_dashboard" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/log_storage/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | 20 | // Valid inputs: 21 | // 1. To create stack trace bucket for 30 retention days: {"bucket_name":""} 22 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"", "retention_days":x} 23 | // 3. To not create stack trace bucket: {} 24 | variable "stack_trace_bucket_config" { 25 | type = object({ 26 | bucket_name : optional(string) 27 | retention_days : optional(number) 28 | }) 29 | validation { 30 | condition = ( 31 | (var.stack_trace_bucket_config.bucket_name == null && 32 | var.stack_trace_bucket_config.retention_days == null) || 33 | (var.stack_trace_bucket_config.bucket_name != null) 34 | ) 35 | error_message = "bucket_name is not defined for stack_trace_bucket_config." 36 | } 37 | description = </default.tfstate 29 | prefix = "gce/log_storage" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gcp_resources/gce/resources/log_storage/stack-trace-bucket.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | stack_trace_filter = "projects/${var.project_name}/logs/tpu.googleapis.com%2Fruntime_monitor AND jsonPayload.verb=stacktraceanalyzer" 17 | stack_trace_bucket_counter = var.stack_trace_bucket_config.bucket_name == null ? 0 : 1 18 | } 19 | 20 | resource "google_logging_project_bucket_config" "log_bucket" { 21 | count = local.stack_trace_bucket_counter 22 | project = var.project_name 23 | location = "global" 24 | // default retention period is 30 days 25 | retention_days = var.stack_trace_bucket_config.retention_days == null ? 
30 : var.stack_trace_bucket_config.retention_days 26 | bucket_id = var.stack_trace_bucket_config.bucket_name 27 | } 28 | 29 | resource "google_logging_project_sink" "log_sink" { 30 | count = local.stack_trace_bucket_counter 31 | project = var.project_name 32 | name = "${var.stack_trace_bucket_config.bucket_name}_sink" 33 | destination = "logging.googleapis.com/projects/${var.project_name}/locations/global/buckets/${google_logging_project_bucket_config.log_bucket[count.index].bucket_id}" 34 | filter = local.stack_trace_filter 35 | } 36 | -------------------------------------------------------------------------------- /gcp_resources/gke/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | 20 | variable "monitoring_dashboard_config" { 21 | type = object({ 22 | outlier_count : optional(number) 23 | }) 24 | description = <"} 35 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"", "retention_days":x} 36 | // 3. To not create stack trace bucket: {} 37 | variable "stack_trace_bucket_config" { 38 | type = object({ 39 | bucket_name : optional(string) 40 | retention_days : optional(number) 41 | }) 42 | validation { 43 | condition = ( 44 | (var.stack_trace_bucket_config.bucket_name == null && 45 | var.stack_trace_bucket_config.retention_days == null) || 46 | (var.stack_trace_bucket_config.bucket_name != null) 47 | ) 48 | error_message = "bucket_name is not defined for stack_trace_bucket_config." 49 | } 50 | description = </default.tfstate 29 | prefix = "gke" 30 | } 31 | } 32 | 33 | module "monitoring_dashboard" { 34 | source = "./resources/dashboard/monitoring_dashboard" 35 | project_name = var.project_name 36 | monitoring_dashboard_config = var.monitoring_dashboard_config 37 | } 38 | 39 | module "logging_dashboard" { 40 | source = "./resources/dashboard/logging_dashboard" 41 | project_name = var.project_name 42 | } 43 | 44 | module "log_storage" { 45 | source = "./resources/log_storage" 46 | project_name = var.project_name 47 | stack_trace_bucket_config = var.stack_trace_bucket_config 48 | } 49 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | data "google_project" "project" { 16 | project_id = var.project_name 17 | } 18 | 19 | // Add a dependency on log_metrics module to deploy log-based metrics before deploying logging dashboard 20 | module "log_metrics" { 21 | source = "./log_metrics" 22 | project_name = var.project_name 23 | } 24 | 25 | locals { 26 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json", 27 | { 28 | TILE_1 = templatefile("${path.module}/dashboard_json/stack-trace-counter-metric.json", 29 | { 30 | METRIC_NAME = module.log_metrics.stack_trace_counter_metric_id 31 | }), 32 | TILE_2 = templatefile("${path.module}/dashboard_json/stack-trace-log-panel.json", 33 | { 34 | PROJECT_NUMBER = data.google_project.project.number 35 | }) 36 | }) 37 | } 38 | 39 | resource "google_monitoring_dashboard" "logging_dashboard" { 40 | project = var.project_name 41 | dashboard_json = local.dashboard_json 42 | depends_on = [module.log_metrics.stack_trace_counter_metric] 43 | } 44 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/main.json: -------------------------------------------------------------------------------- 1 | { 2 | "category": "CUSTOM", 3 | "displayName": "GKE - TPU Logging Dashboard", 4 | "dashboardFilters": [ 5 | { 6 | "filterType": "RESOURCE_LABEL", 7 | "labelKey": "cluster_name", 8 | "templateVariable": "ClusterName" 9 | }, 10 | { 11 | "filterType": "USER_METADATA_LABEL", 12 | "labelKey": "jobset.sigs.k8s.io/jobset-name", 13 | "templateVariable": "JobName" 14 | } 15 | ], 16 | "mosaicLayout": { 17 | "columns": 12, 18 | "tiles": [ 19 | ${TILE_1}, 20 | ${TILE_2}, 21 | { 22 | "height": 10, 23 | "widget": { 24 | "collapsibleGroup": { 25 | "collapsed": false 26 | }, 27 | "title": "TPU VM Process Debugging" 28 | }, 29 | "width": 12 30 | } 31 | ] 32 | } 33 | } -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-counter-metric.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "timeSeriesTable": { 5 | "columnSettings": [ 6 | { 7 | "column": "location", 8 | "visible": true 9 | }, 10 | { 11 | "column": "pod", 12 | "visible": true 13 | }, 14 | { 15 | "column": "cluster", 16 | "visible": true 17 | }, 18 | { 19 | "column": "job_name", 20 | "visible": true 21 | }, 22 | { 23 | "column": "value", 24 | "visible": true 25 | } 26 | ], 27 | "dataSets": [ 28 | { 29 | "minAlignmentPeriod": "600s", 30 | "timeSeriesQuery": { 31 | "outputFullDuration": true, 32 | "timeSeriesFilter": { 33 | "aggregation": { 34 | "alignmentPeriod": "600s", 35 | "perSeriesAligner": "ALIGN_RATE" 36 | }, 37 | "filter": "metric.type=\"logging.googleapis.com/user/${METRIC_NAME}\" resource.type=\"k8s_container\" $${ClusterName} $${JobName}", 38 | "secondaryAggregation": { 39 | "alignmentPeriod": "600s", 40 | "crossSeriesReducer": "REDUCE_MEAN", 41 | "groupByFields": [ 42 | "metric.label.\"location\"", 43 | 
"metric.label.\"pod\"", 44 | "metric.label.\"cluster\"", 45 | "metric.label.\"job_name\"" 46 | ], 47 | "perSeriesAligner": "ALIGN_MEAN" 48 | } 49 | } 50 | } 51 | } 52 | ], 53 | "metricVisualization": "BAR" 54 | }, 55 | "title": "Stack Trace Log Entry Count per Period [Sorted by MEAN]" 56 | }, 57 | "width": 12 58 | } -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/dashboard_json/stack-trace-log-panel.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 6, 3 | "widget": { 4 | "logsPanel": { 5 | "filter": "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\" AND $${ClusterName}", 6 | "resourceNames": [ 7 | "projects/${PROJECT_NUMBER}" 8 | ] 9 | }, 10 | "title": "Stack Trace Logs" 11 | }, 12 | "width": 12, 13 | "yPos": 4 14 | } -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/log_metrics/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/log_metrics/stack_trace_counter.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | // Metric that counts the number of stack trace entries that match a specified filter within a specific period 16 | resource "google_logging_metric" "stack_trace_counter_metric" { 17 | name = "stack_trace_counter_gke" 18 | project = var.project_name 19 | description = "Counts the number of stack trace log entries within a specific period." 20 | filter = "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\"" 21 | metric_descriptor { 22 | metric_kind = "DELTA" 23 | value_type = "INT64" 24 | labels { 25 | key = "location" 26 | value_type = "STRING" 27 | } 28 | labels { 29 | key = "cluster" 30 | value_type = "STRING" 31 | } 32 | labels { 33 | key = "pod" 34 | value_type = "STRING" 35 | } 36 | labels { 37 | key = "job_name" 38 | value_type = "STRING" 39 | } 40 | } 41 | label_extractors = { 42 | "location" = "EXTRACT(resource.labels.location)", 43 | "cluster" = "EXTRACT(resource.labels.cluster_name)", 44 | "pod" = "EXTRACT(resource.labels.pod_name)", 45 | "job_name" = "EXTRACT(labels.k8s-pod/job-name)", 46 | } 47 | } 48 | 49 | output "stack_trace_counter_metric_id" { 50 | value = google_logging_metric.stack_trace_counter_metric.id 51 | } 52 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/logging_dashboard/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | version = ">= 4.57.0" 20 | } 21 | } 22 | /* 23 | Stores the state as an object in a configurable prefix in a pre-existing bucket on Google Cloud Storage (GCS). 24 | The bucket must exist prior to configuring the backend. 25 | For more information: https://developer.hashicorp.com/terraform/language/settings/backends/gcs 26 | */ 27 | backend "gcs" { 28 | # GCS prefix inside the bucket. terraform states are stored in an object called /default.tfstate 29 | prefix = "gke/dashboard/logging_dashboard" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | outlier_count = var.monitoring_dashboard_config.outlier_count == null ? 10 : var.monitoring_dashboard_config.outlier_count 17 | dashboard_json = templatefile("${path.module}/dashboard_json/main.json", 18 | { 19 | TILE_1 = templatefile("${path.module}/dashboard_json/cpu-utilization.json", 20 | { 21 | OUTLIER_COUNT = local.outlier_count 22 | }), 23 | TILE_2 = templatefile("${path.module}/dashboard_json/memory-usage.json", 24 | { 25 | OUTLIER_COUNT = local.outlier_count 26 | }), 27 | TILE_3 = templatefile("${path.module}/dashboard_json/accelerator-memory-used.json", 28 | { 29 | OUTLIER_COUNT = local.outlier_count 30 | }), 31 | TILE_4 = templatefile("${path.module}/dashboard_json/duty-cycle.json", 32 | { 33 | OUTLIER_COUNT = local.outlier_count 34 | }), 35 | TILE_5 = templatefile("${path.module}/dashboard_json/network-bytes.json", 36 | { 37 | OUTLIER_COUNT = local.outlier_count 38 | }), 39 | TILE_6 = templatefile("${path.module}/dashboard_json/dcn-transfer-latency.json", 40 | { 41 | OUTLIER_COUNT = local.outlier_count 42 | }), 43 | TILE_7 = templatefile("${path.module}/dashboard_json/collectives-latency.json", 44 | { 45 | OUTLIER_COUNT = local.outlier_count 46 | }), 47 | TILE_8 = templatefile("${path.module}/dashboard_json/host-to-device-transfer-latency.json", 48 | { 49 | OUTLIER_COUNT = local.outlier_count 50 | }), 51 | TILE_9 = templatefile("${path.module}/dashboard_json/device-to-host-transfer-latency.json", 52 | { 53 | OUTLIER_COUNT = local.outlier_count 54 | }) 55 | }) 56 | } 57 | 58 | resource "google_monitoring_dashboard" "monitoring_dashboard" { 59 | project = var.project_name 60 | dashboard_json = local.dashboard_json 61 | } 62 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/accelerator-memory-used.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Accelerator Memory Used Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_MEAN" 19 | }, 20 | "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 16 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "Accelerator Memory Used Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 47 | 
"minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "perSeriesAligner": "ALIGN_MEAN" 55 | }, 56 | "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 57 | "pickTimeSeriesFilter": { 58 | "direction": "TOP", 59 | "numTimeSeries": ${OUTLIER_COUNT}, 60 | "rankingMethod": "METHOD_MEAN" 61 | }, 62 | "secondaryAggregation": { 63 | "alignmentPeriod": "60s", 64 | "crossSeriesReducer": "REDUCE_MEAN", 65 | "groupByFields": [ 66 | "resource.label.\"cluster_name\"", 67 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 68 | "resource.label.\"pod_name\"" 69 | ], 70 | "perSeriesAligner": "ALIGN_NONE" 71 | } 72 | } 73 | } 74 | } 75 | ], 76 | "thresholds": [], 77 | "timeshiftDuration": "0s", 78 | "yAxis": { 79 | "label": "", 80 | "scale": "LINEAR" 81 | } 82 | } 83 | }, 84 | "width": 6, 85 | "yPos": 20 86 | }, 87 | { 88 | "height": 4, 89 | "widget": { 90 | "title": "Accelerator Memory Used Outliers [MAX]", 91 | "xyChart": { 92 | "chartOptions": { 93 | "mode": "COLOR" 94 | }, 95 | "dataSets": [ 96 | { 97 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 98 | "minAlignmentPeriod": "60s", 99 | "plotType": "LINE", 100 | "targetAxis": "Y1", 101 | "timeSeriesQuery": { 102 | "timeSeriesFilter": { 103 | "aggregation": { 104 | "alignmentPeriod": "60s", 105 | "perSeriesAligner": "ALIGN_MEAN" 106 | }, 107 | "filter": "metric.type=\"kubernetes.io/container/accelerator/memory_used\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 108 | "pickTimeSeriesFilter": { 109 | "direction": "TOP", 110 | "numTimeSeries": ${OUTLIER_COUNT}, 111 | "rankingMethod": "METHOD_MAX" 112 | }, 113 | "secondaryAggregation": { 114 | "alignmentPeriod": "60s", 115 | "crossSeriesReducer": "REDUCE_MAX", 116 | "groupByFields": [ 117 | "resource.label.\"cluster_name\"", 118 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 119 | "resource.label.\"pod_name\"" 120 | ], 121 | "perSeriesAligner": "ALIGN_NONE" 122 | } 123 | } 124 | } 125 | } 126 | ], 127 | "thresholds": [], 128 | "timeshiftDuration": "0s", 129 | "yAxis": { 130 | "label": "", 131 | "scale": "LINEAR" 132 | } 133 | } 134 | }, 135 | "width": 6, 136 | "xPos": 6, 137 | "yPos": 20 138 | }, 139 | { 140 | "height": 8, 141 | "widget": { 142 | "collapsibleGroup": { 143 | "collapsed": false 144 | }, 145 | "title": "Accelerator Memory Used by TPU Slice" 146 | }, 147 | "width": 12, 148 | "yPos": 16 149 | } 150 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/collectives-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Collectives Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "COLOR" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "HEATMAP", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_SUM", 19 | "perSeriesAligner": "ALIGN_DELTA" 20 | }, 21 | "filter": 
"metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 57 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "Collectives Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | "timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "resource.label.\"cluster_name\"", 58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 59 | "resource.label.\"pod_name\"" 60 | ], 61 | "perSeriesAligner": "ALIGN_PERCENTILE_50" 62 | }, 63 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 64 | "pickTimeSeriesFilter": { 65 | "direction": "TOP", 66 | "numTimeSeries": ${OUTLIER_COUNT}, 67 | "rankingMethod": "METHOD_MAX" 68 | } 69 | } 70 | } 71 | } 72 | ], 73 | "thresholds": [], 74 | "timeshiftDuration": "0s", 75 | "yAxis": { 76 | "label": "", 77 | "scale": "LINEAR" 78 | } 79 | } 80 | }, 81 | "width": 6, 82 | "yPos": 61 83 | }, 84 | { 85 | "height": 4, 86 | "widget": { 87 | "title": "Collectives Latency Outliers [p99]", 88 | "xyChart": { 89 | "chartOptions": { 90 | "mode": "COLOR" 91 | }, 92 | "dataSets": [ 93 | { 94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 95 | "minAlignmentPeriod": "60s", 96 | "plotType": "LINE", 97 | "targetAxis": "Y1", 98 | "timeSeriesQuery": { 99 | "timeSeriesFilter": { 100 | "aggregation": { 101 | "alignmentPeriod": "60s", 102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 103 | "groupByFields": [ 104 | "resource.label.\"cluster_name\"", 105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 106 | "resource.label.\"pod_name\"" 107 | ], 108 | "perSeriesAligner": "ALIGN_PERCENTILE_99" 109 | }, 110 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/collective_end_to_end_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 111 | "pickTimeSeriesFilter": { 112 | "direction": "TOP", 113 | "numTimeSeries": ${OUTLIER_COUNT}, 114 | "rankingMethod": "METHOD_MAX" 115 | } 116 | } 117 | } 118 | } 119 | ], 120 | "thresholds": [], 121 | "timeshiftDuration": "0s", 122 | "yAxis": { 123 | "label": "", 124 | "scale": "LINEAR" 125 | } 126 | } 127 | }, 128 | "width": 6, 129 | "xPos": 6, 130 | "yPos": 61 131 | }, 132 | { 133 | "height": 8, 134 | "widget": { 135 | "collapsibleGroup": { 136 | "collapsed": false 137 | }, 138 | "title": "Collectives Latency" 139 | }, 140 | "width": 12, 141 | "yPos": 57 142 | } 143 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/cpu-utilization.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "CPU Utilization Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_RATE" 19 | }, 20 | "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12 34 | }, 35 | { 36 | "height": 4, 37 | "widget": { 38 | "title": "CPU Utilization Outliers [MEAN]", 39 | "xyChart": { 40 | "chartOptions": { 41 | "mode": "COLOR" 42 | }, 43 | "dataSets": [ 44 | { 45 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 46 | "minAlignmentPeriod": "60s", 47 | "plotType": "LINE", 48 | "targetAxis": "Y1", 49 | "timeSeriesQuery": { 50 | "timeSeriesFilter": { 51 | "aggregation": { 52 | "alignmentPeriod": "60s", 53 | "perSeriesAligner": "ALIGN_RATE" 54 | }, 55 | "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 56 | "pickTimeSeriesFilter": { 57 | "direction": "TOP", 58 | "numTimeSeries": ${OUTLIER_COUNT}, 59 | "rankingMethod": "METHOD_MEAN" 60 | }, 61 | "secondaryAggregation": { 62 | "alignmentPeriod": "60s", 63 | "crossSeriesReducer": "REDUCE_MEAN", 64 | "groupByFields": [ 65 | "resource.label.\"cluster_name\"", 66 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 67 | "resource.label.\"pod_name\"" 68 | ], 69 | "perSeriesAligner": "ALIGN_NONE" 70 | } 71 | } 72 | } 73 | } 74 | ], 75 | "thresholds": [], 76 | "timeshiftDuration": "0s", 77 | "yAxis": { 78 | "label": "", 79 | "scale": "LINEAR" 80 | } 81 | } 82 | }, 83 | "width": 6, 84 | "yPos": 4 85 | }, 86 | { 87 | "height": 4, 88 | "widget": { 89 | "title": "CPU Utilization Outliers [MAX]", 90 | "xyChart": { 91 | "chartOptions": { 92 | "mode": "COLOR" 93 | }, 94 | "dataSets": [ 95 | { 96 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 97 | "minAlignmentPeriod": "60s", 98 | "plotType": "LINE", 99 | "targetAxis": "Y1", 100 | "timeSeriesQuery": { 101 | "timeSeriesFilter": { 102 | "aggregation": { 103 | "alignmentPeriod": "60s", 104 | "perSeriesAligner": "ALIGN_RATE" 105 | }, 106 | "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 107 | "pickTimeSeriesFilter": { 108 | "direction": "TOP", 109 | "numTimeSeries": ${OUTLIER_COUNT}, 110 | "rankingMethod": "METHOD_MAX" 111 | }, 112 | "secondaryAggregation": { 113 | "alignmentPeriod": "60s", 114 | "crossSeriesReducer": "REDUCE_MAX", 115 | "groupByFields": [ 116 | "resource.label.\"cluster_name\"", 117 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 118 | "resource.label.\"pod_name\"" 119 | ], 120 | "perSeriesAligner": "ALIGN_NONE" 121 | } 122 | } 123 | } 124 | } 125 | ], 126 | "thresholds": [], 127 | "timeshiftDuration": "0s", 128 | 
"yAxis": { 129 | "label": "", 130 | "scale": "LINEAR" 131 | } 132 | } 133 | }, 134 | "width": 6, 135 | "xPos": 6, 136 | "yPos": 4 137 | }, 138 | { 139 | "height": 8, 140 | "widget": { 141 | "collapsibleGroup": { 142 | "collapsed": false 143 | }, 144 | "title": "CPU Utilization by TPU Slice" 145 | }, 146 | "width": 12 147 | } 148 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/dcn-transfer-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "DCN Transfer Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "COLOR" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "HEATMAP", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_SUM", 19 | "perSeriesAligner": "ALIGN_DELTA" 20 | }, 21 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 49 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "DCN Transfer Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | "timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "resource.label.\"cluster_name\"", 58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 59 | "resource.label.\"pod_name\"" 60 | ], 61 | "perSeriesAligner": "ALIGN_PERCENTILE_50" 62 | }, 63 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 64 | "pickTimeSeriesFilter": { 65 | "direction": "TOP", 66 | "numTimeSeries": ${OUTLIER_COUNT}, 67 | "rankingMethod": "METHOD_MAX" 68 | } 69 | } 70 | } 71 | } 72 | ], 73 | "thresholds": [], 74 | "timeshiftDuration": "0s", 75 | "yAxis": { 76 | "label": "", 77 | "scale": "LINEAR" 78 | } 79 | } 80 | }, 81 | "width": 6, 82 | "yPos": 53 83 | }, 84 | { 85 | "height": 4, 86 | "widget": { 87 | "title": "DCN Transfer Latency Outliers [p99]", 88 | "xyChart": { 89 | "chartOptions": { 90 | "mode": "COLOR" 91 | }, 92 | "dataSets": [ 93 | { 94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 95 | "minAlignmentPeriod": "60s", 96 | "plotType": "LINE", 97 | "targetAxis": "Y1", 98 | "timeSeriesQuery": { 99 | "timeSeriesFilter": { 100 | "aggregation": { 101 | "alignmentPeriod": "60s", 102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 103 | "groupByFields": [ 104 | "resource.label.\"cluster_name\"", 105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 106 | "resource.label.\"pod_name\"" 107 | ], 
108 | "perSeriesAligner": "ALIGN_PERCENTILE_99" 109 | }, 110 | "filter": "metric.type=\"kubernetes.io/container/multislice/network/dcn_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 111 | "pickTimeSeriesFilter": { 112 | "direction": "TOP", 113 | "numTimeSeries": ${OUTLIER_COUNT}, 114 | "rankingMethod": "METHOD_MAX" 115 | } 116 | } 117 | } 118 | } 119 | ], 120 | "thresholds": [], 121 | "timeshiftDuration": "0s", 122 | "yAxis": { 123 | "label": "", 124 | "scale": "LINEAR" 125 | } 126 | } 127 | }, 128 | "width": 6, 129 | "xPos": 6, 130 | "yPos": 53 131 | }, 132 | { 133 | "height": 8, 134 | "widget": { 135 | "collapsibleGroup": { 136 | "collapsed": false 137 | }, 138 | "title": "DCN Transfer Latency" 139 | }, 140 | "width": 12, 141 | "yPos": 49 142 | } 143 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/device-to-host-transfer-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Device To Host Transfer Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "COLOR" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "HEATMAP", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_SUM", 19 | "perSeriesAligner": "ALIGN_DELTA" 20 | }, 21 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 73 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "Device To Host Transfer Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | "timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "resource.label.\"cluster_name\"", 58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 59 | "resource.label.\"pod_name\"" 60 | ], 61 | "perSeriesAligner": "ALIGN_PERCENTILE_50" 62 | }, 63 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 64 | "pickTimeSeriesFilter": { 65 | "direction": "TOP", 66 | "numTimeSeries": ${OUTLIER_COUNT}, 67 | "rankingMethod": "METHOD_MAX" 68 | } 69 | } 70 | } 71 | } 72 | ], 73 | "thresholds": [], 74 | "timeshiftDuration": "0s", 75 | "yAxis": { 76 | "label": "", 77 | "scale": "LINEAR" 78 | } 79 | } 80 | }, 81 | "width": 6, 82 | "yPos": 77 83 | }, 84 | { 85 | "height": 4, 86 | "widget": { 87 | "title": "Device To Host Transfer Latency Outliers [p99]", 88 | "xyChart": { 89 | "chartOptions": { 90 | "mode": "COLOR" 91 | }, 92 | "dataSets": [ 93 | { 94 | "legendTemplate": "Cluster: 
$${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 95 | "minAlignmentPeriod": "60s", 96 | "plotType": "LINE", 97 | "targetAxis": "Y1", 98 | "timeSeriesQuery": { 99 | "timeSeriesFilter": { 100 | "aggregation": { 101 | "alignmentPeriod": "60s", 102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 103 | "groupByFields": [ 104 | "resource.label.\"cluster_name\"", 105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 106 | "resource.label.\"pod_name\"" 107 | ], 108 | "perSeriesAligner": "ALIGN_PERCENTILE_99" 109 | }, 110 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/device_to_host_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 111 | "pickTimeSeriesFilter": { 112 | "direction": "TOP", 113 | "numTimeSeries": ${OUTLIER_COUNT}, 114 | "rankingMethod": "METHOD_MAX" 115 | } 116 | } 117 | } 118 | } 119 | ], 120 | "thresholds": [], 121 | "timeshiftDuration": "0s", 122 | "yAxis": { 123 | "label": "", 124 | "scale": "LINEAR" 125 | } 126 | } 127 | }, 128 | "width": 6, 129 | "xPos": 6, 130 | "yPos": 77 131 | }, 132 | { 133 | "height": 8, 134 | "widget": { 135 | "collapsibleGroup": { 136 | "collapsed": false 137 | }, 138 | "title": "Device To Host Transfer Latency" 139 | }, 140 | "width": 12, 141 | "yPos": 73 142 | } 143 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/duty-cycle.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Duty Cycle Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_MEAN" 19 | }, 20 | "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 24 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "Duty Cycle Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 47 | "minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "perSeriesAligner": "ALIGN_MEAN" 55 | }, 56 | "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 57 | "pickTimeSeriesFilter": { 58 | "direction": "TOP", 59 | "numTimeSeries": ${OUTLIER_COUNT}, 60 | "rankingMethod": "METHOD_MEAN" 61 | }, 62 | "secondaryAggregation": { 63 | "alignmentPeriod": "60s", 64 | "crossSeriesReducer": "REDUCE_MEAN", 65 | "groupByFields": [ 66 | "resource.label.\"cluster_name\"", 67 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 68 | 
"resource.label.\"pod_name\"" 69 | ], 70 | "perSeriesAligner": "ALIGN_NONE" 71 | } 72 | } 73 | } 74 | } 75 | ], 76 | "thresholds": [], 77 | "timeshiftDuration": "0s", 78 | "yAxis": { 79 | "label": "", 80 | "scale": "LINEAR" 81 | } 82 | } 83 | }, 84 | "width": 6, 85 | "yPos": 28 86 | }, 87 | { 88 | "height": 4, 89 | "widget": { 90 | "title": "Duty Cycle Outliers [MAX]", 91 | "xyChart": { 92 | "chartOptions": { 93 | "mode": "COLOR" 94 | }, 95 | "dataSets": [ 96 | { 97 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 98 | "minAlignmentPeriod": "60s", 99 | "plotType": "LINE", 100 | "targetAxis": "Y1", 101 | "timeSeriesQuery": { 102 | "timeSeriesFilter": { 103 | "aggregation": { 104 | "alignmentPeriod": "60s", 105 | "perSeriesAligner": "ALIGN_MEAN" 106 | }, 107 | "filter": "metric.type=\"kubernetes.io/container/accelerator/duty_cycle\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 108 | "pickTimeSeriesFilter": { 109 | "direction": "TOP", 110 | "numTimeSeries": ${OUTLIER_COUNT}, 111 | "rankingMethod": "METHOD_MAX" 112 | }, 113 | "secondaryAggregation": { 114 | "alignmentPeriod": "60s", 115 | "crossSeriesReducer": "REDUCE_MAX", 116 | "groupByFields": [ 117 | "resource.label.\"cluster_name\"", 118 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 119 | "resource.label.\"pod_name\"" 120 | ], 121 | "perSeriesAligner": "ALIGN_NONE" 122 | } 123 | } 124 | } 125 | } 126 | ], 127 | "thresholds": [], 128 | "timeshiftDuration": "0s", 129 | "yAxis": { 130 | "label": "", 131 | "scale": "LINEAR" 132 | } 133 | } 134 | }, 135 | "width": 6, 136 | "xPos": 6, 137 | "yPos": 28 138 | }, 139 | { 140 | "height": 8, 141 | "widget": { 142 | "collapsibleGroup": { 143 | "collapsed": false 144 | }, 145 | "title": "Duty Cycle by TPU Slice" 146 | }, 147 | "width": 12, 148 | "yPos": 24 149 | } 150 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/host-to-device-transfer-latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Host To Device Transfer Latency Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "COLOR" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "HEATMAP", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "crossSeriesReducer": "REDUCE_SUM", 19 | "perSeriesAligner": "ALIGN_DELTA" 20 | }, 21 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 22 | } 23 | } 24 | } 25 | ], 26 | "thresholds": [], 27 | "timeshiftDuration": "0s", 28 | "yAxis": { 29 | "label": "", 30 | "scale": "LINEAR" 31 | } 32 | } 33 | }, 34 | "width": 12, 35 | "yPos": 65 36 | }, 37 | { 38 | "height": 4, 39 | "widget": { 40 | "title": "Host To Device Transfer Latency Outliers [p50]", 41 | "xyChart": { 42 | "chartOptions": { 43 | "mode": "COLOR" 44 | }, 45 | "dataSets": [ 46 | { 47 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 48 | "minAlignmentPeriod": "60s", 49 | "plotType": "LINE", 50 | "targetAxis": "Y1", 51 | 
"timeSeriesQuery": { 52 | "timeSeriesFilter": { 53 | "aggregation": { 54 | "alignmentPeriod": "60s", 55 | "crossSeriesReducer": "REDUCE_PERCENTILE_50", 56 | "groupByFields": [ 57 | "resource.label.\"cluster_name\"", 58 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 59 | "resource.label.\"pod_name\"" 60 | ], 61 | "perSeriesAligner": "ALIGN_PERCENTILE_50" 62 | }, 63 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 64 | "pickTimeSeriesFilter": { 65 | "direction": "TOP", 66 | "numTimeSeries": ${OUTLIER_COUNT}, 67 | "rankingMethod": "METHOD_MAX" 68 | } 69 | } 70 | } 71 | } 72 | ], 73 | "thresholds": [], 74 | "timeshiftDuration": "0s", 75 | "yAxis": { 76 | "label": "", 77 | "scale": "LINEAR" 78 | } 79 | } 80 | }, 81 | "width": 6, 82 | "yPos": 69 83 | }, 84 | { 85 | "height": 4, 86 | "widget": { 87 | "title": "Host To Device Transfer Latency Outliers [p99]", 88 | "xyChart": { 89 | "chartOptions": { 90 | "mode": "COLOR" 91 | }, 92 | "dataSets": [ 93 | { 94 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 95 | "minAlignmentPeriod": "60s", 96 | "plotType": "LINE", 97 | "targetAxis": "Y1", 98 | "timeSeriesQuery": { 99 | "timeSeriesFilter": { 100 | "aggregation": { 101 | "alignmentPeriod": "60s", 102 | "crossSeriesReducer": "REDUCE_PERCENTILE_99", 103 | "groupByFields": [ 104 | "resource.label.\"cluster_name\"", 105 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 106 | "resource.label.\"pod_name\"" 107 | ], 108 | "perSeriesAligner": "ALIGN_PERCENTILE_99" 109 | }, 110 | "filter": "metric.type=\"kubernetes.io/container/multislice/accelerator/host_to_device_transfer_latencies\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 111 | "pickTimeSeriesFilter": { 112 | "direction": "TOP", 113 | "numTimeSeries": ${OUTLIER_COUNT}, 114 | "rankingMethod": "METHOD_MAX" 115 | } 116 | } 117 | } 118 | } 119 | ], 120 | "thresholds": [], 121 | "timeshiftDuration": "0s", 122 | "yAxis": { 123 | "label": "", 124 | "scale": "LINEAR" 125 | } 126 | } 127 | }, 128 | "width": 6, 129 | "xPos": 6, 130 | "yPos": 69 131 | }, 132 | { 133 | "height": 8, 134 | "widget": { 135 | "collapsibleGroup": { 136 | "collapsed": false 137 | }, 138 | "title": "Host To Device Transfer Latency" 139 | }, 140 | "width": 12, 141 | "yPos": 65 142 | } 143 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/main.json: -------------------------------------------------------------------------------- 1 | { 2 | "category": "CUSTOM", 3 | "displayName": "GKE - TPU Monitoring Dashboard", 4 | "dashboardFilters": [ 5 | { 6 | "filterType": "RESOURCE_LABEL", 7 | "labelKey": "cluster_name", 8 | "templateVariable": "ClusterName" 9 | }, 10 | { 11 | "filterType": "USER_METADATA_LABEL", 12 | "labelKey": "jobset.sigs.k8s.io/jobset-name", 13 | "templateVariable": "JobName" 14 | }, 15 | { 16 | "filterType": "RESOURCE_LABEL", 17 | "labelKey": "pod_name", 18 | "templateVariable": "PodName" 19 | } 20 | ], 21 | "mosaicLayout": { 22 | "columns": 12, 23 | "tiles": [ 24 | ${TILE_1}, 25 | ${TILE_2}, 26 | ${TILE_3}, 27 | ${TILE_4}, 28 | ${TILE_5}, 29 | { 30 | "height": 1, 31 | "widget": { 32 | "title": "Megascale Metrics", 33 | "sectionHeader": { 34 | "subtitle": "These metrics 
are available in GKE version 1.29.1-gke.1016000 or later. TPU workload must use JAX version 0.4.24.", 35 | "dividerBelow": false 36 | } 37 | }, 38 | "width": 12, 39 | "yPos": 48 40 | }, 41 | ${TILE_6}, 42 | ${TILE_7}, 43 | ${TILE_8}, 44 | ${TILE_9} 45 | ] 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/memory-usage.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Memory Usage Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_SUM" 19 | }, 20 | "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 8 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "Memory Usage Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 47 | "minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "crossSeriesReducer": "REDUCE_MEAN", 55 | "groupByFields": [ 56 | "resource.label.\"cluster_name\"", 57 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 58 | "resource.label.\"pod_name\"" 59 | ], 60 | "perSeriesAligner": "ALIGN_MEAN" 61 | }, 62 | "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 63 | "pickTimeSeriesFilter": { 64 | "direction": "TOP", 65 | "numTimeSeries": ${OUTLIER_COUNT}, 66 | "rankingMethod": "METHOD_MEAN" 67 | } 68 | } 69 | } 70 | } 71 | ], 72 | "thresholds": [], 73 | "timeshiftDuration": "0s", 74 | "yAxis": { 75 | "label": "", 76 | "scale": "LINEAR" 77 | } 78 | } 79 | }, 80 | "width": 6, 81 | "yPos": 12 82 | }, 83 | { 84 | "height": 4, 85 | "widget": { 86 | "title": "Memory Usage Outliers [MAX]", 87 | "xyChart": { 88 | "chartOptions": { 89 | "mode": "COLOR" 90 | }, 91 | "dataSets": [ 92 | { 93 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 94 | "minAlignmentPeriod": "60s", 95 | "plotType": "LINE", 96 | "targetAxis": "Y1", 97 | "timeSeriesQuery": { 98 | "timeSeriesFilter": { 99 | "aggregation": { 100 | "alignmentPeriod": "60s", 101 | "crossSeriesReducer": "REDUCE_MAX", 102 | "groupByFields": [ 103 | "resource.label.\"cluster_name\"", 104 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 105 | "resource.label.\"pod_name\"" 106 | ], 107 | "perSeriesAligner": "ALIGN_MAX" 108 | }, 109 | "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" $${ClusterName} $${JobName} $${PodName}", 110 | 
"pickTimeSeriesFilter": { 111 | "direction": "TOP", 112 | "numTimeSeries": ${OUTLIER_COUNT}, 113 | "rankingMethod": "METHOD_MAX" 114 | } 115 | } 116 | } 117 | } 118 | ], 119 | "thresholds": [], 120 | "timeshiftDuration": "0s", 121 | "yAxis": { 122 | "label": "", 123 | "scale": "LINEAR" 124 | } 125 | } 126 | }, 127 | "width": 6, 128 | "xPos": 6, 129 | "yPos": 12 130 | }, 131 | { 132 | "height": 8, 133 | "widget": { 134 | "collapsibleGroup": { 135 | "collapsed": false 136 | }, 137 | "title": "Memory Usage by TPU Slice" 138 | }, 139 | "width": 12, 140 | "yPos": 8 141 | } 142 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/dashboard_json/network-bytes.json: -------------------------------------------------------------------------------- 1 | { 2 | "height": 4, 3 | "widget": { 4 | "title": "Network Bytes Sent Stats", 5 | "xyChart": { 6 | "chartOptions": { 7 | "mode": "STATS" 8 | }, 9 | "dataSets": [ 10 | { 11 | "minAlignmentPeriod": "60s", 12 | "plotType": "LINE", 13 | "targetAxis": "Y1", 14 | "timeSeriesQuery": { 15 | "timeSeriesFilter": { 16 | "aggregation": { 17 | "alignmentPeriod": "60s", 18 | "perSeriesAligner": "ALIGN_RATE" 19 | }, 20 | "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}" 21 | } 22 | } 23 | } 24 | ], 25 | "thresholds": [], 26 | "timeshiftDuration": "0s", 27 | "yAxis": { 28 | "label": "", 29 | "scale": "LINEAR" 30 | } 31 | } 32 | }, 33 | "width": 12, 34 | "yPos": 32 35 | }, 36 | { 37 | "height": 4, 38 | "widget": { 39 | "title": "Network Bytes Sent Outliers [MEAN]", 40 | "xyChart": { 41 | "chartOptions": { 42 | "mode": "COLOR" 43 | }, 44 | "dataSets": [ 45 | { 46 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 47 | "minAlignmentPeriod": "60s", 48 | "plotType": "LINE", 49 | "targetAxis": "Y1", 50 | "timeSeriesQuery": { 51 | "timeSeriesFilter": { 52 | "aggregation": { 53 | "alignmentPeriod": "60s", 54 | "perSeriesAligner": "ALIGN_RATE" 55 | }, 56 | "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}", 57 | "pickTimeSeriesFilter": { 58 | "direction": "TOP", 59 | "numTimeSeries": ${OUTLIER_COUNT}, 60 | "rankingMethod": "METHOD_MEAN" 61 | }, 62 | "secondaryAggregation": { 63 | "alignmentPeriod": "60s", 64 | "crossSeriesReducer": "REDUCE_MEAN", 65 | "groupByFields": [ 66 | "resource.label.\"cluster_name\"", 67 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 68 | "resource.label.\"pod_name\"" 69 | ], 70 | "perSeriesAligner": "ALIGN_MEAN" 71 | } 72 | } 73 | } 74 | } 75 | ], 76 | "thresholds": [], 77 | "timeshiftDuration": "0s", 78 | "yAxis": { 79 | "label": "", 80 | "scale": "LINEAR" 81 | } 82 | } 83 | }, 84 | "width": 6, 85 | "yPos": 36 86 | }, 87 | { 88 | "height": 4, 89 | "widget": { 90 | "title": "Network Bytes Sent Outliers [MAX]", 91 | "xyChart": { 92 | "chartOptions": { 93 | "mode": "COLOR" 94 | }, 95 | "dataSets": [ 96 | { 97 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 98 | "minAlignmentPeriod": "60s", 99 | "plotType": "LINE", 100 | "targetAxis": "Y1", 101 | "timeSeriesQuery": { 102 | "timeSeriesFilter": { 103 | "aggregation": { 104 | "alignmentPeriod": "60s", 105 
| "perSeriesAligner": "ALIGN_RATE" 106 | }, 107 | "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}", 108 | "pickTimeSeriesFilter": { 109 | "direction": "TOP", 110 | "numTimeSeries": ${OUTLIER_COUNT}, 111 | "rankingMethod": "METHOD_MAX" 112 | }, 113 | "secondaryAggregation": { 114 | "alignmentPeriod": "60s", 115 | "crossSeriesReducer": "REDUCE_MAX", 116 | "groupByFields": [ 117 | "resource.label.\"cluster_name\"", 118 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 119 | "resource.label.\"pod_name\"" 120 | ], 121 | "perSeriesAligner": "ALIGN_MAX" 122 | } 123 | } 124 | } 125 | } 126 | ], 127 | "thresholds": [], 128 | "timeshiftDuration": "0s", 129 | "yAxis": { 130 | "label": "", 131 | "scale": "LINEAR" 132 | } 133 | } 134 | }, 135 | "width": 6, 136 | "xPos": 6, 137 | "yPos": 36 138 | }, 139 | { 140 | "height": 4, 141 | "widget": { 142 | "title": "Network Bytes Received Stats", 143 | "xyChart": { 144 | "chartOptions": { 145 | "mode": "STATS" 146 | }, 147 | "dataSets": [ 148 | { 149 | "minAlignmentPeriod": "60s", 150 | "plotType": "LINE", 151 | "targetAxis": "Y1", 152 | "timeSeriesQuery": { 153 | "timeSeriesFilter": { 154 | "aggregation": { 155 | "alignmentPeriod": "60s", 156 | "perSeriesAligner": "ALIGN_RATE" 157 | }, 158 | "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}" 159 | } 160 | } 161 | } 162 | ], 163 | "thresholds": [], 164 | "timeshiftDuration": "0s", 165 | "yAxis": { 166 | "label": "", 167 | "scale": "LINEAR" 168 | } 169 | } 170 | }, 171 | "width": 12, 172 | "yPos": 40 173 | }, 174 | { 175 | "height": 4, 176 | "widget": { 177 | "title": "Network Bytes Received Outliers [MEAN]", 178 | "xyChart": { 179 | "chartOptions": { 180 | "mode": "COLOR" 181 | }, 182 | "dataSets": [ 183 | { 184 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 185 | "minAlignmentPeriod": "60s", 186 | "plotType": "LINE", 187 | "targetAxis": "Y1", 188 | "timeSeriesQuery": { 189 | "timeSeriesFilter": { 190 | "aggregation": { 191 | "alignmentPeriod": "60s", 192 | "perSeriesAligner": "ALIGN_RATE" 193 | }, 194 | "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}", 195 | "pickTimeSeriesFilter": { 196 | "direction": "TOP", 197 | "numTimeSeries": ${OUTLIER_COUNT}, 198 | "rankingMethod": "METHOD_MEAN" 199 | }, 200 | "secondaryAggregation": { 201 | "alignmentPeriod": "60s", 202 | "crossSeriesReducer": "REDUCE_MEAN", 203 | "groupByFields": [ 204 | "resource.label.\"cluster_name\"", 205 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 206 | "resource.label.\"pod_name\"" 207 | ], 208 | "perSeriesAligner": "ALIGN_MEAN" 209 | } 210 | } 211 | } 212 | } 213 | ], 214 | "thresholds": [], 215 | "timeshiftDuration": "0s", 216 | "yAxis": { 217 | "label": "", 218 | "scale": "LINEAR" 219 | } 220 | } 221 | }, 222 | "width": 6, 223 | "yPos": 44 224 | }, 225 | { 226 | "height": 4, 227 | "widget": { 228 | "title": "Network Bytes Received Outliers [MAX]", 229 | "xyChart": { 230 | "chartOptions": { 231 | "mode": "COLOR" 232 | }, 233 | "dataSets": [ 234 | { 235 | "legendTemplate": "Cluster: $${resource.labels.cluster_name} Job: $${metadata.user_labels\\.jobset\\.sigs\\.k8s\\.io/jobset-name} Pod: $${resource.labels.pod_name}", 236 | 
"minAlignmentPeriod": "60s", 237 | "plotType": "LINE", 238 | "targetAxis": "Y1", 239 | "timeSeriesQuery": { 240 | "timeSeriesFilter": { 241 | "aggregation": { 242 | "alignmentPeriod": "60s", 243 | "perSeriesAligner": "ALIGN_RATE" 244 | }, 245 | "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\" $${ClusterName} $${JobName} $${PodName}", 246 | "pickTimeSeriesFilter": { 247 | "direction": "TOP", 248 | "numTimeSeries": ${OUTLIER_COUNT}, 249 | "rankingMethod": "METHOD_MAX" 250 | }, 251 | "secondaryAggregation": { 252 | "alignmentPeriod": "60s", 253 | "crossSeriesReducer": "REDUCE_MAX", 254 | "groupByFields": [ 255 | "resource.label.\"cluster_name\"", 256 | "metadata.user_labels.\"jobset.sigs.k8s.io/jobset-name\"", 257 | "resource.label.\"pod_name\"" 258 | ], 259 | "perSeriesAligner": "ALIGN_MAX" 260 | } 261 | } 262 | } 263 | } 264 | ], 265 | "thresholds": [], 266 | "timeshiftDuration": "0s", 267 | "yAxis": { 268 | "label": "", 269 | "scale": "LINEAR" 270 | } 271 | } 272 | }, 273 | "width": 6, 274 | "xPos": 6, 275 | "yPos": 44 276 | }, 277 | { 278 | "height": 16, 279 | "widget": { 280 | "collapsibleGroup": { 281 | "collapsed": false 282 | }, 283 | "title": "Network Bytes Sent and Received by TPU Slice" 284 | }, 285 | "width": 12, 286 | "yPos": 32 287 | } 288 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/dashboard/monitoring_dashboard/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | 20 | variable "monitoring_dashboard_config" { 21 | type = object({ 22 | outlier_count : optional(number) 23 | }) 24 | description = </default.tfstate 29 | prefix = "gke/dashboard/monitoring_dashboard" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/log_storage/input.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_name" { 16 | type = string 17 | description = "Name of gcp project" 18 | } 19 | 20 | // Valid inputs: 21 | // 1. 
To create stack trace bucket for 30 retention days: {"bucket_name":""} 22 | // 2. To create stack trace bucket for x retention days: {"bucket_name":"", "retention_days":x} 23 | // 3. To not create stack trace bucket: {} 24 | variable "stack_trace_bucket_config" { 25 | type = object({ 26 | bucket_name : optional(string) 27 | retention_days : optional(number) 28 | }) 29 | validation { 30 | condition = ( 31 | (var.stack_trace_bucket_config.bucket_name == null && 32 | var.stack_trace_bucket_config.retention_days == null) || 33 | (var.stack_trace_bucket_config.bucket_name != null) 34 | ) 35 | error_message = "bucket_name is not defined for stack_trace_bucket_config." 36 | } 37 | description = </default.tfstate 29 | prefix = "gke/log_storage" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gcp_resources/gke/resources/log_storage/stack-trace-bucket.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | stack_trace_filter = "resource.type=\"k8s_container\" AND resource.labels.container_name=~\"[a-z-0-9]*stacktrace[a-z-0-9]*\"" 17 | stack_trace_bucket_counter = var.stack_trace_bucket_config.bucket_name == null ? 0 : 1 18 | } 19 | 20 | resource "google_logging_project_bucket_config" "log_bucket" { 21 | count = local.stack_trace_bucket_counter 22 | project = var.project_name 23 | location = "global" 24 | // default retention period is 30 days 25 | retention_days = var.stack_trace_bucket_config.retention_days == null ? 
30 : var.stack_trace_bucket_config.retention_days 26 | bucket_id = var.stack_trace_bucket_config.bucket_name 27 | } 28 | 29 | resource "google_logging_project_sink" "log_sink" { 30 | count = local.stack_trace_bucket_counter 31 | project = var.project_name 32 | name = "${var.stack_trace_bucket_config.bucket_name}_sink" 33 | destination = "logging.googleapis.com/projects/${var.project_name}/locations/global/buckets/${google_logging_project_bucket_config.log_bucket[count.index].bucket_id}" 34 | filter = local.stack_trace_filter 35 | } 36 | -------------------------------------------------------------------------------- /pip_package/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 16 | # Changelog 17 | 18 | 36 | 37 | ## [0.1.5] - 2023-12-08 38 | * Raise exception without waiting for the daemon thread to terminate 39 | * Remove sending user signal in `stop_debugging()` to avoid unnecessary stack traces related to `cloud-tpu-diagnostics` package 40 | 41 | ## [0.1.4] - 2023-11-07 42 | * Gracefully exiting daemon threads 43 | * Fixed the URL for PyPI package in README 44 | 45 | ## [0.1.3] - 2023-11-01 46 | * Fixing issue with using signals and threads together in a program 47 | 48 | ## [0.1.2] - 2023-09-20 49 | * Improved stack trace readability and clarity by adding a message for more information 50 | 51 | ## [0.1.1] - 2023-06-21 52 | * Bug Fixes 53 | * Fixes dumping of stack traces on the console when exceptions like `AssertionError`, `tensorflow.python.framework.errors_impl.NotFoundError` are thrown when `collect_stack_trace=True` and `stack_trace_to_cloud=False`. 54 | * Updated README 55 | 56 | ## [0.1.0] - 2023-06-08 57 | * Initial release of cloud-tpu-diagnostics PyPI package 58 | * FEATURE: Contains debug module to collect stack traces on faults 59 | 60 | [0.1.5]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.4...v0.1.5 61 | [0.1.4]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.3...v0.1.4 62 | [0.1.3]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.2...v0.1.3 63 | [0.1.2]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.1...v0.1.2 64 | [0.1.1]: https://github.com/google/cloud-tpu-monitoring-debugging/compare/v0.1.0...v0.1.1 65 | [0.1.0]: https://github.com/google/cloud-tpu-monitoring-debugging/releases/tag/v0.1.0 66 | -------------------------------------------------------------------------------- /pip_package/README.md: -------------------------------------------------------------------------------- 1 | 16 | # Cloud TPU Diagnostics 17 | 18 | This is a comprehensive library to monitor, debug and profile the jobs running on Cloud TPU. 19 | To learn about Cloud TPU, refer to the [full documentation](https://cloud.google.com/tpu/docs/intro-to-tpu). 20 | 21 | ## Features 22 | ### 1. Debugging 23 | #### 1.1 Collect Stack Traces 24 | This module will dump the python traces when a fault such as Segmentation fault, Floating-point exception, Illegal operation exception occurs in the program. Additionally, it will also periodically collect stack traces to help debug when a program running on Cloud TPU is stuck or hung somewhere. 
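For quick reference, the snippet below is a minimal end-to-end sketch assembled from the configuration objects documented in the Usage section that follows; `run_job(...)` is a hypothetical placeholder for your own workload.

```
from cloud_tpu_diagnostics import diagnostic
from cloud_tpu_diagnostics.configuration import debug_configuration
from cloud_tpu_diagnostics.configuration import diagnostic_configuration
from cloud_tpu_diagnostics.configuration import stack_trace_configuration

# Collect stack traces on faults (and periodically) and upload them to cloud.
stack_trace_config = stack_trace_configuration.StackTraceConfig(
    collect_stack_trace=True,
    stack_trace_to_cloud=True)
debug_config = debug_configuration.DebugConfig(
    stack_trace_config=stack_trace_config)
diagnostic_config = diagnostic_configuration.DiagnosticConfig(
    debug_config=debug_config)

with diagnostic.diagnose(diagnostic_config):
    run_job(...)  # hypothetical placeholder for the TPU workload
```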
25 | 26 | ## Installation 27 | To install the package, run the following command on the TPU VM: 28 | 29 | ``` 30 | pip install cloud-tpu-diagnostics 31 | ``` 32 | 33 | ## Usage 34 | To use this package, first import the modules: 35 | 36 | ``` 37 | from cloud_tpu_diagnostics import diagnostic 38 | from cloud_tpu_diagnostics.configuration import debug_configuration 39 | from cloud_tpu_diagnostics.configuration import diagnostic_configuration 40 | from cloud_tpu_diagnostics.configuration import stack_trace_configuration 41 | ``` 42 | 43 | Then, create a configuration object for stack traces. The module will only collect stack traces when the `collect_stack_trace` parameter is set to `True`. The following scenarios are currently supported: 44 | 45 | ##### Scenario 1: Do not collect stack traces on faults 46 | 47 | ``` 48 | stack_trace_config = stack_trace_configuration.StackTraceConfig( 49 | collect_stack_trace=False) 50 | ``` 51 | This configuration disables stack trace collection in the event of a fault or process hang. 52 | 53 | ##### Scenario 2: Collect stack traces on faults and display on console 54 | 55 | ``` 56 | stack_trace_config = stack_trace_configuration.StackTraceConfig( 57 | collect_stack_trace=True, 58 | stack_trace_to_cloud=False) 59 | ``` 60 | If there is a fault or process hang, this configuration will display the stack traces on the console (stderr). 61 | 62 | ##### Scenario 3: Collect stack traces on faults and upload to cloud 63 | 64 | ``` 65 | stack_trace_config = stack_trace_configuration.StackTraceConfig( 66 | collect_stack_trace=True, 67 | stack_trace_to_cloud=True) 68 | ``` 69 | If there is a fault or process hang, this configuration will temporarily collect stack traces inside the `/tmp/debugging` directory on the TPU host. Additionally, the traces collected on the TPU host will be uploaded to Google Cloud Logging, making it easier to troubleshoot and fix problems. You can view the traces in [Logs Explorer](https://cloud.google.com/logging/docs/view/logs-explorer-interface) using the following query: 70 | 71 | ``` 72 | logName="projects//logs/tpu.googleapis.com%2Fruntime_monitor" 73 | jsonPayload.verb="stacktraceanalyzer" 74 | ``` 75 | 76 | By default, stack traces are collected every 10 minutes. To change the interval between two consecutive stack trace collections, add the following configuration: 77 | 78 | ``` 79 | stack_trace_config = stack_trace_configuration.StackTraceConfig( 80 | collect_stack_trace=True, 81 | stack_trace_to_cloud=True, 82 | stack_trace_interval_seconds=300) 83 | ``` 84 | This configuration will collect stack traces on the cloud every 5 minutes. 85 | 86 | Then, create a configuration object for debugging. 87 | 88 | ``` 89 | debug_config = debug_configuration.DebugConfig( 90 | stack_trace_config=stack_trace_config) 91 | ``` 92 | 93 | Then, create a configuration object for diagnostics. 94 | 95 | ``` 96 | diagnostic_config = diagnostic_configuration.DiagnosticConfig( 97 | debug_config=debug_config) 98 | ``` 99 | 100 | Finally, call the `diagnose()` method in a `with` statement and wrap the code for which you want to collect stack traces inside the context manager. 101 | 102 | ``` 103 | with diagnostic.diagnose(diagnostic_config): 104 | run_job(...)
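# `run_job(...)` is a hypothetical placeholder for your own workload. When the
# `with` block exits normally, stack trace collection is stopped and the daemon
# thread used for periodic collection is shut down gracefully.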
105 | ``` -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from cloud_tpu_diagnostics import configuration 16 | from cloud_tpu_diagnostics import diagnostic 17 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from cloud_tpu_diagnostics.src.config import debug_configuration 16 | from cloud_tpu_diagnostics.src.config import diagnostic_configuration 17 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration 18 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/diagnostic.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from cloud_tpu_diagnostics.src.diagnose import diagnose 16 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/config/debug_configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import dataclasses 16 | from typing import Optional 17 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration 18 | 19 | 20 | @dataclasses.dataclass 21 | class DebugConfig: 22 | """Configuration for debugging. 23 | 24 | Attributes: 25 | stack_trace_config: config object for stack trace collection, default is 26 | None 27 | """ 28 | 29 | stack_trace_config: Optional[stack_trace_configuration.StackTraceConfig] = ( 30 | None 31 | ) 32 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/config/diagnostic_configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import dataclasses 16 | from typing import Optional 17 | from cloud_tpu_diagnostics.src.config import debug_configuration 18 | 19 | 20 | @dataclasses.dataclass 21 | class DiagnosticConfig: 22 | """Configuration for diagnostic. 23 | 24 | Attributes: 25 | debug_config: config object for debugging, default is None 26 | """ 27 | 28 | debug_config: Optional[debug_configuration.DebugConfig] = None 29 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/config/stack_trace_configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import dataclasses 16 | from cloud_tpu_diagnostics.src.util import default 17 | 18 | 19 | @dataclasses.dataclass 20 | class StackTraceConfig: 21 | """Configuration for stack trace collection. 22 | 23 | Attributes: 24 | collect_stack_trace: enable/disable collection of stack trace in case fault 25 | occurs in the program. Default is False, which means stack trace will not 26 | be collected unless collect_stack_trace is set to True. 
27 | stack_trace_to_cloud: enable/disable upload of stack trace to cloud. Default 28 | is False, which means stack trace will be displayed on the terminal unless 29 | stack_trace_to_cloud is set to True. 30 | stack_trace_interval_seconds: time interval in seconds between stack trace 31 | collection events. Default is 600, that is, 10 minutes. 32 | """ 33 | 34 | collect_stack_trace: bool = default.COLLECT_STACK_TRACE_DEFAULT 35 | stack_trace_to_cloud: bool = default.STACK_TRACE_TO_CLOUD_DEFAULT 36 | stack_trace_interval_seconds: int = default.STACK_TRACE_INTERVAL_SECONDS_DEFAULT 37 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/debug.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | import signal 17 | import threading 18 | import time 19 | 20 | from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping 21 | from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping 22 | 23 | # flag to signal daemon thread to exit gracefully 24 | _exit_flag = threading.Event() 25 | _exit_flag.clear() 26 | _daemon_thread = None 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def start_debugging(debug_config): 31 | """Start stack trace collection if it is enabled in debug_config.""" 32 | global _daemon_thread 33 | _exit_flag.clear() 34 | if ( 35 | debug_config.stack_trace_config is not None 36 | and debug_config.stack_trace_config.collect_stack_trace 37 | ): 38 | _daemon_thread = threading.Thread( 39 | target=send_user_signal, 40 | daemon=True, 41 | args=(debug_config.stack_trace_config.stack_trace_interval_seconds,), 42 | ) 43 | _daemon_thread.start() # start a daemon thread 44 | enable_stack_trace_dumping(debug_config.stack_trace_config) 45 | 46 | 47 | def stop_debugging(debug_config): 48 | """Stop stack trace collection and shut down the daemon thread.""" 49 | if ( 50 | debug_config.stack_trace_config is not None 51 | and debug_config.stack_trace_config.collect_stack_trace 52 | ): 53 | _exit_flag.set() 54 | # wait for daemon thread to complete 55 | if _daemon_thread is not None: 56 | logger.info( 57 | "Waiting for completion of stack trace collection daemon thread."
58 | ) 59 | _daemon_thread.join() 60 | logger.info("Stack trace collection daemon thread completed.") 61 | disable_stack_trace_dumping(debug_config.stack_trace_config) 62 | _exit_flag.clear() 63 | 64 | 65 | def send_user_signal(stack_trace_interval_seconds): 66 | """Send SIGUSR1 signal to main thread after every stack_trace_interval_seconds seconds.""" 67 | while not _exit_flag.is_set(): 68 | time.sleep(stack_trace_interval_seconds) 69 | if not _exit_flag.is_set(): 70 | signal.pthread_kill(threading.main_thread().ident, signal.SIGUSR1) 71 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/diagnose.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import contextlib 16 | 17 | from cloud_tpu_diagnostics.src.debug import start_debugging 18 | from cloud_tpu_diagnostics.src.debug import stop_debugging 19 | 20 | 21 | @contextlib.contextmanager 22 | def diagnose(config): 23 | """Context manager to debug and identify errors.""" 24 | if config is not None and config.debug_config is not None: 25 | start_debugging(config.debug_config) 26 | try: 27 | yield 28 | if config is not None and config.debug_config is not None: 29 | stop_debugging(config.debug_config) 30 | except Exception as e: 31 | raise e 32 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/stack_trace.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import faulthandler 16 | import logging 17 | import os 18 | import signal 19 | import sys 20 | import time 21 | 22 | from cloud_tpu_diagnostics.src.util import default 23 | 24 | _stack_trace_file_obj = None 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def user_signal_handler_wrapper(file_descriptor, interval): 29 | def user_signal_handler(unused_signum, unused_frame): 30 | message = ( 31 | "INFO: Not a crash. 
cloud-tpu-diagnostics emits a" 32 | f" stack trace snapshot every {interval} seconds.\n" 33 | ) 34 | if file_descriptor is not sys.stderr: 35 | message = message.encode() 36 | file_descriptor.write(message) 37 | faulthandler.dump_traceback(file_descriptor, all_threads=False) 38 | 39 | return user_signal_handler 40 | 41 | 42 | def enable_stack_trace_dumping(stack_trace_config): 43 | """Enables stack trace dumping. 44 | 45 | Enables faulthandler and register SIGSEGV, SIGFPE, SIGABRT, 46 | SIGBUS, SIGILL and SIGUSR1 to collect stack trace. 47 | 48 | Args: 49 | stack_trace_config: configuration object for stack trace collection 50 | """ 51 | try: 52 | global _stack_trace_file_obj 53 | if stack_trace_config.stack_trace_to_cloud: 54 | stack_trace_file = _get_stack_trace_file() 55 | _stack_trace_file_obj = open(stack_trace_file, "wb") 56 | logger.info("Stack trace will be written in: %s", stack_trace_file) 57 | else: 58 | _stack_trace_file_obj = sys.stderr 59 | logger.info("Stack trace will be written to the console.") 60 | 61 | # Enables faulthandler for SIGSEGV, SIGFPE, SIGABRT, SIGBUS and SIGILL 62 | faulthandler.enable(file=_stack_trace_file_obj, all_threads=False) 63 | 64 | # Register SIGUSR1 signal to faulthandler 65 | faulthandler.register( 66 | signal.SIGUSR1, all_threads=False, file=_stack_trace_file_obj 67 | ) 68 | 69 | # Register handler for SIGUSR1 to dump traces 70 | signal.signal( 71 | signal.SIGUSR1, 72 | user_signal_handler_wrapper( 73 | _stack_trace_file_obj, 74 | stack_trace_config.stack_trace_interval_seconds, 75 | ), 76 | ) 77 | except Exception as e: # pylint: disable=broad-exception-caught 78 | logger.error("Error in enabling dumping of stack trace.", e) 79 | 80 | 81 | def disable_stack_trace_dumping(stack_trace_config): 82 | """Disable faulthandler and unregister user signals. 83 | 84 | Args: 85 | stack_trace_config: configuration object for stack trace collection 86 | """ 87 | try: 88 | global _stack_trace_file_obj 89 | if ( 90 | stack_trace_config.stack_trace_to_cloud 91 | and _stack_trace_file_obj is not None 92 | ): 93 | _stack_trace_file_obj.close() 94 | _stack_trace_file_obj = None 95 | 96 | faulthandler.unregister(signal.SIGUSR1) 97 | faulthandler.disable() 98 | except Exception as e: # pylint: disable=broad-exception-caught 99 | logger.error("Error in disabling dumping of stack trace.", e) 100 | 101 | 102 | def _get_stack_trace_file(): 103 | """Prefix stack trace file. 104 | 105 | Create a file with prefix as stack_trace_ and current local time in 106 | '%Y_%m_%d_%H_%M_%S' format inside default.STACK_TRACE_DIR_DEFAULT. 107 | 108 | Returns: 109 | path of stack trace file 110 | """ 111 | root_trace_folder = os.path.abspath(default.STACK_TRACE_DIR_DEFAULT) 112 | if not os.path.exists(root_trace_folder): 113 | os.makedirs(root_trace_folder) 114 | 115 | current_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) 116 | trace_file_name = "stack_trace_" + current_time + ".txt" 117 | stack_trace_file = os.path.join(root_trace_folder, trace_file_name) 118 | return stack_trace_file 119 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/util/default.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Stack trace default values 16 | COLLECT_STACK_TRACE_DEFAULT = False 17 | STACK_TRACE_TO_CLOUD_DEFAULT = False 18 | STACK_TRACE_DIR_DEFAULT = '/tmp/debugging/' 19 | STACK_TRACE_INTERVAL_SECONDS_DEFAULT = 600 # 10 minutes 20 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/src/util/stack_trace_test_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Script to raise different signals to test dumping of stack trace.""" 16 | 17 | import argparse 18 | import signal 19 | 20 | from cloud_tpu_diagnostics import diagnostic 21 | from cloud_tpu_diagnostics.configuration import debug_configuration 22 | from cloud_tpu_diagnostics.configuration import diagnostic_configuration 23 | from cloud_tpu_diagnostics.configuration import stack_trace_configuration 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--signal', help='name of signal to raise') 29 | parser.add_argument( 30 | '--collect_stack_trace', 31 | type=lambda x: (x.lower() == 'true'), 32 | help='whether to collect stack trace or not', 33 | ) 34 | parser.add_argument( 35 | '--log_to_cloud', 36 | type=lambda x: (x.lower() == 'true'), 37 | help='whether to log to cloud or console', 38 | ) 39 | args = parser.parse_args() 40 | debug_config = debug_configuration.DebugConfig( 41 | stack_trace_config=stack_trace_configuration.StackTraceConfig( 42 | collect_stack_trace=args.collect_stack_trace, 43 | stack_trace_to_cloud=args.log_to_cloud, 44 | stack_trace_interval_seconds=1, 45 | ), 46 | ) 47 | diagnostic_config = diagnostic_configuration.DiagnosticConfig( 48 | debug_config=debug_config 49 | ) 50 | with diagnostic.diagnose(diagnostic_config): 51 | if args.signal == 'SIGSEGV': 52 | signal.raise_signal(signal.SIGSEGV) 53 | 54 | if args.signal == 'SIGABRT': 55 | signal.raise_signal(signal.SIGABRT) 56 | 57 | if args.signal == 'SIGFPE': 58 | signal.raise_signal(signal.SIGFPE) 59 | 60 | if args.signal == 'SIGILL': 61 | signal.raise_signal(signal.SIGILL) 62 | 63 | if args.signal == 'SIGBUS': 64 | signal.raise_signal(signal.SIGBUS) 65 | 66 | if args.signal == 'SIGUSR1': 67 | signal.raise_signal(signal.SIGUSR1) 68 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/tests/debug_test.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import signal 16 | import threading 17 | from unittest import mock 18 | from absl.testing import absltest 19 | from cloud_tpu_diagnostics.src.config import debug_configuration 20 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration 21 | from cloud_tpu_diagnostics.src.debug import send_user_signal 22 | from cloud_tpu_diagnostics.src.debug import start_debugging 23 | from cloud_tpu_diagnostics.src.debug import stop_debugging 24 | 25 | 26 | class DebugTest(absltest.TestCase): 27 | 28 | def testDaemonThreadRunningWhenCollectStackTraceTrue(self): 29 | debug_config = debug_configuration.DebugConfig( 30 | stack_trace_config=stack_trace_configuration.StackTraceConfig( 31 | collect_stack_trace=True, 32 | stack_trace_to_cloud=True, 33 | stack_trace_interval_seconds=1, 34 | ), 35 | ) 36 | start_debugging(debug_config) 37 | self.assertEqual(threading.active_count(), 2) 38 | daemon_thread_list = list( 39 | filter(lambda thread: thread.daemon is True, threading.enumerate()) 40 | ) 41 | self.assertLen(daemon_thread_list, 1) 42 | stop_debugging(debug_config) 43 | self.assertEqual(threading.active_count(), 1) 44 | daemon_thread_list = list( 45 | filter(lambda thread: thread.daemon is True, threading.enumerate()) 46 | ) 47 | self.assertLen(daemon_thread_list, 0) 48 | 49 | def testDaemonThreadNotRunningWhenCollectStackTraceFalse(self): 50 | debug_config = debug_configuration.DebugConfig( 51 | stack_trace_config=stack_trace_configuration.StackTraceConfig( 52 | collect_stack_trace=False, 53 | stack_trace_to_cloud=True, 54 | stack_trace_interval_seconds=1, 55 | ), 56 | ) 57 | start_debugging(debug_config) 58 | self.assertEqual(threading.active_count(), 1) 59 | daemon_thread_list = list( 60 | filter(lambda thread: thread.daemon is True, threading.enumerate()) 61 | ) 62 | self.assertLen(daemon_thread_list, 0) 63 | stop_debugging(debug_config) 64 | self.assertEqual(threading.active_count(), 1) 65 | daemon_thread_list = list( 66 | filter(lambda thread: thread.daemon is True, threading.enumerate()) 67 | ) 68 | self.assertLen(daemon_thread_list, 0) 69 | 70 | @mock.patch( 71 | 'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.debug.disable_stack_trace_dumping' 72 | ) 73 | def testStopDebuggingDisableStackTraceDumpingCalled( 74 | self, disable_stack_trace_dumping_mock 75 | ): 76 | debug_config = debug_configuration.DebugConfig( 77 | stack_trace_config=stack_trace_configuration.StackTraceConfig( 78 | collect_stack_trace=True, 79 | stack_trace_to_cloud=True, 80 | stack_trace_interval_seconds=1, 81 | ), 82 | ) 83 | stop_debugging(debug_config) 84 | disable_stack_trace_dumping_mock.assert_called_once() 85 | self.assertEqual(threading.active_count(), 1) 86 | daemon_thread_list = list( 87 | filter(lambda thread: thread.daemon is True, threading.enumerate()) 88 | 
) 89 | self.assertLen(daemon_thread_list, 0) 90 | 91 | def testSendUserSignalSIGUSR1SignalReceived(self): 92 | signal.signal(signal.SIGUSR1, user_signal_handler) 93 | stack_trace_interval_seconds = 1 94 | with self.assertRaises(Exception) as e: 95 | send_user_signal(stack_trace_interval_seconds) 96 | self.assertEqual(str(e.exception), 'SIGSUR1 signal received.') 97 | 98 | 99 | def user_signal_handler(signum, _): 100 | raise Exception('SIGSUR1 signal received.') # pylint: disable=broad-exception-caught 101 | 102 | 103 | if __name__ == '__main__': 104 | absltest.main() 105 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/tests/diagnose_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from unittest import mock 16 | from absl.testing import absltest 17 | from cloud_tpu_diagnostics.src.config import debug_configuration 18 | from cloud_tpu_diagnostics.src.config import diagnostic_configuration 19 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration 20 | from cloud_tpu_diagnostics.src.diagnose import diagnose 21 | 22 | 23 | class DiagnoseTest(absltest.TestCase): 24 | 25 | @mock.patch( 26 | 'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.diagnose.start_debugging' 27 | ) 28 | @mock.patch( 29 | 'google3.third_party.cloud_tpu_monitoring_debugging.pip_package.cloud_tpu_diagnostics.src.diagnose.stop_debugging' 30 | ) 31 | def testDiagnoseContextManager( 32 | self, stop_debugging_mock, start_debugging_mock 33 | ): 34 | debug_config = debug_configuration.DebugConfig( 35 | stack_trace_config=stack_trace_configuration.StackTraceConfig( 36 | collect_stack_trace=True, 37 | stack_trace_to_cloud=True, 38 | ), 39 | ) 40 | diagnostic_config = diagnostic_configuration.DiagnosticConfig( 41 | debug_config=debug_config, 42 | ) 43 | with diagnose(diagnostic_config): 44 | pass 45 | start_debugging_mock.assert_called_once_with(debug_config) 46 | stop_debugging_mock.assert_called_once_with(debug_config) 47 | 48 | 49 | if __name__ == '__main__': 50 | absltest.main() 51 | -------------------------------------------------------------------------------- /pip_package/cloud_tpu_diagnostics/tests/stack_trace_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import faulthandler 16 | import os 17 | import shutil 18 | import signal 19 | import subprocess 20 | import sys 21 | import tempfile 22 | import textwrap 23 | import unittest 24 | from absl.testing import absltest 25 | from cloud_tpu_diagnostics.src.config import stack_trace_configuration 26 | from cloud_tpu_diagnostics.src.stack_trace import disable_stack_trace_dumping 27 | from cloud_tpu_diagnostics.src.stack_trace import enable_stack_trace_dumping 28 | from cloud_tpu_diagnostics.src.stack_trace import user_signal_handler_wrapper 29 | from cloud_tpu_diagnostics.src.util import default 30 | 31 | class StackTraceTest(absltest.TestCase): 32 | 33 | def setUp(self): 34 | super().setUp() 35 | package_dir = '/'.join(os.path.dirname(__file__).split('/')[:-1]) 36 | # Used to run test with blaze/bazel 37 | self.test_binary = os.path.join(package_dir, 'stack_trace_test_util') 38 | # Used to run test with unittest `python3 -m unittest stack_trace_test.py` 39 | self.test_file = os.path.join( 40 | package_dir, 'src/util/stack_trace_test_util.py' 41 | ) 42 | self.stack_trace_module = os.path.join(package_dir, 'src/stack_trace.py') 43 | 44 | def tearDown(self): 45 | super().tearDown() 46 | if os.path.exists(default.STACK_TRACE_DIR_DEFAULT): 47 | shutil.rmtree(default.STACK_TRACE_DIR_DEFAULT) 48 | 49 | @unittest.skipIf(not hasattr(signal, 'SIGSEGV'), 'Missing signal.SIGSEGV') 50 | def testSigsegvCollectStackTraceTrueTraceCollectedOnCloud(self): 51 | error = 'Fatal Python error: Segmentation fault' 52 | self.check_fatal_error(52, error, 'SIGSEGV', True) 53 | 54 | @unittest.skipIf(not hasattr(signal, 'SIGABRT'), 'Missing signal.SIGABRT') 55 | def testSigabrtCollectStackTraceTrueTraceCollectedOnCloud(self): 56 | error = 'Fatal Python error: Aborted' 57 | self.check_fatal_error(55, error, 'SIGABRT', True) 58 | 59 | @unittest.skipIf(not hasattr(signal, 'SIGFPE'), 'Missing signal.SIGFPE') 60 | def testSigfpeCollectStackTraceTrueTraceCollectedOnCloud(self): 61 | error = 'Fatal Python error: Floating point exception' 62 | try: 63 | self.check_fatal_error(58, error, 'SIGFPE', True) 64 | except AssertionError: 65 | # error message is different for Python 3.12 66 | error = 'Fatal Python error: Floating-point exception' 67 | self.check_fatal_error(58, error, 'SIGFPE', True) 68 | 69 | @unittest.skipIf(not hasattr(signal, 'SIGILL'), 'Missing signal.SIGILL') 70 | def testSigillCollectStackTraceTrueTraceCollectedOnCloud(self): 71 | error = 'Fatal Python error: Illegal instruction' 72 | self.check_fatal_error(61, error, 'SIGILL', True) 73 | 74 | @unittest.skipIf(not hasattr(signal, 'SIGBUS'), 'Missing signal.SIGBUS') 75 | def testSigbusCollectStackTraceTrueTraceCollectedOnCloud(self): 76 | error = 'Fatal Python error: Bus error' 77 | self.check_fatal_error(64, error, 'SIGBUS', True) 78 | 79 | @unittest.skipIf(not hasattr(signal, 'SIGUSR1'), 'Missing signal.SIGUSR1') 80 | def testSigusrCollectStackTraceTrueTraceCollectedOnCloud(self): 81 | self.check_fatal_error(67, '', 'SIGUSR1', True) 82 | 83 | def testCollectStackTraceFalseNoTraceDirCreated(self): 84 | process = self.run_python_code('', False, True) 85 | _, stderr = process.communicate() 86 | self.assertFalse(os.path.exists(default.STACK_TRACE_DIR_DEFAULT)) 87 | self.assertEmpty(stderr) 88 | 89 | @unittest.skipIf(not hasattr(signal, 'SIGUSR1'), 'Missing signal.SIGUSR1') 90 | def testCollectStackTraceToConsole(self): 91 | self.check_fatal_error(67, 
'', 'SIGUSR1', False) 92 | 93 | def testCollectStackTraceFalseNoTraceCollectedOnConsole(self): 94 | process = self.run_python_code('', False, False) 95 | _, stderr = process.communicate() 96 | self.assertEmpty(stderr) 97 | 98 | def testEnableStackTraceDumpingFaulthandlerEnabled(self): 99 | stack_trace_config = stack_trace_configuration.StackTraceConfig( 100 | collect_stack_trace=True, stack_trace_to_cloud=True 101 | ) 102 | with self.assertLogs(level='INFO') as log: 103 | enable_stack_trace_dumping(stack_trace_config) 104 | self.assertEqual(faulthandler.is_enabled(), True) 105 | self.assertRegex( 106 | log.output[0], 'Stack trace will be written in: /tmp/debugging/' 107 | ) 108 | 109 | def testDisableStackTraceDumpingFaulthandlerDisabled(self): 110 | stack_trace_config = stack_trace_configuration.StackTraceConfig( 111 | collect_stack_trace=True, stack_trace_to_cloud=True 112 | ) 113 | enable_stack_trace_dumping(stack_trace_config) 114 | disable_stack_trace_dumping(stack_trace_config) 115 | self.assertEqual(faulthandler.is_enabled(), False) 116 | 117 | def testUserSignalHandlerForStderr(self): 118 | file_obj = tempfile.NamedTemporaryFile('r+') 119 | sys.stderr = file_obj 120 | user_signal_handler = user_signal_handler_wrapper(sys.stderr, 30) 121 | user_signal_handler(signal.SIGUSR1, None) 122 | with open(file_obj.name, 'rb') as f: 123 | data = f.readlines() 124 | self.assertEqual( 125 | data[0], 126 | b'INFO: Not a crash. cloud-tpu-diagnostics emits a stack trace' 127 | b' snapshot every 30 seconds.\n', 128 | ) 129 | 130 | def testUserSignalHandlerForFile(self): 131 | file_obj = tempfile.NamedTemporaryFile('rb+') 132 | user_signal_handler = user_signal_handler_wrapper(file_obj, 30) 133 | user_signal_handler(signal.SIGUSR1, None) 134 | with open(file_obj.name, 'rb') as f: 135 | data = f.readlines() 136 | self.assertEqual( 137 | data[0], 138 | b'INFO: Not a crash. cloud-tpu-diagnostics emits a stack trace' 139 | b' snapshot every 30 seconds.\n', 140 | ) 141 | 142 | def check_fatal_error(self, line_number, error, signal_name, log_to_cloud): 143 | if error: 144 | header = r'Stack \(most recent call first\)' 145 | regex = """ 146 | {error} 147 | 148 | {header}: 149 | File "{filename}", line {line_number} in 150 | """ 151 | else: 152 | header = ( 153 | r'INFO: Not a crash. 
cloud\-tpu\-diagnostics emits a stack trace' 154 | r' snapshot every 1 seconds.\n' 155 | r'Stack \(most recent call first\)' 156 | ) 157 | regex = """ 158 | {header}: 159 | File "{stack_trace_module}", line 23 in user_signal_handler 160 | File "{filename}", line {line_number} in 161 | """ 162 | regex = ( 163 | textwrap.dedent(regex) 164 | .format( 165 | error=error, 166 | header=header, 167 | filename=self.test_file, 168 | stack_trace_module=self.stack_trace_module, 169 | line_number=line_number, 170 | ) 171 | .strip() 172 | ) 173 | 174 | output, stderr = self.get_output(signal_name, True, log_to_cloud) 175 | if log_to_cloud: 176 | self.assertRegex(output, regex) 177 | self.assertEmpty(stderr) 178 | else: 179 | self.assertRegex(stderr, regex) 180 | self.assertEmpty(output) 181 | 182 | def get_output(self, signal_name, collect_stack_trace, log_to_cloud): 183 | process = self.run_python_code( 184 | signal_name, collect_stack_trace, log_to_cloud 185 | ) 186 | _, stderr = process.communicate() 187 | stderr = stderr.decode('ascii', 'backslashreplace') 188 | output = '' 189 | if log_to_cloud: 190 | trace_file = os.listdir(default.STACK_TRACE_DIR_DEFAULT) 191 | if trace_file: 192 | stack_trace_file = default.STACK_TRACE_DIR_DEFAULT + trace_file[0] 193 | with open(stack_trace_file, 'rb') as fp: 194 | output = fp.read().decode('ascii', 'backslashreplace') 195 | return output, stderr 196 | 197 | def run_python_code(self, signal_name, collect_stack_trace, log_to_cloud): 198 | args = [ 199 | '--signal=' + signal_name, 200 | '--collect_stack_trace=' + str(collect_stack_trace), 201 | '--log_to_cloud=' + str(log_to_cloud), 202 | ] 203 | if sys.executable is not None: 204 | code = [sys.executable, self.test_file] 205 | else: 206 | code = [self.test_binary] 207 | return subprocess.Popen( 208 | code + args, 209 | stdout=subprocess.PIPE, 210 | stderr=subprocess.PIPE, 211 | env=os.environ.copy(), 212 | ) 213 | 214 | 215 | if __name__ == '__main__': 216 | absltest.main() 217 | -------------------------------------------------------------------------------- /pip_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [project] 16 | name = "cloud-tpu-diagnostics" 17 | version = "0.1.5" 18 | authors = [ 19 | { name="Cloud TPU Team", email="cloud-tpu-eng@google.com" }, 20 | ] 21 | description = "Monitor, debug and profile the jobs running on Cloud TPU." 
22 | readme = "README.md" 23 | requires-python = ">=3.8" 24 | license = {text = "Apache-2.0"} 25 | classifiers = [ 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | ] 31 | keywords = [] 32 | 33 | # pip dependencies installed with `pip install -e .` 34 | dependencies = [] 35 | 36 | [project.urls] 37 | "Homepage" = "https://github.com/google/cloud-tpu-monitoring-debugging" 38 | "Bug Tracker" = "https://github.com/google/cloud-tpu-monitoring-debugging/issues" 39 | 40 | [build-system] 41 | # Build system specify which backend is used to build/install the project 42 | requires = ["flit_core >=3.8,<4"] 43 | build-backend = "flit_core.buildapi" 44 | 45 | [tool.flit.sdist] 46 | # Flit specific options (files to exclude from the PyPI package) 47 | exclude = [ 48 | # Do not release tests files on PyPI 49 | "tests/*_test.py", 50 | ] 51 | --------------------------------------------------------------------------------