├── doc-images └── grafana-dash-launch-stack.png ├── prometheus ├── config_files │ ├── node_exporter.service │ ├── yarn_jmx_env_setup.txt │ ├── yarn_jmx_config_node_manager.yaml │ ├── hdfs_jmx_config_namenode.yaml │ ├── yarn_jmx_config_resource_manager.yaml │ └── hdfs_jmx_config_datanode.yaml ├── textfiles │ └── emr_node_info.sh └── bootstrap_monitoring.sh ├── README.md ├── LICENSE └── grafana-dashboards ├── Log+Metrics.json ├── RPC+Metrics.json ├── JVM+Metrics.json ├── YARN+-+Node+Manager.json └── HDFS+-+DataNode.json /doc-images/grafana-dash-launch-stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vinicelms/emr-monitoring-prometheus-grafana/HEAD/doc-images/grafana-dash-launch-stack.png -------------------------------------------------------------------------------- /prometheus/config_files/node_exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Node Exporter 3 | 4 | [Service] 5 | User=node_exporter 6 | Group=node_exporter 7 | ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory /etc/prometheus/textfiles $OPTIONS 8 | 9 | [Install] 10 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /prometheus/textfiles/emr_node_info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | ## Description: get EMR instance role 4 | # 5 | ROLE=$(cat /var/aws/emr/userData.json | jq -r '.isMaster') 6 | CLUSTER_ID=$(cat /var/aws/emr/userData.json | jq -r '.clusterId') 7 | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) 8 | 9 | echo '# HELP emr_node_info EMR cluster info' > /etc/prometheus/textfiles/emr_node_info.prom 10 | echo '# TYPE emr_node_info gauge' >> /etc/prometheus/textfiles/emr_node_info.prom 11 | if [ ${ROLE} == 'true' ] 12 | then 13 | echo 
"emr_node_info{isMaster=\"true\",isSlave=\"false\",cluster_id=\"${CLUSTER_ID}\",instance_id=\"${INSTANCE_ID}\"} 1" >> /etc/prometheus/textfiles/emr_node_info.prom 14 | else 15 | echo "emr_node_info{isMaster=\"false\",isSlave=\"true\",cluster_id=\"${CLUSTER_ID}\",instance_id=\"${INSTANCE_ID}\"} 1" >> /etc/prometheus/textfiles/emr_node_info.prom 16 | fi -------------------------------------------------------------------------------- /prometheus/config_files/yarn_jmx_env_setup.txt: -------------------------------------------------------------------------------- 1 | 2 | if [[ $YARN_RESOURCEMANAGER_OPTS != *"jmx_prometheus_javaagent"* ]]; then 3 | export YARN_RESOURCEMANAGER_OPTS="${YARN_RESOURCEMANAGER_OPTS} -javaagent:/etc/prometheus/jmx_prometheus_javaagent-__JMX_EXPORTER_VERSION__.jar=7005:/etc/hadoop/conf/yarn_jmx_config_resource_manager.yaml -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=50111" 4 | fi 5 | 6 | if [[ $YARN_NODEMANAGER_OPTS != *"jmx_prometheus_javaagent"* ]]; then 7 | export YARN_NODEMANAGER_OPTS="${YARN_NODEMANAGER_OPTS} -javaagent:/etc/prometheus/jmx_prometheus_javaagent-__JMX_EXPORTER_VERSION__.jar=7005:/etc/hadoop/conf/yarn_jmx_config_node_manager.yaml -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=50111" 8 | fi -------------------------------------------------------------------------------- /prometheus/config_files/yarn_jmx_config_node_manager.yaml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | lowercaseOutputLabelNames: true 3 | rules: 4 | # RPC 5 | - pattern: 'Hadoop]+)><>([\w.]+)' 6 | attrNameSnakeCase: true 7 | name: rpc_activity_$3 8 | labels: 9 | role: "$1" 10 | port: "$2" 11 | # Log 12 | - pattern: 'Hadoop<>Log(Warn|Fatal|Info|Error)' 13 | name: log_$2 
14 | labels: 15 | role: $1 16 | - pattern: 'Hadoop]+)><>([\w.]+)' 17 | attrNameSnakeCase: true 18 | name: yarn_nodemanager_$1_$2 19 | - pattern: 'Hadoop<>([\w.]+)' 20 | attrNameSnakeCase: true 21 | name: yarn_nodemanager_metrics_$1 22 | - pattern: 'Hadoop]+)><>([\w.]+)' 23 | attrNameSnakeCase: true 24 | name: yarn_nodemanager_$1_$2 25 | - pattern: '.*' -------------------------------------------------------------------------------- /prometheus/config_files/hdfs_jmx_config_namenode.yaml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | lowercaseOutputLabelNames: true 3 | rules: 4 | # RPC 5 | - pattern: 'Hadoop]+)><>([\w.]+)' 6 | attrNameSnakeCase: true 7 | name: rpc_activity_$3 8 | labels: 9 | role: "$1" 10 | port: "$2" 11 | # Log 12 | - pattern: 'Hadoop<>Log(Warn|Fatal|Info|Error)' 13 | name: log_$2 14 | labels: 15 | role: $1 16 | # MetricsSystem 17 | - pattern: 'Hadoop<>(.*): (\d+)' 18 | attrNameSnakeCase: true 19 | name: hdfs_$1_$3 20 | value: $4 21 | labels: 22 | role: $1 23 | kind: 'MetricsSystem' 24 | sub: $2 25 | type: GAUGE 26 | # All NameNode infos 27 | - pattern: 'Hadoop<>(.*): (\d+)' 28 | attrNameSnakeCase: true 29 | name: hdfs_$1_$3 30 | value: $4 31 | labels: 32 | role: $1 33 | kind: $2 34 | type: GAUGE 35 | - pattern: '.*' -------------------------------------------------------------------------------- /prometheus/config_files/yarn_jmx_config_resource_manager.yaml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | lowercaseOutputLabelNames: true 3 | rules: 4 | # RPC 5 | - pattern: 'Hadoop]+)><>([\w.]+)' 6 | attrNameSnakeCase: true 7 | name: rpc_activity_$3 8 | labels: 9 | role: "$1" 10 | port: "$2" 11 | # Log 12 | - pattern: 'Hadoop<>Log(Warn|Fatal|Info|Error)' 13 | name: log_$2 14 | labels: 15 | role: $1 16 | - pattern: 'Hadoop]+)><>([\w.]+)' 17 | attrNameSnakeCase: true 18 | name: yarn_resourcemanager_$1_$2 19 | - 
pattern: 'Hadoop]+)><>([\w.]+)' 20 | name: yarn_resourcemanager_queue_$2 21 | attrNameSnakeCase: true 22 | labels: 23 | queue: "$1" 24 | - pattern: 'Hadoop]+), q.=([^\W>]+)><>([\w.]+)' 25 | name: yarn_resourcemanager_queue_$3 26 | attrNameSnakeCase: true 27 | labels: 28 | queue: "$1" 29 | child_queue: "$2" 30 | - pattern: 'Hadoop]+), q.=([^\W>]+), q.=([^\W>]+)><>([\w.]+)' 31 | name: yarn_resourcemanager_queue_$4 32 | attrNameSnakeCase: true 33 | labels: 34 | queue: "$1" 35 | child_queue: "$2" 36 | second_level_queue: "$3" 37 | - pattern: 'Hadoop]+)><>([\w.]+)' 38 | attrNameSnakeCase: true 39 | name: yarn_resourcemanager_$1_$2 40 | - pattern: '.*' -------------------------------------------------------------------------------- /prometheus/config_files/hdfs_jmx_config_datanode.yaml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | lowercaseOutputLabelNames: true 3 | rules: 4 | # RPC 5 | - pattern: 'Hadoop]+)><>([\w.]+)' 6 | attrNameSnakeCase: true 7 | name: rpc_activity_$3 8 | labels: 9 | role: "$1" 10 | port: "$2" 11 | # Log 12 | - pattern: 'Hadoop<>Log(Warn|Fatal|Info|Error)' 13 | name: log_$2 14 | labels: 15 | role: $1 16 | # MetricsSystem 17 | - pattern: 'Hadoop<>(.*): (\d+)' 18 | attrNameSnakeCase: true 19 | name: hdfs_$1_$3 20 | value: $4 21 | labels: 22 | role: $1 23 | kind: 'MetricsSystem' 24 | sub: $2 25 | type: GAUGE 26 | # FSDatasetState (also extracts the FSDataset ID) 27 | - pattern: 'Hadoop<>(.*): (\d+)' 28 | attrNameSnakeCase: true 29 | name: hdfs_$1_$3 30 | value: $4 31 | labels: 32 | role: $1 33 | fsdatasetid: $2 34 | kind: 'FSDatasetState' 35 | type: GAUGE 36 | # DataNodeActivity (also extracts hostname and port) 37 | - pattern: 'Hadoop<>(.*): (\d+)' 38 | attrNameSnakeCase: true 39 | name: hdfs_$1_$4 40 | value: $5 41 | labels: 42 | role: $1 43 | host: $2 44 | port: $3 45 | kind: 'DataNodeActivity' 46 | type: GAUGE 47 | # All other services 48 | - pattern: 'Hadoop<>(.*): (\d+)' 49 | 
attrNameSnakeCase: true
50 |     name: hdfs_$1_$3
51 |     value: $4
52 |     labels:
53 |       role: $1
54 |       kind: $2
55 |     type: GAUGE
56 |   - pattern: '.*'


--------------------------------------------------------------------------------
/prometheus/bootstrap_monitoring.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 | #
3 | # Bootstrap script for an EMR node: installs node_exporter as a systemd
4 | # service, copies the Prometheus JMX exporter agent and its Hadoop/YARN rule
5 | # files into place, writes the emr_node_info textfile metric, and appends the
6 | # JMX agent options to yarn-env.sh.
7 |
8 | REPO_URL="https://raw.githubusercontent.com/vinicelms/emr-monitoring-prometheus-grafana/master"
9 | NODE_EXPORTER_VERSION="1.0.1"
10 | JMX_EXPORTER_VERSION="0.14.0"
11 | HADOOP_CONF='/etc/hadoop/conf'
12 | TEXTFILE_DIR='/etc/prometheus/textfiles'
13 |
14 | #set up node_exporter for pushing OS level metrics
15 | # Create the service account only when it is missing; under `set -e` a second
16 | # run would otherwise abort on useradd's "already exists" error.
17 | id -u node_exporter > /dev/null 2>&1 || \
18 |     sudo useradd --no-create-home --shell /bin/false node_exporter
19 | cd /tmp
20 | wget https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz
21 | tar -xvzf node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz
22 | cd node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64
23 | sudo cp node_exporter /usr/local/bin/
24 | sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter
25 |
26 | # node_exporter.service points --collector.textfile.directory at this
27 | # directory, so create it before the service is started.
28 | sudo mkdir -p "${TEXTFILE_DIR}"
29 |
30 | cd /tmp
31 | wget ${REPO_URL}/prometheus/config_files/node_exporter.service
32 | sudo cp node_exporter.service /etc/systemd/system/node_exporter.service
33 | sudo chown node_exporter:node_exporter /etc/systemd/system/node_exporter.service
34 | # One command per line: inside an `a && b && c` list, `set -e` ignores the
35 | # failure of every command except the last, so a failed start used to pass
36 | # unnoticed and silently skipped `enable`.
37 | sudo systemctl daemon-reload
38 | sudo systemctl enable node_exporter
39 | sudo systemctl start node_exporter
40 | sudo systemctl --no-pager status node_exporter
41 |
42 | #set up jmx_exporter for pushing application metrics
43 | wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_EXPORTER_VERSION}/jmx_prometheus_javaagent-${JMX_EXPORTER_VERSION}.jar
44 | sudo cp jmx_prometheus_javaagent-${JMX_EXPORTER_VERSION}.jar /etc/prometheus
45 |
46 | wget ${REPO_URL}/prometheus/config_files/hdfs_jmx_config_namenode.yaml
47 | wget ${REPO_URL}/prometheus/config_files/hdfs_jmx_config_datanode.yaml
48 | wget ${REPO_URL}/prometheus/config_files/yarn_jmx_config_resource_manager.yaml
49 | wget ${REPO_URL}/prometheus/config_files/yarn_jmx_config_node_manager.yaml
50 | wget ${REPO_URL}/prometheus/textfiles/emr_node_info.sh
51 |
52 | sudo mkdir -p ${HADOOP_CONF}
53 | sudo cp hdfs_jmx_config_namenode.yaml ${HADOOP_CONF}
54 | sudo cp hdfs_jmx_config_datanode.yaml ${HADOOP_CONF}
55 | sudo cp yarn_jmx_config_resource_manager.yaml ${HADOOP_CONF}
56 | sudo cp yarn_jmx_config_node_manager.yaml ${HADOOP_CONF}
57 | # wget does not set the execute bit, so install the script with mode 0755;
58 | # executing a plain copy would fail and, with `set -e`, abort the bootstrap.
59 | sudo install -m 0755 emr_node_info.sh "${TEXTFILE_DIR}/emr_node_info.sh"
60 | sudo "${TEXTFILE_DIR}/emr_node_info.sh"
61 |
62 |
63 | # Yarn configuration setup
64 | # The placeholder in the snippet is replaced with the agent version installed
65 | # above before the snippet is appended to yarn-env.sh.
66 | wget ${REPO_URL}/prometheus/config_files/yarn_jmx_env_setup.txt
67 | sed -i "s/__JMX_EXPORTER_VERSION__/${JMX_EXPORTER_VERSION}/g" /tmp/yarn_jmx_env_setup.txt
68 | sudo tee -a ${HADOOP_CONF}/yarn-env.sh < /tmp/yarn_jmx_env_setup.txt > /dev/null
69 |
70 | exit 0


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EMR Monitoring - Prometheus + Grafana
2 | Project to concentrate files and settings for AWS EMR monitoring. Source: https://aws.amazon.com/blogs/big-data/monitor-and-optimize-analytic-workloads-on-amazon-emr-with-prometheus-and-grafana/
3 |
4 | ## Important
5 |
6 | This project was entirely based on the project contained in this URL: https://aws.amazon.com/blogs/big-data/monitor-and-optimize-analytic-workloads-on-amazon-emr-with-prometheus-and-grafana/
7 |
8 | I just dismembered the scripts contained in the CloudFormation stack so that it was possible to customize some things I wanted. There is no intention to plagiarize any knowledge!
9 |
10 | All merit and credits must be given to the AWS EMR team engineers who created a complete solution to easily provision resources.
11 | 12 | 13 | ## How to use this project 14 | 15 | Build your cluster the way you want, either via the control panel, AWS-CLI or Terraform. You will need to add a configuration snippet for Hadoop to use JMX Exporter. You will also need to apply a script to bootstrap the cluster. 16 | 17 | ### Configuring the JMX Exporter on Hadoop 18 | ``` 19 | [ 20 | { 21 | "Classification": "hadoop-env", 22 | "Configurations": [ 23 | { 24 | "Classification": "export", 25 | "Properties": { 26 | "HADOOP_DATANODE_OPTS": "\"-javaagent:/etc/prometheus/jmx_prometheus_javaagent-0.14.0.jar=7001:/etc/hadoop/conf/hdfs_jmx_config_datanode.yaml -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=50103\"", 27 | "HADOOP_NAMENODE_OPTS": "\"-javaagent:/etc/prometheus/jmx_prometheus_javaagent-0.14.0.jar=7001:/etc/hadoop/conf/hdfs_jmx_config_namenode.yaml -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=50103\"" 28 | } 29 | } 30 | ], 31 | "Properties": {} 32 | } 33 | ] 34 | ``` 35 | > Pay attention to the JMX Exporter version: the jar file name used above (`jmx_prometheus_javaagent-0.14.0.jar`) must match the `JMX_EXPORTER_VERSION` set in `prometheus/bootstrap_monitoring.sh`, so if you change the version in one place you must also change it in the other. 36 | 37 | ### Bootstrap 38 | 39 | Upload the file [bootstrap_monitoring.sh](prometheus/bootstrap_monitoring.sh) to S3 and then set its S3 path as a bootstrap action of the cluster. 40 | 41 | ## Grafana Dashboards 42 | 43 | Since the dashboards were created by AWS, I will document how to get the dashboards to make the process easier. If AWS updates them, you will be able to get the dashboards yourself. 44 | 45 | 1. Access the project URL: https://aws.amazon.com/blogs/big-data/monitor-and-optimize-analytic-workloads-on-amazon-emr-with-prometheus-and-grafana/ 46 | 2. 
Copy the Launch Stack link address: 47 | > ![Grafana Dashboard Launch Stack](doc-images/grafana-dash-launch-stack.png) 48 | > Example: https://us-east-1.console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/create/template?templateURL=https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/cloudformation_templates/emrPrometheusGrafana.cf.json 49 | 3. Copy only the parameter `templateURL`: 50 | > Example: https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/cloudformation_templates/emrPrometheusGrafana.cf.json 51 | 4. Download the contents of this URL or visit via browser 52 | 5. In the file, search for the term `setup-grafana.sh` and download this file 53 | > Example: `wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/scripts/setup-grafana.sh` 54 | 6. Download all files with the extension `.json` 55 | > Example: 56 | > ```sh 57 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/HDFS+-+DataNode.json 58 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/HDFS+-+NameNode.json 59 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/JVM+Metrics.json 60 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/Log+Metrics.json 61 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/OS+Level+Metrics.json 62 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/RPC+Metrics.json 63 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/YARN+-+Node+Manager.json 64 | > wget https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/YARN+-+Queues.json 65 | > wget 
https://aws-bigdata-blog.s3.amazonaws.com/artifacts/aws-blog-emr-prometheus-grafana/dashboards/YARN+-+Resource+Manager.json 66 | > ``` 67 | 68 | ### Important 69 | 70 | I will keep the files in the repository if someone needs these dashboards in case of unavailability of the page. If someone identifies that there were changes, please open a PR with the new files. 71 | 72 | [Grafana Dashboards directory](grafana-dashboards) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /grafana-dashboards/Log+Metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:29855", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 0, 19 | "iteration": 1595830189145, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "aliasColors": {}, 24 | "bars": false, 25 | "dashLength": 10, 26 | "dashes": false, 27 | "datasource": "Prometheus", 28 | "decimals": 1, 29 | "fill": 1, 30 | "fillGradient": 0, 31 | "gridPos": { 32 | "h": 7, 33 | "w": 8, 34 | "x": 0, 35 | "y": 0 36 | }, 37 | "hiddenSeries": false, 38 | "id": 2, 39 | "legend": { 40 | "alignAsTable": true, 41 | "avg": true, 42 | "current": true, 43 | "max": true, 44 | "min": false, 45 | "show": true, 46 | "total": false, 47 | "values": true 48 | }, 49 | 
"lines": true, 50 | "linewidth": 1, 51 | "nullPointMode": "null", 52 | "options": { 53 | "dataLinks": [] 54 | }, 55 | "percentage": false, 56 | "pointradius": 2, 57 | "points": false, 58 | "renderer": "flot", 59 | "seriesOverrides": [], 60 | "spaceLength": 10, 61 | "stack": false, 62 | "steppedLine": false, 63 | "targets": [ 64 | { 65 | "expr": "increase(log_fatal {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval])", 66 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 67 | "refId": "A" 68 | } 69 | ], 70 | "thresholds": [], 71 | "timeFrom": null, 72 | "timeRegions": [], 73 | "timeShift": null, 74 | "title": "Fatals", 75 | "tooltip": { 76 | "shared": true, 77 | "sort": 1, 78 | "value_type": "individual" 79 | }, 80 | "type": "graph", 81 | "xaxis": { 82 | "buckets": null, 83 | "mode": "time", 84 | "name": null, 85 | "show": true, 86 | "values": [] 87 | }, 88 | "yaxes": [ 89 | { 90 | "format": "short", 91 | "label": "", 92 | "logBase": 1, 93 | "max": null, 94 | "min": null, 95 | "show": true 96 | }, 97 | { 98 | "format": "short", 99 | "label": null, 100 | "logBase": 1, 101 | "max": null, 102 | "min": null, 103 | "show": true 104 | } 105 | ], 106 | "yaxis": { 107 | "align": false, 108 | "alignLevel": null 109 | } 110 | }, 111 | { 112 | "aliasColors": {}, 113 | "bars": false, 114 | "dashLength": 10, 115 | "dashes": false, 116 | "datasource": "Prometheus", 117 | "decimals": 1, 118 | "fill": 1, 119 | "fillGradient": 0, 120 | "gridPos": { 121 | "h": 7, 122 | "w": 8, 123 | "x": 8, 124 | "y": 0 125 | }, 126 | "hiddenSeries": false, 127 | "id": 3, 128 | "legend": { 129 | "alignAsTable": true, 130 | "avg": true, 131 | "current": true, 132 | "max": true, 133 | "min": false, 134 | "show": true, 135 | "total": false, 136 | "values": true 137 | }, 138 | "lines": true, 139 | "linewidth": 1, 140 | "nullPointMode": "null", 141 | "options": { 142 | "dataLinks": [] 143 | }, 144 | "percentage": false, 145 | "pointradius": 2, 146 | "points": false, 
147 | "renderer": "flot", 148 | "seriesOverrides": [], 149 | "spaceLength": 10, 150 | "stack": false, 151 | "steppedLine": false, 152 | "targets": [ 153 | { 154 | "expr": "increase(log_error {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval])", 155 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 156 | "refId": "A" 157 | } 158 | ], 159 | "thresholds": [], 160 | "timeFrom": null, 161 | "timeRegions": [], 162 | "timeShift": null, 163 | "title": "Errors", 164 | "tooltip": { 165 | "shared": true, 166 | "sort": 1, 167 | "value_type": "individual" 168 | }, 169 | "type": "graph", 170 | "xaxis": { 171 | "buckets": null, 172 | "mode": "time", 173 | "name": null, 174 | "show": true, 175 | "values": [] 176 | }, 177 | "yaxes": [ 178 | { 179 | "format": "short", 180 | "label": null, 181 | "logBase": 1, 182 | "max": null, 183 | "min": null, 184 | "show": true 185 | }, 186 | { 187 | "format": "short", 188 | "label": null, 189 | "logBase": 1, 190 | "max": null, 191 | "min": null, 192 | "show": true 193 | } 194 | ], 195 | "yaxis": { 196 | "align": false, 197 | "alignLevel": null 198 | } 199 | }, 200 | { 201 | "aliasColors": {}, 202 | "bars": false, 203 | "dashLength": 10, 204 | "dashes": false, 205 | "datasource": "Prometheus", 206 | "decimals": 1, 207 | "fill": 1, 208 | "fillGradient": 0, 209 | "gridPos": { 210 | "h": 7, 211 | "w": 8, 212 | "x": 16, 213 | "y": 0 214 | }, 215 | "hiddenSeries": false, 216 | "id": 4, 217 | "legend": { 218 | "alignAsTable": true, 219 | "avg": true, 220 | "current": true, 221 | "max": true, 222 | "min": false, 223 | "show": true, 224 | "total": false, 225 | "values": true 226 | }, 227 | "lines": true, 228 | "linewidth": 1, 229 | "nullPointMode": "null", 230 | "options": { 231 | "dataLinks": [] 232 | }, 233 | "percentage": false, 234 | "pointradius": 2, 235 | "points": false, 236 | "renderer": "flot", 237 | "seriesOverrides": [], 238 | "spaceLength": 10, 239 | "stack": false, 240 | "steppedLine": false, 241 | 
"targets": [ 242 | { 243 | "expr": "increase(log_warn {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval])", 244 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 245 | "refId": "A" 246 | } 247 | ], 248 | "thresholds": [], 249 | "timeFrom": null, 250 | "timeRegions": [], 251 | "timeShift": null, 252 | "title": "Warnings", 253 | "tooltip": { 254 | "shared": true, 255 | "sort": 1, 256 | "value_type": "individual" 257 | }, 258 | "type": "graph", 259 | "xaxis": { 260 | "buckets": null, 261 | "mode": "time", 262 | "name": null, 263 | "show": true, 264 | "values": [] 265 | }, 266 | "yaxes": [ 267 | { 268 | "format": "short", 269 | "label": null, 270 | "logBase": 1, 271 | "max": null, 272 | "min": null, 273 | "show": true 274 | }, 275 | { 276 | "format": "short", 277 | "label": null, 278 | "logBase": 1, 279 | "max": null, 280 | "min": null, 281 | "show": true 282 | } 283 | ], 284 | "yaxis": { 285 | "align": false, 286 | "alignLevel": null 287 | } 288 | } 289 | ], 290 | "schemaVersion": 22, 291 | "style": "dark", 292 | "tags": [ 293 | "Amazon EMR", 294 | "Logs", 295 | "Prometheus" 296 | ], 297 | "templating": { 298 | "list": [ 299 | { 300 | "auto": false, 301 | "auto_count": 30, 302 | "auto_min": "10s", 303 | "current": { 304 | "selected": false, 305 | "text": "1m", 306 | "value": "1m" 307 | }, 308 | "hide": 0, 309 | "label": "Interval", 310 | "name": "interval", 311 | "options": [ 312 | { 313 | "selected": true, 314 | "text": "1m", 315 | "value": "1m" 316 | }, 317 | { 318 | "selected": false, 319 | "text": "5m", 320 | "value": "5m" 321 | }, 322 | { 323 | "selected": false, 324 | "text": "10m", 325 | "value": "10m" 326 | }, 327 | { 328 | "selected": false, 329 | "text": "30m", 330 | "value": "30m" 331 | }, 332 | { 333 | "selected": false, 334 | "text": "1h", 335 | "value": "1h" 336 | }, 337 | { 338 | "selected": false, 339 | "text": "6h", 340 | "value": "6h" 341 | }, 342 | { 343 | "selected": false, 344 | "text": "12h", 345 | "value": 
"12h" 346 | }, 347 | { 348 | "selected": false, 349 | "text": "1d", 350 | "value": "1d" 351 | }, 352 | { 353 | "selected": false, 354 | "text": "7d", 355 | "value": "7d" 356 | }, 357 | { 358 | "selected": false, 359 | "text": "14d", 360 | "value": "14d" 361 | }, 362 | { 363 | "selected": false, 364 | "text": "30d", 365 | "value": "30d" 366 | } 367 | ], 368 | "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 369 | "refresh": 2, 370 | "skipUrlSync": false, 371 | "type": "interval" 372 | }, 373 | { 374 | "allValue": null, 375 | "current": { 376 | "selected": false, 377 | "text": "All", 378 | "value": "$__all" 379 | }, 380 | "datasource": "Prometheus", 381 | "definition": "label_values(log_fatal, role)", 382 | "hide": 0, 383 | "includeAll": true, 384 | "index": -1, 385 | "label": "Role", 386 | "multi": true, 387 | "name": "role", 388 | "options": [], 389 | "query": "label_values(log_fatal, role)", 390 | "refresh": 1, 391 | "regex": "", 392 | "skipUrlSync": false, 393 | "sort": 0, 394 | "tagValuesQuery": "", 395 | "tags": [], 396 | "tagsQuery": "", 397 | "type": "query", 398 | "useTags": false 399 | }, 400 | { 401 | "allValue": null, 402 | "current": { 403 | "selected": false, 404 | "text": "All", 405 | "value": "$__all" 406 | }, 407 | "datasource": "Prometheus", 408 | "definition": "label_values(log_fatal {role=~\"$role\"}, job)", 409 | "hide": 0, 410 | "includeAll": true, 411 | "index": -1, 412 | "label": "Job", 413 | "multi": true, 414 | "name": "job", 415 | "options": [], 416 | "query": "label_values(log_fatal {role=~\"$role\"}, job)", 417 | "refresh": 1, 418 | "regex": "", 419 | "skipUrlSync": false, 420 | "sort": 1, 421 | "tagValuesQuery": "", 422 | "tags": [], 423 | "tagsQuery": "", 424 | "type": "query", 425 | "useTags": false 426 | }, 427 | { 428 | "allValue": null, 429 | "current": { 430 | "text": "j-IL8MLGKV6TBR", 431 | "value": "j-IL8MLGKV6TBR" 432 | }, 433 | "datasource": "Prometheus", 434 | "definition": "label_values(log_fatal {role=~\"$role\", 
job=~\"$job\"}, cluster_id)", 435 | "hide": 0, 436 | "includeAll": true, 437 | "index": -1, 438 | "label": "Cluster", 439 | "multi": true, 440 | "name": "cluster", 441 | "options": [], 442 | "query": "label_values(log_fatal {role=~\"$role\", job=~\"$job\"}, cluster_id)", 443 | "refresh": 1, 444 | "regex": "", 445 | "skipUrlSync": false, 446 | "sort": 0, 447 | "tagValuesQuery": "", 448 | "tags": [], 449 | "tagsQuery": "", 450 | "type": "query", 451 | "useTags": false 452 | }, 453 | { 454 | "allValue": null, 455 | "current": { 456 | "selected": false, 457 | "text": "All", 458 | "value": "$__all" 459 | }, 460 | "datasource": "Prometheus", 461 | "definition": "label_values(log_fatal {role=~\"$role\", job=~\"$job\", cluster_id=~\"$cluster\"}, instance)", 462 | "hide": 0, 463 | "includeAll": true, 464 | "index": -1, 465 | "label": "Instance", 466 | "multi": true, 467 | "name": "instance", 468 | "options": [], 469 | "query": "label_values(log_fatal {role=~\"$role\", job=~\"$job\", cluster_id=~\"$cluster\"}, instance)", 470 | "refresh": 1, 471 | "regex": "", 472 | "skipUrlSync": false, 473 | "sort": 0, 474 | "tagValuesQuery": "", 475 | "tags": [], 476 | "tagsQuery": "", 477 | "type": "query", 478 | "useTags": false 479 | } 480 | ] 481 | }, 482 | "time": { 483 | "from": "now-6h", 484 | "to": "now" 485 | }, 486 | "timepicker": { 487 | "refresh_intervals": [ 488 | "5s", 489 | "10s", 490 | "30s", 491 | "1m", 492 | "5m", 493 | "15m", 494 | "30m", 495 | "1h", 496 | "2h", 497 | "1d" 498 | ] 499 | }, 500 | "timezone": "", 501 | "title": "Log Metrics", 502 | "uid": "lgWGgreZk", 503 | "variables": { 504 | "list": [] 505 | }, 506 | "version": 1 507 | } -------------------------------------------------------------------------------- /grafana-dashboards/RPC+Metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:31174", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | 
"enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 0, 19 | "iteration": 1595831019444, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "aliasColors": {}, 24 | "bars": false, 25 | "dashLength": 10, 26 | "dashes": false, 27 | "datasource": "Prometheus", 28 | "decimals": 1, 29 | "fill": 1, 30 | "fillGradient": 0, 31 | "gridPos": { 32 | "h": 7, 33 | "w": 8, 34 | "x": 0, 35 | "y": 0 36 | }, 37 | "hiddenSeries": false, 38 | "id": 5, 39 | "legend": { 40 | "alignAsTable": true, 41 | "avg": true, 42 | "current": true, 43 | "max": true, 44 | "min": false, 45 | "show": true, 46 | "total": false, 47 | "values": true 48 | }, 49 | "lines": true, 50 | "linewidth": 1, 51 | "nullPointMode": "null", 52 | "options": { 53 | "dataLinks": [] 54 | }, 55 | "percentage": false, 56 | "pointradius": 2, 57 | "points": false, 58 | "renderer": "flot", 59 | "seriesOverrides": [], 60 | "spaceLength": 10, 61 | "stack": false, 62 | "steppedLine": false, 63 | "targets": [ 64 | { 65 | "expr": "sum by (job, cluster_id, instance, role) (rpc_activity_call_queue_length {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'})", 66 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 67 | "refId": "A" 68 | } 69 | ], 70 | "thresholds": [], 71 | "timeFrom": null, 72 | "timeRegions": [], 73 | "timeShift": null, 74 | "title": "Call Queue Length", 75 | "tooltip": { 76 | "shared": true, 77 | "sort": 0, 78 | "value_type": "individual" 79 | }, 80 | "type": "graph", 81 | "xaxis": { 82 | "buckets": null, 83 | "mode": "time", 84 | "name": null, 85 | "show": true, 86 | "values": [] 87 | }, 88 | "yaxes": [ 89 | { 90 | "format": "short", 91 | "label": null, 92 | "logBase": 1, 93 | "max": null, 94 | "min": null, 95 | "show": true 96 | }, 97 | { 98 | "format": "short", 99 | "label": null, 100 | "logBase": 1, 101 | "max": null, 102 | 
"min": null, 103 | "show": true 104 | } 105 | ], 106 | "yaxis": { 107 | "align": false, 108 | "alignLevel": null 109 | } 110 | }, 111 | { 112 | "aliasColors": {}, 113 | "bars": false, 114 | "dashLength": 10, 115 | "dashes": false, 116 | "datasource": "Prometheus", 117 | "decimals": 1, 118 | "fill": 1, 119 | "fillGradient": 0, 120 | "gridPos": { 121 | "h": 7, 122 | "w": 8, 123 | "x": 8, 124 | "y": 0 125 | }, 126 | "hiddenSeries": false, 127 | "id": 10, 128 | "legend": { 129 | "alignAsTable": true, 130 | "avg": true, 131 | "current": true, 132 | "max": true, 133 | "min": false, 134 | "show": true, 135 | "total": false, 136 | "values": true 137 | }, 138 | "lines": true, 139 | "linewidth": 1, 140 | "nullPointMode": "null", 141 | "options": { 142 | "dataLinks": [] 143 | }, 144 | "percentage": false, 145 | "pointradius": 2, 146 | "points": false, 147 | "renderer": "flot", 148 | "seriesOverrides": [], 149 | "spaceLength": 10, 150 | "stack": false, 151 | "steppedLine": false, 152 | "targets": [ 153 | { 154 | "expr": "sum by (job, cluster_id, instance, role) (rate(rpc_activity_rpc_queue_time_num_ops {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval]))", 155 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 156 | "refId": "A" 157 | } 158 | ], 159 | "thresholds": [], 160 | "timeFrom": null, 161 | "timeRegions": [], 162 | "timeShift": null, 163 | "title": "Call Rate", 164 | "tooltip": { 165 | "shared": true, 166 | "sort": 0, 167 | "value_type": "individual" 168 | }, 169 | "type": "graph", 170 | "xaxis": { 171 | "buckets": null, 172 | "mode": "time", 173 | "name": null, 174 | "show": true, 175 | "values": [] 176 | }, 177 | "yaxes": [ 178 | { 179 | "format": "ops", 180 | "label": null, 181 | "logBase": 1, 182 | "max": null, 183 | "min": null, 184 | "show": true 185 | }, 186 | { 187 | "format": "short", 188 | "label": null, 189 | "logBase": 1, 190 | "max": null, 191 | "min": null, 192 | "show": true 193 | } 194 | ], 195 | "yaxis": { 196 | 
"align": false, 197 | "alignLevel": null 198 | } 199 | }, 200 | { 201 | "aliasColors": {}, 202 | "bars": false, 203 | "dashLength": 10, 204 | "dashes": false, 205 | "datasource": "Prometheus", 206 | "decimals": 1, 207 | "fill": 1, 208 | "fillGradient": 0, 209 | "gridPos": { 210 | "h": 7, 211 | "w": 8, 212 | "x": 16, 213 | "y": 0 214 | }, 215 | "hiddenSeries": false, 216 | "id": 4, 217 | "legend": { 218 | "alignAsTable": true, 219 | "avg": true, 220 | "current": true, 221 | "max": true, 222 | "min": false, 223 | "show": true, 224 | "total": false, 225 | "values": true 226 | }, 227 | "lines": true, 228 | "linewidth": 1, 229 | "nullPointMode": "null", 230 | "options": { 231 | "dataLinks": [] 232 | }, 233 | "percentage": false, 234 | "pointradius": 2, 235 | "points": false, 236 | "renderer": "flot", 237 | "seriesOverrides": [], 238 | "spaceLength": 10, 239 | "stack": false, 240 | "steppedLine": false, 241 | "targets": [ 242 | { 243 | "expr": "sum by (job, cluster_id, instance, role) (rpc_activity_rpc_queue_time_avg_time {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'})", 244 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 245 | "refId": "A" 246 | } 247 | ], 248 | "thresholds": [], 249 | "timeFrom": null, 250 | "timeRegions": [], 251 | "timeShift": null, 252 | "title": "Average Queue Time", 253 | "tooltip": { 254 | "shared": true, 255 | "sort": 0, 256 | "value_type": "individual" 257 | }, 258 | "type": "graph", 259 | "xaxis": { 260 | "buckets": null, 261 | "mode": "time", 262 | "name": null, 263 | "show": true, 264 | "values": [] 265 | }, 266 | "yaxes": [ 267 | { 268 | "format": "ms", 269 | "label": null, 270 | "logBase": 1, 271 | "max": null, 272 | "min": null, 273 | "show": true 274 | }, 275 | { 276 | "format": "short", 277 | "label": null, 278 | "logBase": 1, 279 | "max": null, 280 | "min": null, 281 | "show": true 282 | } 283 | ], 284 | "yaxis": { 285 | "align": false, 286 | "alignLevel": null 287 | } 288 | }, 289 | { 290 | 
"aliasColors": {}, 291 | "bars": false, 292 | "dashLength": 10, 293 | "dashes": false, 294 | "datasource": "Prometheus", 295 | "decimals": 1, 296 | "fill": 1, 297 | "fillGradient": 0, 298 | "gridPos": { 299 | "h": 7, 300 | "w": 8, 301 | "x": 0, 302 | "y": 7 303 | }, 304 | "hiddenSeries": false, 305 | "id": 2, 306 | "legend": { 307 | "alignAsTable": true, 308 | "avg": true, 309 | "current": true, 310 | "max": true, 311 | "min": false, 312 | "show": true, 313 | "total": false, 314 | "values": true 315 | }, 316 | "lines": true, 317 | "linewidth": 1, 318 | "nullPointMode": "null", 319 | "options": { 320 | "dataLinks": [] 321 | }, 322 | "percentage": false, 323 | "pointradius": 2, 324 | "points": false, 325 | "renderer": "flot", 326 | "seriesOverrides": [], 327 | "spaceLength": 10, 328 | "stack": false, 329 | "steppedLine": false, 330 | "targets": [ 331 | { 332 | "expr": "sum by (job, cluster_id, instance, role) (rpc_activity_rpc_processing_time_avg_time {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'})", 333 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 334 | "refId": "A" 335 | } 336 | ], 337 | "thresholds": [], 338 | "timeFrom": null, 339 | "timeRegions": [], 340 | "timeShift": null, 341 | "title": "Average Processing Time", 342 | "tooltip": { 343 | "shared": true, 344 | "sort": 0, 345 | "value_type": "individual" 346 | }, 347 | "type": "graph", 348 | "xaxis": { 349 | "buckets": null, 350 | "mode": "time", 351 | "name": null, 352 | "show": true, 353 | "values": [] 354 | }, 355 | "yaxes": [ 356 | { 357 | "format": "ms", 358 | "label": null, 359 | "logBase": 1, 360 | "max": null, 361 | "min": null, 362 | "show": true 363 | }, 364 | { 365 | "format": "short", 366 | "label": null, 367 | "logBase": 1, 368 | "max": null, 369 | "min": null, 370 | "show": true 371 | } 372 | ], 373 | "yaxis": { 374 | "align": false, 375 | "alignLevel": null 376 | } 377 | }, 378 | { 379 | "aliasColors": {}, 380 | "bars": false, 381 | "dashLength": 10, 382 | 
"dashes": false, 383 | "datasource": "Prometheus", 384 | "decimals": 1, 385 | "fill": 1, 386 | "fillGradient": 0, 387 | "gridPos": { 388 | "h": 7, 389 | "w": 8, 390 | "x": 8, 391 | "y": 7 392 | }, 393 | "hiddenSeries": false, 394 | "id": 7, 395 | "legend": { 396 | "alignAsTable": true, 397 | "avg": true, 398 | "current": true, 399 | "max": true, 400 | "min": false, 401 | "show": true, 402 | "total": false, 403 | "values": true 404 | }, 405 | "lines": true, 406 | "linewidth": 1, 407 | "nullPointMode": "null", 408 | "options": { 409 | "dataLinks": [] 410 | }, 411 | "percentage": false, 412 | "pointradius": 2, 413 | "points": false, 414 | "renderer": "flot", 415 | "seriesOverrides": [], 416 | "spaceLength": 10, 417 | "stack": false, 418 | "steppedLine": false, 419 | "targets": [ 420 | { 421 | "expr": "sum by (job, cluster_id, instance, role)(rate(rpc_activity_received_bytes {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval]))", 422 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 423 | "refId": "A" 424 | } 425 | ], 426 | "thresholds": [], 427 | "timeFrom": null, 428 | "timeRegions": [], 429 | "timeShift": null, 430 | "title": "Data Receive", 431 | "tooltip": { 432 | "shared": true, 433 | "sort": 0, 434 | "value_type": "individual" 435 | }, 436 | "type": "graph", 437 | "xaxis": { 438 | "buckets": null, 439 | "mode": "time", 440 | "name": null, 441 | "show": true, 442 | "values": [] 443 | }, 444 | "yaxes": [ 445 | { 446 | "format": "Bps", 447 | "label": null, 448 | "logBase": 1, 449 | "max": null, 450 | "min": null, 451 | "show": true 452 | }, 453 | { 454 | "format": "short", 455 | "label": null, 456 | "logBase": 1, 457 | "max": null, 458 | "min": null, 459 | "show": true 460 | } 461 | ], 462 | "yaxis": { 463 | "align": false, 464 | "alignLevel": null 465 | } 466 | }, 467 | { 468 | "aliasColors": {}, 469 | "bars": false, 470 | "dashLength": 10, 471 | "dashes": false, 472 | "datasource": "Prometheus", 473 | "decimals": 1, 474 | 
"fill": 1, 475 | "fillGradient": 0, 476 | "gridPos": { 477 | "h": 7, 478 | "w": 8, 479 | "x": 16, 480 | "y": 7 481 | }, 482 | "hiddenSeries": false, 483 | "id": 8, 484 | "legend": { 485 | "alignAsTable": true, 486 | "avg": true, 487 | "current": true, 488 | "max": true, 489 | "min": false, 490 | "show": true, 491 | "total": false, 492 | "values": true 493 | }, 494 | "lines": true, 495 | "linewidth": 1, 496 | "nullPointMode": "null", 497 | "options": { 498 | "dataLinks": [] 499 | }, 500 | "percentage": false, 501 | "pointradius": 2, 502 | "points": false, 503 | "renderer": "flot", 504 | "seriesOverrides": [], 505 | "spaceLength": 10, 506 | "stack": false, 507 | "steppedLine": false, 508 | "targets": [ 509 | { 510 | "expr": "sum by (job, cluster_id, instance, role)(rate(rpc_activity_sent_bytes {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval]))", 511 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 512 | "refId": "A" 513 | } 514 | ], 515 | "thresholds": [], 516 | "timeFrom": null, 517 | "timeRegions": [], 518 | "timeShift": null, 519 | "title": "Data Sent", 520 | "tooltip": { 521 | "shared": true, 522 | "sort": 0, 523 | "value_type": "individual" 524 | }, 525 | "type": "graph", 526 | "xaxis": { 527 | "buckets": null, 528 | "mode": "time", 529 | "name": null, 530 | "show": true, 531 | "values": [] 532 | }, 533 | "yaxes": [ 534 | { 535 | "format": "Bps", 536 | "label": null, 537 | "logBase": 1, 538 | "max": null, 539 | "min": null, 540 | "show": true 541 | }, 542 | { 543 | "format": "short", 544 | "label": null, 545 | "logBase": 1, 546 | "max": null, 547 | "min": null, 548 | "show": true 549 | } 550 | ], 551 | "yaxis": { 552 | "align": false, 553 | "alignLevel": null 554 | } 555 | }, 556 | { 557 | "aliasColors": {}, 558 | "bars": false, 559 | "dashLength": 10, 560 | "dashes": false, 561 | "datasource": "Prometheus", 562 | "decimals": 1, 563 | "fill": 1, 564 | "fillGradient": 0, 565 | "gridPos": { 566 | "h": 7, 567 | "w": 8, 568 | 
"x": 0, 569 | "y": 14 570 | }, 571 | "hiddenSeries": false, 572 | "id": 6, 573 | "legend": { 574 | "alignAsTable": true, 575 | "avg": true, 576 | "current": true, 577 | "max": true, 578 | "min": false, 579 | "show": true, 580 | "total": false, 581 | "values": true 582 | }, 583 | "lines": true, 584 | "linewidth": 1, 585 | "nullPointMode": "null", 586 | "options": { 587 | "dataLinks": [] 588 | }, 589 | "percentage": false, 590 | "pointradius": 2, 591 | "points": false, 592 | "renderer": "flot", 593 | "seriesOverrides": [], 594 | "spaceLength": 10, 595 | "stack": false, 596 | "steppedLine": false, 597 | "targets": [ 598 | { 599 | "expr": "sum by (job, cluster_id, instance, role)(rate(rpc_activity_rpc_slow_calls {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'} [$interval]))", 600 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 601 | "refId": "A" 602 | } 603 | ], 604 | "thresholds": [], 605 | "timeFrom": null, 606 | "timeRegions": [], 607 | "timeShift": null, 608 | "title": "Slow Calls Rate", 609 | "tooltip": { 610 | "shared": true, 611 | "sort": 0, 612 | "value_type": "individual" 613 | }, 614 | "type": "graph", 615 | "xaxis": { 616 | "buckets": null, 617 | "mode": "time", 618 | "name": null, 619 | "show": true, 620 | "values": [] 621 | }, 622 | "yaxes": [ 623 | { 624 | "format": "ops", 625 | "label": null, 626 | "logBase": 1, 627 | "max": null, 628 | "min": null, 629 | "show": true 630 | }, 631 | { 632 | "format": "short", 633 | "label": null, 634 | "logBase": 1, 635 | "max": null, 636 | "min": null, 637 | "show": true 638 | } 639 | ], 640 | "yaxis": { 641 | "align": false, 642 | "alignLevel": null 643 | } 644 | }, 645 | { 646 | "aliasColors": {}, 647 | "bars": false, 648 | "dashLength": 10, 649 | "dashes": false, 650 | "datasource": "Prometheus", 651 | "decimals": 1, 652 | "fill": 1, 653 | "fillGradient": 0, 654 | "gridPos": { 655 | "h": 7, 656 | "w": 8, 657 | "x": 8, 658 | "y": 14 659 | }, 660 | "hiddenSeries": false, 661 | "id": 11, 
662 | "legend": { 663 | "alignAsTable": true, 664 | "avg": true, 665 | "current": true, 666 | "max": true, 667 | "min": false, 668 | "show": true, 669 | "total": false, 670 | "values": true 671 | }, 672 | "lines": true, 673 | "linewidth": 1, 674 | "nullPointMode": "null", 675 | "options": { 676 | "dataLinks": [] 677 | }, 678 | "percentage": false, 679 | "pointradius": 2, 680 | "points": false, 681 | "renderer": "flot", 682 | "seriesOverrides": [], 683 | "spaceLength": 10, 684 | "stack": false, 685 | "steppedLine": false, 686 | "targets": [ 687 | { 688 | "expr": "sum by (job, cluster_id, instance, role) (rpc_activity_num_open_connections {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'})", 689 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 690 | "refId": "A" 691 | } 692 | ], 693 | "thresholds": [], 694 | "timeFrom": null, 695 | "timeRegions": [], 696 | "timeShift": null, 697 | "title": "Open Connections", 698 | "tooltip": { 699 | "shared": true, 700 | "sort": 0, 701 | "value_type": "individual" 702 | }, 703 | "type": "graph", 704 | "xaxis": { 705 | "buckets": null, 706 | "mode": "time", 707 | "name": null, 708 | "show": true, 709 | "values": [] 710 | }, 711 | "yaxes": [ 712 | { 713 | "format": "short", 714 | "label": null, 715 | "logBase": 1, 716 | "max": null, 717 | "min": null, 718 | "show": true 719 | }, 720 | { 721 | "format": "short", 722 | "label": null, 723 | "logBase": 1, 724 | "max": null, 725 | "min": null, 726 | "show": true 727 | } 728 | ], 729 | "yaxis": { 730 | "align": false, 731 | "alignLevel": null 732 | } 733 | }, 734 | { 735 | "aliasColors": {}, 736 | "bars": false, 737 | "dashLength": 10, 738 | "dashes": false, 739 | "datasource": "Prometheus", 740 | "decimals": 1, 741 | "fill": 1, 742 | "fillGradient": 0, 743 | "gridPos": { 744 | "h": 7, 745 | "w": 8, 746 | "x": 16, 747 | "y": 14 748 | }, 749 | "hiddenSeries": false, 750 | "id": 12, 751 | "legend": { 752 | "alignAsTable": true, 753 | "avg": true, 754 | "current": 
true, 755 | "max": true, 756 | "min": false, 757 | "show": true, 758 | "total": false, 759 | "values": true 760 | }, 761 | "lines": true, 762 | "linewidth": 1, 763 | "nullPointMode": "null", 764 | "options": { 765 | "dataLinks": [] 766 | }, 767 | "percentage": false, 768 | "pointradius": 2, 769 | "points": false, 770 | "renderer": "flot", 771 | "seriesOverrides": [], 772 | "spaceLength": 10, 773 | "stack": false, 774 | "steppedLine": false, 775 | "targets": [ 776 | { 777 | "expr": "sum by (job, cluster_id, instance, role) (rpc_activity_num_dropped_connections {role=~'$role', job=~'$job', cluster_id=~'$cluster', instance=~'$instance'})", 778 | "legendFormat": "{{role}}_{{job}}_{{instance}}", 779 | "refId": "A" 780 | } 781 | ], 782 | "thresholds": [], 783 | "timeFrom": null, 784 | "timeRegions": [], 785 | "timeShift": null, 786 | "title": "Dropped Connections", 787 | "tooltip": { 788 | "shared": true, 789 | "sort": 0, 790 | "value_type": "individual" 791 | }, 792 | "type": "graph", 793 | "xaxis": { 794 | "buckets": null, 795 | "mode": "time", 796 | "name": null, 797 | "show": true, 798 | "values": [] 799 | }, 800 | "yaxes": [ 801 | { 802 | "format": "short", 803 | "label": null, 804 | "logBase": 1, 805 | "max": null, 806 | "min": null, 807 | "show": true 808 | }, 809 | { 810 | "format": "short", 811 | "label": null, 812 | "logBase": 1, 813 | "max": null, 814 | "min": null, 815 | "show": true 816 | } 817 | ], 818 | "yaxis": { 819 | "align": false, 820 | "alignLevel": null 821 | } 822 | } 823 | ], 824 | "schemaVersion": 22, 825 | "style": "dark", 826 | "tags": [ 827 | "Amazon EMR", 828 | "RPC", 829 | "Prometheus" 830 | ], 831 | "templating": { 832 | "list": [ 833 | { 834 | "auto": false, 835 | "auto_count": 30, 836 | "auto_min": "10s", 837 | "current": { 838 | "selected": false, 839 | "text": "1m", 840 | "value": "1m" 841 | }, 842 | "hide": 0, 843 | "label": "Interval", 844 | "name": "interval", 845 | "options": [ 846 | { 847 | "selected": true, 848 | "text": "1m", 849 
| "value": "1m" 850 | }, 851 | { 852 | "selected": false, 853 | "text": "5m", 854 | "value": "5m" 855 | }, 856 | { 857 | "selected": false, 858 | "text": "10m", 859 | "value": "10m" 860 | }, 861 | { 862 | "selected": false, 863 | "text": "30m", 864 | "value": "30m" 865 | }, 866 | { 867 | "selected": false, 868 | "text": "1h", 869 | "value": "1h" 870 | }, 871 | { 872 | "selected": false, 873 | "text": "6h", 874 | "value": "6h" 875 | }, 876 | { 877 | "selected": false, 878 | "text": "12h", 879 | "value": "12h" 880 | }, 881 | { 882 | "selected": false, 883 | "text": "1d", 884 | "value": "1d" 885 | }, 886 | { 887 | "selected": false, 888 | "text": "7d", 889 | "value": "7d" 890 | }, 891 | { 892 | "selected": false, 893 | "text": "14d", 894 | "value": "14d" 895 | }, 896 | { 897 | "selected": false, 898 | "text": "30d", 899 | "value": "30d" 900 | } 901 | ], 902 | "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 903 | "refresh": 2, 904 | "skipUrlSync": false, 905 | "type": "interval" 906 | }, 907 | { 908 | "allValue": null, 909 | "current": { 910 | "selected": false, 911 | "text": "All", 912 | "value": "$__all" 913 | }, 914 | "datasource": "Prometheus", 915 | "definition": "label_values(rpc_activity_call_queue_length, role)", 916 | "hide": 0, 917 | "includeAll": true, 918 | "index": -1, 919 | "label": "Role", 920 | "multi": true, 921 | "name": "role", 922 | "options": [], 923 | "query": "label_values(rpc_activity_call_queue_length, role)", 924 | "refresh": 1, 925 | "regex": "", 926 | "skipUrlSync": false, 927 | "sort": 0, 928 | "tagValuesQuery": "", 929 | "tags": [], 930 | "tagsQuery": "", 931 | "type": "query", 932 | "useTags": false 933 | }, 934 | { 935 | "allValue": null, 936 | "current": { 937 | "selected": false, 938 | "text": "All", 939 | "value": "$__all" 940 | }, 941 | "datasource": "Prometheus", 942 | "definition": "label_values(rpc_activity_call_queue_length {role=~\"$role\"}, job)", 943 | "hide": 0, 944 | "includeAll": true, 945 | "index": -1, 946 | "label": 
"Job", 947 | "multi": true, 948 | "name": "job", 949 | "options": [], 950 | "query": "label_values(rpc_activity_call_queue_length {role=~\"$role\"}, job)", 951 | "refresh": 1, 952 | "regex": "", 953 | "skipUrlSync": false, 954 | "sort": 1, 955 | "tagValuesQuery": "", 956 | "tags": [], 957 | "tagsQuery": "", 958 | "type": "query", 959 | "useTags": false 960 | }, 961 | { 962 | "allValue": null, 963 | "current": { 964 | "selected": false, 965 | "text": "All", 966 | "value": "$__all" 967 | }, 968 | "datasource": "Prometheus", 969 | "definition": "label_values(rpc_activity_call_queue_length {role=~\"$role\", job=~'$job'}, cluster_id)", 970 | "hide": 0, 971 | "includeAll": true, 972 | "index": -1, 973 | "label": "Cluster", 974 | "multi": true, 975 | "name": "cluster", 976 | "options": [], 977 | "query": "label_values(rpc_activity_call_queue_length {role=~\"$role\", job=~'$job'}, cluster_id)", 978 | "refresh": 1, 979 | "regex": "", 980 | "skipUrlSync": false, 981 | "sort": 0, 982 | "tagValuesQuery": "", 983 | "tags": [], 984 | "tagsQuery": "", 985 | "type": "query", 986 | "useTags": false 987 | }, 988 | { 989 | "allValue": null, 990 | "current": { 991 | "selected": false, 992 | "text": "All", 993 | "value": "$__all" 994 | }, 995 | "datasource": "Prometheus", 996 | "definition": "label_values(rpc_activity_call_queue_length {role=~'$role', job=~\"$job\", cluster_id=~'$cluster'}, instance)", 997 | "hide": 0, 998 | "includeAll": true, 999 | "index": -1, 1000 | "label": "Instance", 1001 | "multi": true, 1002 | "name": "instance", 1003 | "options": [], 1004 | "query": "label_values(rpc_activity_call_queue_length {role=~'$role', job=~\"$job\", cluster_id=~'$cluster'}, instance)", 1005 | "refresh": 1, 1006 | "regex": "", 1007 | "skipUrlSync": false, 1008 | "sort": 0, 1009 | "tagValuesQuery": "", 1010 | "tags": [], 1011 | "tagsQuery": "", 1012 | "type": "query", 1013 | "useTags": false 1014 | } 1015 | ] 1016 | }, 1017 | "time": { 1018 | "from": "now-1h", 1019 | "to": "now" 1020 | 
}, 1021 | "timepicker": { 1022 | "refresh_intervals": [ 1023 | "5s", 1024 | "10s", 1025 | "30s", 1026 | "1m", 1027 | "5m", 1028 | "15m", 1029 | "30m", 1030 | "1h", 1031 | "2h", 1032 | "1d" 1033 | ] 1034 | }, 1035 | "timezone": "", 1036 | "title": "RPC Metrics", 1037 | "uid": "IRkRt96Zz", 1038 | "variables": { 1039 | "list": [] 1040 | }, 1041 | "version": 1 1042 | } -------------------------------------------------------------------------------- /grafana-dashboards/JVM+Metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:29326", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 0, 19 | "iteration": 1595829318370, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "cacheTimeout": null, 24 | "columns": [], 25 | "datasource": "Prometheus", 26 | "fontSize": "100%", 27 | "gridPos": { 28 | "h": 6, 29 | "w": 12, 30 | "x": 0, 31 | "y": 0 32 | }, 33 | "id": 70, 34 | "links": [], 35 | "pageSize": null, 36 | "pluginVersion": "6.6.2", 37 | "showHeader": true, 38 | "sort": { 39 | "col": 4, 40 | "desc": false 41 | }, 42 | "styles": [ 43 | { 44 | "alias": "Time", 45 | "align": "auto", 46 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 47 | "pattern": "Time", 48 | "type": "hidden" 49 | }, 50 | { 51 | "alias": "Uptime", 52 | "align": "auto", 53 | "colorMode": null, 54 | "colors": [ 55 | "rgba(245, 54, 54, 0.9)", 56 | "rgba(237, 129, 40, 0.89)", 57 | "rgba(50, 172, 45, 0.97)" 58 | ], 59 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 60 | "decimals": null, 61 | "mappingType": 1, 62 | "pattern": "Value", 63 | "thresholds": [], 64 | "type": "number", 65 | "unit": "s" 66 | }, 67 | { 68 | "alias": "Instance", 69 | "align": "left", 70 | "colorMode": null, 71 | "colors": 
[ 72 | "rgba(245, 54, 54, 0.9)", 73 | "rgba(237, 129, 40, 0.89)", 74 | "rgba(50, 172, 45, 0.97)" 75 | ], 76 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 77 | "decimals": 2, 78 | "mappingType": 1, 79 | "pattern": "Metric", 80 | "preserveFormat": false, 81 | "thresholds": [], 82 | "type": "string", 83 | "unit": "short" 84 | }, 85 | { 86 | "alias": "", 87 | "align": "right", 88 | "colorMode": null, 89 | "colors": [ 90 | "rgba(245, 54, 54, 0.9)", 91 | "rgba(237, 129, 40, 0.89)", 92 | "rgba(50, 172, 45, 0.97)" 93 | ], 94 | "decimals": null, 95 | "pattern": "/.*/", 96 | "thresholds": [], 97 | "type": "number", 98 | "unit": "s" 99 | } 100 | ], 101 | "targets": [ 102 | { 103 | "expr": "time()-process_start_time_seconds {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 104 | "format": "time_series", 105 | "instant": true, 106 | "legendFormat": "{{instance}}", 107 | "refId": "A" 108 | } 109 | ], 110 | "timeFrom": null, 111 | "timeShift": null, 112 | "title": "Uptime", 113 | "transform": "timeseries_to_rows", 114 | "type": "table" 115 | }, 116 | { 117 | "cacheTimeout": null, 118 | "columns": [], 119 | "datasource": "Prometheus", 120 | "fontSize": "100%", 121 | "gridPos": { 122 | "h": 6, 123 | "w": 12, 124 | "x": 12, 125 | "y": 0 126 | }, 127 | "id": 71, 128 | "links": [], 129 | "pageSize": null, 130 | "pluginVersion": "6.6.2", 131 | "showHeader": true, 132 | "sort": { 133 | "col": 4, 134 | "desc": false 135 | }, 136 | "styles": [ 137 | { 138 | "alias": "Time", 139 | "align": "auto", 140 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 141 | "pattern": "Time", 142 | "type": "hidden" 143 | }, 144 | { 145 | "alias": "Start Time", 146 | "align": "auto", 147 | "colorMode": null, 148 | "colors": [ 149 | "rgba(245, 54, 54, 0.9)", 150 | "rgba(237, 129, 40, 0.89)", 151 | "rgba(50, 172, 45, 0.97)" 152 | ], 153 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 154 | "decimals": null, 155 | "mappingType": 1, 156 | "pattern": "Value", 157 | "thresholds": [], 158 | "type": "number", 159 | "unit": 
"dateTimeAsIso" 160 | }, 161 | { 162 | "alias": "Instance", 163 | "align": "left", 164 | "colorMode": null, 165 | "colors": [ 166 | "rgba(245, 54, 54, 0.9)", 167 | "rgba(237, 129, 40, 0.89)", 168 | "rgba(50, 172, 45, 0.97)" 169 | ], 170 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 171 | "decimals": 2, 172 | "mappingType": 1, 173 | "pattern": "Metric", 174 | "preserveFormat": false, 175 | "thresholds": [], 176 | "type": "string", 177 | "unit": "short" 178 | }, 179 | { 180 | "alias": "", 181 | "align": "right", 182 | "colorMode": null, 183 | "colors": [ 184 | "rgba(245, 54, 54, 0.9)", 185 | "rgba(237, 129, 40, 0.89)", 186 | "rgba(50, 172, 45, 0.97)" 187 | ], 188 | "decimals": null, 189 | "pattern": "/.*/", 190 | "thresholds": [], 191 | "type": "number", 192 | "unit": "s" 193 | } 194 | ], 195 | "targets": [ 196 | { 197 | "expr": "process_start_time_seconds {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}*1000", 198 | "format": "time_series", 199 | "instant": true, 200 | "legendFormat": "{{instance}}", 201 | "refId": "A" 202 | } 203 | ], 204 | "timeFrom": null, 205 | "timeShift": null, 206 | "title": "Start Time", 207 | "transform": "timeseries_to_rows", 208 | "type": "table" 209 | }, 210 | { 211 | "aliasColors": {}, 212 | "bars": false, 213 | "cacheTimeout": null, 214 | "dashLength": 10, 215 | "dashes": false, 216 | "datasource": "Prometheus", 217 | "decimals": 1, 218 | "fill": 1, 219 | "fillGradient": 0, 220 | "gridPos": { 221 | "h": 6, 222 | "w": 8, 223 | "x": 0, 224 | "y": 6 225 | }, 226 | "hiddenSeries": false, 227 | "id": 21, 228 | "interval": "", 229 | "legend": { 230 | "alignAsTable": true, 231 | "avg": true, 232 | "current": true, 233 | "max": true, 234 | "min": false, 235 | "show": true, 236 | "total": false, 237 | "values": true 238 | }, 239 | "lines": true, 240 | "linewidth": 1, 241 | "links": [], 242 | "nullPointMode": "null", 243 | "options": { 244 | "dataLinks": [] 245 | }, 246 | "percentage": false, 247 | "pointradius": 2, 248 | "points": false, 
249 | "renderer": "flot", 250 | "seriesOverrides": [], 251 | "spaceLength": 10, 252 | "stack": false, 253 | "steppedLine": false, 254 | "targets": [ 255 | { 256 | "expr": "java_lang_memory_heapmemoryusage_used {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 257 | "legendFormat": "{{instance}}_used", 258 | "refId": "A" 259 | }, 260 | { 261 | "expr": "java_lang_memory_heapmemoryusage_committed {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 262 | "legendFormat": "{{instance}}_committed", 263 | "refId": "C" 264 | }, 265 | { 266 | "expr": "java_lang_memory_heapmemoryusage_max {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 267 | "intervalFactor": 1, 268 | "legendFormat": "{{instance}}_max_configured", 269 | "refId": "B" 270 | } 271 | ], 272 | "thresholds": [], 273 | "timeFrom": null, 274 | "timeRegions": [], 275 | "timeShift": null, 276 | "title": "Heap", 277 | "tooltip": { 278 | "shared": true, 279 | "sort": 0, 280 | "value_type": "individual" 281 | }, 282 | "type": "graph", 283 | "xaxis": { 284 | "buckets": null, 285 | "mode": "time", 286 | "name": null, 287 | "show": true, 288 | "values": [] 289 | }, 290 | "yaxes": [ 291 | { 292 | "format": "decbytes", 293 | "label": null, 294 | "logBase": 1, 295 | "max": null, 296 | "min": null, 297 | "show": true 298 | }, 299 | { 300 | "format": "short", 301 | "label": null, 302 | "logBase": 1, 303 | "max": null, 304 | "min": null, 305 | "show": true 306 | } 307 | ], 308 | "yaxis": { 309 | "align": false, 310 | "alignLevel": null 311 | } 312 | }, 313 | { 314 | "aliasColors": {}, 315 | "bars": false, 316 | "cacheTimeout": null, 317 | "dashLength": 10, 318 | "dashes": false, 319 | "datasource": "Prometheus", 320 | "decimals": 1, 321 | "fill": 1, 322 | "fillGradient": 0, 323 | "gridPos": { 324 | "h": 6, 325 | "w": 8, 326 | "x": 8, 327 | "y": 6 328 | }, 329 | "hiddenSeries": false, 330 | "id": 60, 331 | "interval": "", 332 | "legend": { 333 | "alignAsTable": true, 334 | "avg": true, 335 
| "current": true, 336 | "max": true, 337 | "min": false, 338 | "show": true, 339 | "total": false, 340 | "values": true 341 | }, 342 | "lines": true, 343 | "linewidth": 1, 344 | "links": [], 345 | "nullPointMode": "null", 346 | "options": { 347 | "dataLinks": [] 348 | }, 349 | "percentage": false, 350 | "pointradius": 2, 351 | "points": false, 352 | "renderer": "flot", 353 | "seriesOverrides": [], 354 | "spaceLength": 10, 355 | "stack": false, 356 | "steppedLine": false, 357 | "targets": [ 358 | { 359 | "expr": "java_lang_memory_heapmemoryusage_used {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'} / java_lang_memory_heapmemoryusage_max {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 360 | "legendFormat": "{{instance}}", 361 | "refId": "A" 362 | } 363 | ], 364 | "thresholds": [], 365 | "timeFrom": null, 366 | "timeRegions": [], 367 | "timeShift": null, 368 | "title": "Heap Usage %", 369 | "tooltip": { 370 | "shared": true, 371 | "sort": 0, 372 | "value_type": "individual" 373 | }, 374 | "type": "graph", 375 | "xaxis": { 376 | "buckets": null, 377 | "mode": "time", 378 | "name": null, 379 | "show": true, 380 | "values": [] 381 | }, 382 | "yaxes": [ 383 | { 384 | "decimals": null, 385 | "format": "percentunit", 386 | "label": "", 387 | "logBase": 1, 388 | "max": null, 389 | "min": null, 390 | "show": true 391 | }, 392 | { 393 | "format": "short", 394 | "label": null, 395 | "logBase": 1, 396 | "max": null, 397 | "min": null, 398 | "show": true 399 | } 400 | ], 401 | "yaxis": { 402 | "align": false, 403 | "alignLevel": null 404 | } 405 | }, 406 | { 407 | "aliasColors": {}, 408 | "bars": false, 409 | "cacheTimeout": null, 410 | "dashLength": 10, 411 | "dashes": false, 412 | "datasource": "Prometheus", 413 | "decimals": 1, 414 | "fill": 1, 415 | "fillGradient": 0, 416 | "gridPos": { 417 | "h": 6, 418 | "w": 8, 419 | "x": 16, 420 | "y": 6 421 | }, 422 | "hiddenSeries": false, 423 | "id": 61, 424 | "interval": "", 425 | "legend": { 426 | 
"alignAsTable": true, 427 | "avg": true, 428 | "current": true, 429 | "max": true, 430 | "min": false, 431 | "show": true, 432 | "total": false, 433 | "values": true 434 | }, 435 | "lines": true, 436 | "linewidth": 1, 437 | "links": [], 438 | "nullPointMode": "null", 439 | "options": { 440 | "dataLinks": [] 441 | }, 442 | "percentage": false, 443 | "pointradius": 2, 444 | "points": false, 445 | "renderer": "flot", 446 | "seriesOverrides": [], 447 | "spaceLength": 10, 448 | "stack": false, 449 | "steppedLine": false, 450 | "targets": [ 451 | { 452 | "expr": "java_lang_memory_nonheapmemoryusage_used {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 453 | "legendFormat": "{{instance}}_used", 454 | "refId": "A" 455 | }, 456 | { 457 | "expr": "java_lang_memory_nonheapmemoryusage_committed {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 458 | "legendFormat": "{{instance}}_committed", 459 | "refId": "C" 460 | } 461 | ], 462 | "thresholds": [], 463 | "timeFrom": null, 464 | "timeRegions": [], 465 | "timeShift": null, 466 | "title": "Non Heap", 467 | "tooltip": { 468 | "shared": true, 469 | "sort": 0, 470 | "value_type": "individual" 471 | }, 472 | "type": "graph", 473 | "xaxis": { 474 | "buckets": null, 475 | "mode": "time", 476 | "name": null, 477 | "show": true, 478 | "values": [] 479 | }, 480 | "yaxes": [ 481 | { 482 | "format": "decbytes", 483 | "label": null, 484 | "logBase": 1, 485 | "max": null, 486 | "min": null, 487 | "show": true 488 | }, 489 | { 490 | "format": "short", 491 | "label": null, 492 | "logBase": 1, 493 | "max": null, 494 | "min": null, 495 | "show": true 496 | } 497 | ], 498 | "yaxis": { 499 | "align": false, 500 | "alignLevel": null 501 | } 502 | }, 503 | { 504 | "aliasColors": {}, 505 | "bars": false, 506 | "cacheTimeout": null, 507 | "dashLength": 10, 508 | "dashes": false, 509 | "datasource": "Prometheus", 510 | "decimals": 1, 511 | "fill": 1, 512 | "fillGradient": 0, 513 | "gridPos": { 514 | "h": 6, 515 | "w": 8, 
516 | "x": 0, 517 | "y": 12 518 | }, 519 | "hiddenSeries": false, 520 | "id": 68, 521 | "interval": "", 522 | "legend": { 523 | "alignAsTable": true, 524 | "avg": true, 525 | "current": true, 526 | "max": true, 527 | "min": false, 528 | "show": true, 529 | "total": false, 530 | "values": true 531 | }, 532 | "lines": true, 533 | "linewidth": 1, 534 | "links": [], 535 | "nullPointMode": "null", 536 | "options": { 537 | "dataLinks": [] 538 | }, 539 | "percentage": false, 540 | "pointradius": 2, 541 | "points": false, 542 | "renderer": "flot", 543 | "seriesOverrides": [], 544 | "spaceLength": 10, 545 | "stack": false, 546 | "steppedLine": false, 547 | "targets": [ 548 | { 549 | "expr": "increase(java_lang_garbagecollector_collectioncount {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'} [$interval])", 550 | "instant": false, 551 | "legendFormat": "{{instance}}_{{name}}", 552 | "refId": "A" 553 | } 554 | ], 555 | "thresholds": [], 556 | "timeFrom": null, 557 | "timeRegions": [], 558 | "timeShift": null, 559 | "title": "GC Count", 560 | "tooltip": { 561 | "shared": true, 562 | "sort": 0, 563 | "value_type": "individual" 564 | }, 565 | "type": "graph", 566 | "xaxis": { 567 | "buckets": null, 568 | "mode": "time", 569 | "name": null, 570 | "show": true, 571 | "values": [] 572 | }, 573 | "yaxes": [ 574 | { 575 | "decimals": null, 576 | "format": "short", 577 | "label": null, 578 | "logBase": 1, 579 | "max": null, 580 | "min": null, 581 | "show": true 582 | }, 583 | { 584 | "format": "short", 585 | "label": null, 586 | "logBase": 1, 587 | "max": null, 588 | "min": null, 589 | "show": true 590 | } 591 | ], 592 | "yaxis": { 593 | "align": false, 594 | "alignLevel": null 595 | } 596 | }, 597 | { 598 | "aliasColors": {}, 599 | "bars": false, 600 | "cacheTimeout": null, 601 | "dashLength": 10, 602 | "dashes": false, 603 | "datasource": "Prometheus", 604 | "decimals": 1, 605 | "fill": 1, 606 | "fillGradient": 0, 607 | "gridPos": { 608 | "h": 6, 609 | "w": 8, 610 | "x": 
8, 611 | "y": 12 612 | }, 613 | "hiddenSeries": false, 614 | "id": 69, 615 | "interval": "", 616 | "legend": { 617 | "alignAsTable": true, 618 | "avg": true, 619 | "current": true, 620 | "max": true, 621 | "min": false, 622 | "show": true, 623 | "total": false, 624 | "values": true 625 | }, 626 | "lines": true, 627 | "linewidth": 1, 628 | "links": [], 629 | "nullPointMode": "null", 630 | "options": { 631 | "dataLinks": [] 632 | }, 633 | "percentage": false, 634 | "pointradius": 2, 635 | "points": false, 636 | "renderer": "flot", 637 | "seriesOverrides": [], 638 | "spaceLength": 10, 639 | "stack": false, 640 | "steppedLine": false, 641 | "targets": [ 642 | { 643 | "expr": "java_lang_garbagecollector_collectiontime {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}/ java_lang_garbagecollector_collectioncount {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 644 | "instant": false, 645 | "legendFormat": "{{instance}}_{{name}}", 646 | "refId": "A" 647 | } 648 | ], 649 | "thresholds": [], 650 | "timeFrom": null, 651 | "timeRegions": [], 652 | "timeShift": null, 653 | "title": "GC Time", 654 | "tooltip": { 655 | "shared": true, 656 | "sort": 0, 657 | "value_type": "individual" 658 | }, 659 | "type": "graph", 660 | "xaxis": { 661 | "buckets": null, 662 | "mode": "time", 663 | "name": null, 664 | "show": true, 665 | "values": [] 666 | }, 667 | "yaxes": [ 668 | { 669 | "decimals": null, 670 | "format": "ms", 671 | "label": "", 672 | "logBase": 1, 673 | "max": null, 674 | "min": null, 675 | "show": true 676 | }, 677 | { 678 | "format": "short", 679 | "label": null, 680 | "logBase": 1, 681 | "max": null, 682 | "min": null, 683 | "show": true 684 | } 685 | ], 686 | "yaxis": { 687 | "align": false, 688 | "alignLevel": null 689 | } 690 | }, 691 | { 692 | "aliasColors": {}, 693 | "bars": false, 694 | "cacheTimeout": null, 695 | "dashLength": 10, 696 | "dashes": false, 697 | "datasource": "Prometheus", 698 | "decimals": 1, 699 | "fill": 1, 700 |
"fillGradient": 0, 701 | "gridPos": { 702 | "h": 6, 703 | "w": 8, 704 | "x": 16, 705 | "y": 12 706 | }, 707 | "hiddenSeries": false, 708 | "id": 66, 709 | "interval": "", 710 | "legend": { 711 | "alignAsTable": true, 712 | "avg": true, 713 | "current": true, 714 | "max": true, 715 | "min": false, 716 | "show": true, 717 | "total": false, 718 | "values": true 719 | }, 720 | "lines": true, 721 | "linewidth": 1, 722 | "links": [], 723 | "nullPointMode": "null", 724 | "options": { 725 | "dataLinks": [] 726 | }, 727 | "percentage": false, 728 | "pointradius": 2, 729 | "points": false, 730 | "renderer": "flot", 731 | "seriesOverrides": [], 732 | "spaceLength": 10, 733 | "stack": false, 734 | "steppedLine": false, 735 | "targets": [ 736 | { 737 | "expr": "rate( java_lang_garbagecollector_collectiontime {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'} [$interval]) / 1000", 738 | "legendFormat": "{{instance}}_{{name}}", 739 | "refId": "A" 740 | } 741 | ], 742 | "thresholds": [], 743 | "timeFrom": null, 744 | "timeRegions": [], 745 | "timeShift": null, 746 | "title": "GC Time %", 747 | "tooltip": { 748 | "shared": true, 749 | "sort": 0, 750 | "value_type": "individual" 751 | }, 752 | "type": "graph", 753 | "xaxis": { 754 | "buckets": null, 755 | "mode": "time", 756 | "name": null, 757 | "show": true, 758 | "values": [] 759 | }, 760 | "yaxes": [ 761 | { 762 | "format": "percentunit", 763 | "label": "", 764 | "logBase": 1, 765 | "max": null, 766 | "min": null, 767 | "show": true 768 | }, 769 | { 770 | "format": "short", 771 | "label": null, 772 | "logBase": 1, 773 | "max": null, 774 | "min": null, 775 | "show": true 776 | } 777 | ], 778 | "yaxis": { 779 | "align": false, 780 | "alignLevel": null 781 | } 782 | }, 783 | { 784 | "aliasColors": {}, 785 | "bars": false, 786 | "cacheTimeout": null, 787 | "dashLength": 10, 788 | "dashes": false, 789 | "datasource": "Prometheus", 790 | "decimals": 1, 791 | "fill": 1, 792 | "fillGradient": 0, 793 | "gridPos": { 794 | "h": 6, 795
| "w": 8, 796 | "x": 0, 797 | "y": 18 798 | }, 799 | "hiddenSeries": false, 800 | "id": 64, 801 | "interval": "", 802 | "legend": { 803 | "alignAsTable": true, 804 | "avg": true, 805 | "current": true, 806 | "max": true, 807 | "min": false, 808 | "show": true, 809 | "total": false, 810 | "values": true 811 | }, 812 | "lines": true, 813 | "linewidth": 1, 814 | "links": [], 815 | "nullPointMode": "null", 816 | "options": { 817 | "dataLinks": [] 818 | }, 819 | "percentage": false, 820 | "pointradius": 2, 821 | "points": false, 822 | "renderer": "flot", 823 | "seriesOverrides": [], 824 | "spaceLength": 10, 825 | "stack": false, 826 | "steppedLine": false, 827 | "targets": [ 828 | { 829 | "expr": "java_lang_memorypool_collectionusage_used {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'} ", 830 | "legendFormat": "{{instance}}_{{name}}", 831 | "refId": "A" 832 | } 833 | ], 834 | "thresholds": [], 835 | "timeFrom": null, 836 | "timeRegions": [], 837 | "timeShift": null, 838 | "title": "Memory Pool Sizes", 839 | "tooltip": { 840 | "shared": true, 841 | "sort": 0, 842 | "value_type": "individual" 843 | }, 844 | "type": "graph", 845 | "xaxis": { 846 | "buckets": null, 847 | "mode": "time", 848 | "name": null, 849 | "show": true, 850 | "values": [] 851 | }, 852 | "yaxes": [ 853 | { 854 | "format": "decbytes", 855 | "label": null, 856 | "logBase": 1, 857 | "max": null, 858 | "min": null, 859 | "show": true 860 | }, 861 | { 862 | "format": "short", 863 | "label": null, 864 | "logBase": 1, 865 | "max": null, 866 | "min": null, 867 | "show": true 868 | } 869 | ], 870 | "yaxis": { 871 | "align": false, 872 | "alignLevel": null 873 | } 874 | }, 875 | { 876 | "aliasColors": {}, 877 | "bars": false, 878 | "cacheTimeout": null, 879 | "dashLength": 10, 880 | "dashes": false, 881 | "datasource": "Prometheus", 882 | "decimals": 1, 883 | "fill": 1, 884 | "fillGradient": 0, 885 | "gridPos": { 886 | "h": 6, 887 | "w": 8, 888 | "x": 8, 889 | "y": 18 890 | }, 891 | "hiddenSeries": 
false, 892 | "id": 67, 893 | "interval": "", 894 | "legend": { 895 | "alignAsTable": true, 896 | "avg": true, 897 | "current": true, 898 | "max": true, 899 | "min": false, 900 | "show": true, 901 | "total": false, 902 | "values": true 903 | }, 904 | "lines": true, 905 | "linewidth": 1, 906 | "links": [], 907 | "nullPointMode": "null", 908 | "options": { 909 | "dataLinks": [] 910 | }, 911 | "percentage": false, 912 | "pointradius": 2, 913 | "points": false, 914 | "renderer": "flot", 915 | "seriesOverrides": [], 916 | "spaceLength": 10, 917 | "stack": false, 918 | "steppedLine": false, 919 | "targets": [ 920 | { 921 | "expr": "sum by (job, cluster_id, instance)(java_lang_garbagecollector_lastgcinfo_memoryusageaftergc_used {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'})", 922 | "legendFormat": "{{instance}}", 923 | "refId": "A" 924 | } 925 | ], 926 | "thresholds": [], 927 | "timeFrom": null, 928 | "timeRegions": [], 929 | "timeShift": null, 930 | "title": "Memory Used After GC", 931 | "tooltip": { 932 | "shared": true, 933 | "sort": 0, 934 | "value_type": "individual" 935 | }, 936 | "type": "graph", 937 | "xaxis": { 938 | "buckets": null, 939 | "mode": "time", 940 | "name": null, 941 | "show": true, 942 | "values": [] 943 | }, 944 | "yaxes": [ 945 | { 946 | "format": "decbytes", 947 | "label": null, 948 | "logBase": 1, 949 | "max": null, 950 | "min": null, 951 | "show": true 952 | }, 953 | { 954 | "format": "short", 955 | "label": null, 956 | "logBase": 1, 957 | "max": null, 958 | "min": null, 959 | "show": true 960 | } 961 | ], 962 | "yaxis": { 963 | "align": false, 964 | "alignLevel": null 965 | } 966 | }, 967 | { 968 | "aliasColors": {}, 969 | "bars": false, 970 | "cacheTimeout": null, 971 | "dashLength": 10, 972 | "dashes": false, 973 | "datasource": "Prometheus", 974 | "decimals": 1, 975 | "fill": 1, 976 | "fillGradient": 0, 977 | "gridPos": { 978 | "h": 6, 979 | "w": 8, 980 | "x": 16, 981 | "y": 18 982 | }, 983 | "hiddenSeries": false, 984 | "id": 
49, 985 | "interval": "", 986 | "legend": { 987 | "alignAsTable": true, 988 | "avg": true, 989 | "current": true, 990 | "max": true, 991 | "min": false, 992 | "show": true, 993 | "total": false, 994 | "values": true 995 | }, 996 | "lines": true, 997 | "linewidth": 1, 998 | "links": [], 999 | "nullPointMode": "null", 1000 | "options": { 1001 | "dataLinks": [] 1002 | }, 1003 | "percentage": false, 1004 | "pluginVersion": "6.6.2", 1005 | "pointradius": 2, 1006 | "points": false, 1007 | "renderer": "flot", 1008 | "seriesOverrides": [], 1009 | "spaceLength": 10, 1010 | "stack": false, 1011 | "steppedLine": false, 1012 | "targets": [ 1013 | { 1014 | "expr": "jvm_threads_current {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 1015 | "format": "time_series", 1016 | "instant": false, 1017 | "legendFormat": "{{instance}}_active", 1018 | "refId": "A" 1019 | }, 1020 | { 1021 | "expr": "jvm_threads_daemon {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 1022 | "format": "time_series", 1023 | "instant": false, 1024 | "legendFormat": "{{instance}}_daemon", 1025 | "refId": "B" 1026 | }, 1027 | { 1028 | "expr": "jvm_threads_deadlocked {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 1029 | "format": "time_series", 1030 | "instant": false, 1031 | "legendFormat": "{{instance}}_deadlocked", 1032 | "refId": "C" 1033 | } 1034 | ], 1035 | "thresholds": [], 1036 | "timeFrom": null, 1037 | "timeRegions": [], 1038 | "timeShift": null, 1039 | "title": "Threads", 1040 | "tooltip": { 1041 | "shared": true, 1042 | "sort": 1, 1043 | "value_type": "individual" 1044 | }, 1045 | "type": "graph", 1046 | "xaxis": { 1047 | "buckets": null, 1048 | "mode": "time", 1049 | "name": null, 1050 | "show": true, 1051 | "values": [] 1052 | }, 1053 | "yaxes": [ 1054 | { 1055 | "decimals": null, 1056 | "format": "short", 1057 | "label": null, 1058 | "logBase": 1, 1059 | "max": null, 1060 | "min": "0", 1061 | "show": true 1062 | }, 1063 | { 1064 | "format": "short", 
1065 | "label": null, 1066 | "logBase": 1, 1067 | "max": null, 1068 | "min": null, 1069 | "show": true 1070 | } 1071 | ], 1072 | "yaxis": { 1073 | "align": false, 1074 | "alignLevel": null 1075 | } 1076 | }, 1077 | { 1078 | "aliasColors": {}, 1079 | "bars": false, 1080 | "cacheTimeout": null, 1081 | "dashLength": 10, 1082 | "dashes": false, 1083 | "datasource": "Prometheus", 1084 | "decimals": 1, 1085 | "fill": 1, 1086 | "fillGradient": 0, 1087 | "gridPos": { 1088 | "h": 6, 1089 | "w": 8, 1090 | "x": 0, 1091 | "y": 24 1092 | }, 1093 | "hiddenSeries": false, 1094 | "id": 72, 1095 | "interval": "", 1096 | "legend": { 1097 | "alignAsTable": true, 1098 | "avg": true, 1099 | "current": true, 1100 | "max": true, 1101 | "min": false, 1102 | "show": true, 1103 | "total": false, 1104 | "values": true 1105 | }, 1106 | "lines": true, 1107 | "linewidth": 1, 1108 | "links": [], 1109 | "nullPointMode": "null", 1110 | "options": { 1111 | "dataLinks": [] 1112 | }, 1113 | "percentage": false, 1114 | "pluginVersion": "6.6.2", 1115 | "pointradius": 2, 1116 | "points": false, 1117 | "renderer": "flot", 1118 | "seriesOverrides": [], 1119 | "spaceLength": 10, 1120 | "stack": false, 1121 | "steppedLine": false, 1122 | "targets": [ 1123 | { 1124 | "expr": "process_open_fds {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 1125 | "format": "time_series", 1126 | "instant": false, 1127 | "legendFormat": "{{instance}}", 1128 | "refId": "A" 1129 | } 1130 | ], 1131 | "thresholds": [], 1132 | "timeFrom": null, 1133 | "timeRegions": [], 1134 | "timeShift": null, 1135 | "title": "Open File Descriptors", 1136 | "tooltip": { 1137 | "shared": true, 1138 | "sort": 1, 1139 | "value_type": "individual" 1140 | }, 1141 | "type": "graph", 1142 | "xaxis": { 1143 | "buckets": null, 1144 | "mode": "time", 1145 | "name": null, 1146 | "show": true, 1147 | "values": [] 1148 | }, 1149 | "yaxes": [ 1150 | { 1151 | "decimals": null, 1152 | "format": "short", 1153 | "label": null, 1154 | "logBase": 1, 
1155 | "max": null, 1156 | "min": "0", 1157 | "show": true 1158 | }, 1159 | { 1160 | "format": "short", 1161 | "label": null, 1162 | "logBase": 1, 1163 | "max": null, 1164 | "min": null, 1165 | "show": true 1166 | } 1167 | ], 1168 | "yaxis": { 1169 | "align": false, 1170 | "alignLevel": null 1171 | } 1172 | } 1173 | ], 1174 | "schemaVersion": 22, 1175 | "style": "dark", 1176 | "tags": [ 1177 | "Amazon EMR", 1178 | "JVM", 1179 | "Prometheus" 1180 | ], 1181 | "templating": { 1182 | "list": [ 1183 | { 1184 | "auto": false, 1185 | "auto_count": 30, 1186 | "auto_min": "10s", 1187 | "current": { 1188 | "selected": false, 1189 | "text": "1m", 1190 | "value": "1m" 1191 | }, 1192 | "hide": 0, 1193 | "label": "Interval", 1194 | "name": "interval", 1195 | "options": [ 1196 | { 1197 | "selected": true, 1198 | "text": "1m", 1199 | "value": "1m" 1200 | }, 1201 | { 1202 | "selected": false, 1203 | "text": "5m", 1204 | "value": "5m" 1205 | }, 1206 | { 1207 | "selected": false, 1208 | "text": "10m", 1209 | "value": "10m" 1210 | }, 1211 | { 1212 | "selected": false, 1213 | "text": "30m", 1214 | "value": "30m" 1215 | }, 1216 | { 1217 | "selected": false, 1218 | "text": "1h", 1219 | "value": "1h" 1220 | }, 1221 | { 1222 | "selected": false, 1223 | "text": "6h", 1224 | "value": "6h" 1225 | }, 1226 | { 1227 | "selected": false, 1228 | "text": "12h", 1229 | "value": "12h" 1230 | }, 1231 | { 1232 | "selected": false, 1233 | "text": "1d", 1234 | "value": "1d" 1235 | }, 1236 | { 1237 | "selected": false, 1238 | "text": "7d", 1239 | "value": "7d" 1240 | }, 1241 | { 1242 | "selected": false, 1243 | "text": "14d", 1244 | "value": "14d" 1245 | }, 1246 | { 1247 | "selected": false, 1248 | "text": "30d", 1249 | "value": "30d" 1250 | } 1251 | ], 1252 | "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 1253 | "refresh": 2, 1254 | "skipUrlSync": false, 1255 | "type": "interval" 1256 | }, 1257 | { 1258 | "allValue": null, 1259 | "current": { 1260 | "text": "hadoop_hdfs_datanode", 1261 | "value": 
"hadoop_hdfs_datanode" 1262 | }, 1263 | "datasource": "Prometheus", 1264 | "definition": "label_values(java_lang_memory_heapmemoryusage_used, job)", 1265 | "hide": 0, 1266 | "includeAll": false, 1267 | "index": -1, 1268 | "label": "Job", 1269 | "multi": false, 1270 | "name": "job", 1271 | "options": [], 1272 | "query": "label_values(java_lang_memory_heapmemoryusage_used, job)", 1273 | "refresh": 1, 1274 | "regex": "", 1275 | "skipUrlSync": false, 1276 | "sort": 1, 1277 | "tagValuesQuery": "", 1278 | "tags": [], 1279 | "tagsQuery": "", 1280 | "type": "query", 1281 | "useTags": false 1282 | }, 1283 | { 1284 | "allValue": null, 1285 | "current": { 1286 | "text": "j-IL8MLGKV6TBR", 1287 | "value": "j-IL8MLGKV6TBR" 1288 | }, 1289 | "datasource": "Prometheus", 1290 | "definition": "label_values(java_lang_memory_heapmemoryusage_used{job=~\"$job\"}, cluster_id)", 1291 | "hide": 0, 1292 | "includeAll": true, 1293 | "index": -1, 1294 | "label": "Cluster", 1295 | "multi": true, 1296 | "name": "cluster", 1297 | "options": [], 1298 | "query": "label_values(java_lang_memory_heapmemoryusage_used{job=~\"$job\"}, cluster_id)", 1299 | "refresh": 1, 1300 | "regex": "", 1301 | "skipUrlSync": false, 1302 | "sort": 0, 1303 | "tagValuesQuery": "", 1304 | "tags": [], 1305 | "tagsQuery": "", 1306 | "type": "query", 1307 | "useTags": false 1308 | }, 1309 | { 1310 | "allValue": null, 1311 | "current": { 1312 | "selected": false, 1313 | "text": "All", 1314 | "value": "$__all" 1315 | }, 1316 | "datasource": "Prometheus", 1317 | "definition": "label_values(java_lang_memory_heapmemoryusage_used{job=~\"$job\", cluster_id=~'$cluster'}, instance)", 1318 | "hide": 0, 1319 | "includeAll": true, 1320 | "index": -1, 1321 | "label": "Instance", 1322 | "multi": true, 1323 | "name": "instance", 1324 | "options": [], 1325 | "query": "label_values(java_lang_memory_heapmemoryusage_used{job=~\"$job\", cluster_id=~'$cluster'}, instance)", 1326 | "refresh": 1, 1327 | "regex": "", 1328 | "skipUrlSync": false, 
1329 | "sort": 1, 1330 | "tagValuesQuery": "", 1331 | "tags": [], 1332 | "tagsQuery": "", 1333 | "type": "query", 1334 | "useTags": false 1335 | } 1336 | ] 1337 | }, 1338 | "time": { 1339 | "from": "now-1h", 1340 | "to": "now" 1341 | }, 1342 | "timepicker": { 1343 | "refresh_intervals": [ 1344 | "5s", 1345 | "10s", 1346 | "30s", 1347 | "1m", 1348 | "5m", 1349 | "15m", 1350 | "30m", 1351 | "1h", 1352 | "2h", 1353 | "1d" 1354 | ] 1355 | }, 1356 | "timezone": "", 1357 | "title": "JVM Metrics", 1358 | "uid": "dAddAPeWk", 1359 | "variables": { 1360 | "list": [] 1361 | }, 1362 | "version": 1 1363 | } -------------------------------------------------------------------------------- /grafana-dashboards/YARN+-+Node+Manager.json: -------------------------------------------------------------------------------- 1 | { 2 | "__requires": [ 3 | { 4 | "type": "grafana", 5 | "id": "grafana", 6 | "name": "Grafana", 7 | "version": "6.7.3" 8 | }, 9 | { 10 | "type": "panel", 11 | "id": "graph", 12 | "name": "Graph", 13 | "version": "" 14 | }, 15 | { 16 | "type": "datasource", 17 | "id": "prometheus", 18 | "name": "Prometheus", 19 | "version": "1.0.0" 20 | } 21 | ], 22 | "annotations": { 23 | "list": [ 24 | { 25 | "$$hashKey": "object:31751", 26 | "builtIn": 1, 27 | "datasource": "-- Grafana --", 28 | "enable": true, 29 | "hide": true, 30 | "iconColor": "rgba(0, 211, 255, 1)", 31 | "name": "Annotations & Alerts", 32 | "type": "dashboard" 33 | } 34 | ] 35 | }, 36 | "editable": true, 37 | "gnetId": null, 38 | "graphTooltip": 0, 39 | "id": null, 40 | "iteration": 1595837553048, 41 | "links": [], 42 | "panels": [ 43 | { 44 | "collapsed": false, 45 | "datasource": "Prometheus", 46 | "gridPos": { 47 | "h": 1, 48 | "w": 24, 49 | "x": 0, 50 | "y": 0 51 | }, 52 | "id": 40, 53 | "panels": [], 54 | "title": "Container Stats", 55 | "type": "row" 56 | }, 57 | { 58 | "aliasColors": {}, 59 | "bars": false, 60 | "cacheTimeout": null, 61 | "dashLength": 10, 62 | "dashes": false, 63 | "datasource": 
"Prometheus", 64 | "decimals": 1, 65 | "fill": 1, 66 | "fillGradient": 0, 67 | "gridPos": { 68 | "h": 6, 69 | "w": 12, 70 | "x": 0, 71 | "y": 1 72 | }, 73 | "hiddenSeries": false, 74 | "id": 45, 75 | "interval": "", 76 | "legend": { 77 | "alignAsTable": true, 78 | "avg": true, 79 | "current": true, 80 | "max": true, 81 | "min": false, 82 | "show": true, 83 | "total": false, 84 | "values": true 85 | }, 86 | "lines": true, 87 | "linewidth": 1, 88 | "links": [], 89 | "nullPointMode": "null", 90 | "options": { 91 | "dataLinks": [] 92 | }, 93 | "percentage": false, 94 | "pointradius": 2, 95 | "points": false, 96 | "renderer": "flot", 97 | "seriesOverrides": [], 98 | "spaceLength": 10, 99 | "stack": false, 100 | "steppedLine": false, 101 | "targets": [ 102 | { 103 | "expr": "yarn_nodemanager_metrics_containers_running {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 104 | "legendFormat": "{{instance}}", 105 | "refId": "A" 106 | } 107 | ], 108 | "thresholds": [], 109 | "timeFrom": null, 110 | "timeRegions": [], 111 | "timeShift": null, 112 | "title": "Running Containers", 113 | "tooltip": { 114 | "shared": true, 115 | "sort": 0, 116 | "value_type": "individual" 117 | }, 118 | "type": "graph", 119 | "xaxis": { 120 | "buckets": null, 121 | "mode": "time", 122 | "name": null, 123 | "show": true, 124 | "values": [] 125 | }, 126 | "yaxes": [ 127 | { 128 | "decimals": null, 129 | "format": "short", 130 | "label": "", 131 | "logBase": 1, 132 | "max": null, 133 | "min": null, 134 | "show": true 135 | }, 136 | { 137 | "format": "short", 138 | "label": null, 139 | "logBase": 1, 140 | "max": null, 141 | "min": null, 142 | "show": true 143 | } 144 | ], 145 | "yaxis": { 146 | "align": false, 147 | "alignLevel": null 148 | } 149 | }, 150 | { 151 | "aliasColors": {}, 152 | "bars": false, 153 | "cacheTimeout": null, 154 | "dashLength": 10, 155 | "dashes": false, 156 | "datasource": "Prometheus", 157 | "decimals": 1, 158 | "fill": 1, 159 | "fillGradient": 0, 160 | 
"gridPos": { 161 | "h": 6, 162 | "w": 12, 163 | "x": 12, 164 | "y": 1 165 | }, 166 | "hiddenSeries": false, 167 | "id": 46, 168 | "interval": "", 169 | "legend": { 170 | "alignAsTable": true, 171 | "avg": true, 172 | "current": true, 173 | "max": true, 174 | "min": false, 175 | "show": true, 176 | "total": false, 177 | "values": true 178 | }, 179 | "lines": true, 180 | "linewidth": 1, 181 | "links": [], 182 | "nullPointMode": "null", 183 | "options": { 184 | "dataLinks": [] 185 | }, 186 | "percentage": false, 187 | "pointradius": 2, 188 | "points": false, 189 | "renderer": "flot", 190 | "seriesOverrides": [], 191 | "spaceLength": 10, 192 | "stack": false, 193 | "steppedLine": false, 194 | "targets": [ 195 | { 196 | "expr": "yarn_nodemanager_metrics_containers_failed {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 197 | "legendFormat": "{{instance}}", 198 | "refId": "A" 199 | } 200 | ], 201 | "thresholds": [], 202 | "timeFrom": null, 203 | "timeRegions": [], 204 | "timeShift": null, 205 | "title": "Failed Containers", 206 | "tooltip": { 207 | "shared": true, 208 | "sort": 0, 209 | "value_type": "individual" 210 | }, 211 | "type": "graph", 212 | "xaxis": { 213 | "buckets": null, 214 | "mode": "time", 215 | "name": null, 216 | "show": true, 217 | "values": [] 218 | }, 219 | "yaxes": [ 220 | { 221 | "decimals": null, 222 | "format": "short", 223 | "label": "", 224 | "logBase": 1, 225 | "max": null, 226 | "min": null, 227 | "show": true 228 | }, 229 | { 230 | "format": "short", 231 | "label": null, 232 | "logBase": 1, 233 | "max": null, 234 | "min": null, 235 | "show": true 236 | } 237 | ], 238 | "yaxis": { 239 | "align": false, 240 | "alignLevel": null 241 | } 242 | }, 243 | { 244 | "aliasColors": {}, 245 | "bars": false, 246 | "cacheTimeout": null, 247 | "dashLength": 10, 248 | "dashes": false, 249 | "datasource": "Prometheus", 250 | "decimals": 1, 251 | "fill": 1, 252 | "fillGradient": 0, 253 | "gridPos": { 254 | "h": 6, 255 | "w": 12, 256 | "x": 0, 
257 | "y": 7 258 | }, 259 | "hiddenSeries": false, 260 | "id": 49, 261 | "interval": "", 262 | "legend": { 263 | "alignAsTable": true, 264 | "avg": true, 265 | "current": true, 266 | "max": true, 267 | "min": false, 268 | "show": true, 269 | "total": false, 270 | "values": true 271 | }, 272 | "lines": true, 273 | "linewidth": 1, 274 | "links": [], 275 | "nullPointMode": "null", 276 | "options": { 277 | "dataLinks": [] 278 | }, 279 | "percentage": false, 280 | "pointradius": 2, 281 | "points": false, 282 | "renderer": "flot", 283 | "seriesOverrides": [], 284 | "spaceLength": 10, 285 | "stack": false, 286 | "steppedLine": false, 287 | "targets": [ 288 | { 289 | "expr": "yarn_nodemanager_metrics_containers_completed {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 290 | "legendFormat": "{{instance}}", 291 | "refId": "A" 292 | } 293 | ], 294 | "thresholds": [], 295 | "timeFrom": null, 296 | "timeRegions": [], 297 | "timeShift": null, 298 | "title": "Completed Containers", 299 | "tooltip": { 300 | "shared": true, 301 | "sort": 0, 302 | "value_type": "individual" 303 | }, 304 | "type": "graph", 305 | "xaxis": { 306 | "buckets": null, 307 | "mode": "time", 308 | "name": null, 309 | "show": true, 310 | "values": [] 311 | }, 312 | "yaxes": [ 313 | { 314 | "decimals": null, 315 | "format": "short", 316 | "label": "", 317 | "logBase": 1, 318 | "max": null, 319 | "min": null, 320 | "show": true 321 | }, 322 | { 323 | "format": "short", 324 | "label": null, 325 | "logBase": 1, 326 | "max": null, 327 | "min": null, 328 | "show": true 329 | } 330 | ], 331 | "yaxis": { 332 | "align": false, 333 | "alignLevel": null 334 | } 335 | }, 336 | { 337 | "aliasColors": {}, 338 | "bars": false, 339 | "cacheTimeout": null, 340 | "dashLength": 10, 341 | "dashes": false, 342 | "datasource": "Prometheus", 343 | "decimals": 1, 344 | "fill": 1, 345 | "fillGradient": 0, 346 | "gridPos": { 347 | "h": 6, 348 | "w": 12, 349 | "x": 12, 350 | "y": 7 351 | }, 352 | "hiddenSeries": false, 
353 | "id": 73, 354 | "interval": "", 355 | "legend": { 356 | "alignAsTable": true, 357 | "avg": true, 358 | "current": true, 359 | "max": true, 360 | "min": false, 361 | "show": true, 362 | "total": false, 363 | "values": true 364 | }, 365 | "lines": true, 366 | "linewidth": 1, 367 | "links": [], 368 | "nullPointMode": "null", 369 | "options": { 370 | "dataLinks": [] 371 | }, 372 | "percentage": false, 373 | "pointradius": 2, 374 | "points": false, 375 | "renderer": "flot", 376 | "seriesOverrides": [], 377 | "spaceLength": 10, 378 | "stack": false, 379 | "steppedLine": false, 380 | "targets": [ 381 | { 382 | "expr": "yarn_nodemanager_metrics_containers_killed {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 383 | "legendFormat": "{{instance}}", 384 | "refId": "A" 385 | } 386 | ], 387 | "thresholds": [], 388 | "timeFrom": null, 389 | "timeRegions": [], 390 | "timeShift": null, 391 | "title": "Killed Containers", 392 | "tooltip": { 393 | "shared": true, 394 | "sort": 0, 395 | "value_type": "individual" 396 | }, 397 | "type": "graph", 398 | "xaxis": { 399 | "buckets": null, 400 | "mode": "time", 401 | "name": null, 402 | "show": true, 403 | "values": [] 404 | }, 405 | "yaxes": [ 406 | { 407 | "decimals": null, 408 | "format": "short", 409 | "label": "", 410 | "logBase": 1, 411 | "max": null, 412 | "min": null, 413 | "show": true 414 | }, 415 | { 416 | "format": "short", 417 | "label": null, 418 | "logBase": 1, 419 | "max": null, 420 | "min": null, 421 | "show": true 422 | } 423 | ], 424 | "yaxis": { 425 | "align": false, 426 | "alignLevel": null 427 | } 428 | }, 429 | { 430 | "aliasColors": {}, 431 | "bars": false, 432 | "cacheTimeout": null, 433 | "dashLength": 10, 434 | "dashes": false, 435 | "datasource": "Prometheus", 436 | "decimals": 1, 437 | "fill": 1, 438 | "fillGradient": 0, 439 | "gridPos": { 440 | "h": 6, 441 | "w": 12, 442 | "x": 0, 443 | "y": 13 444 | }, 445 | "hiddenSeries": false, 446 | "id": 76, 447 | "interval": "", 448 | "legend": { 
449 | "alignAsTable": true, 450 | "avg": true, 451 | "current": true, 452 | "max": true, 453 | "min": false, 454 | "show": true, 455 | "total": false, 456 | "values": true 457 | }, 458 | "lines": true, 459 | "linewidth": 1, 460 | "links": [], 461 | "nullPointMode": "null", 462 | "options": { 463 | "dataLinks": [] 464 | }, 465 | "percentage": false, 466 | "pointradius": 2, 467 | "points": false, 468 | "renderer": "flot", 469 | "seriesOverrides": [], 470 | "spaceLength": 10, 471 | "stack": false, 472 | "steppedLine": false, 473 | "targets": [ 474 | { 475 | "expr": "yarn_nodemanager_metrics_containers_initing {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 476 | "legendFormat": "{{instance}}", 477 | "refId": "A" 478 | } 479 | ], 480 | "thresholds": [], 481 | "timeFrom": null, 482 | "timeRegions": [], 483 | "timeShift": null, 484 | "title": "Initing Containers", 485 | "tooltip": { 486 | "shared": true, 487 | "sort": 0, 488 | "value_type": "individual" 489 | }, 490 | "type": "graph", 491 | "xaxis": { 492 | "buckets": null, 493 | "mode": "time", 494 | "name": null, 495 | "show": true, 496 | "values": [] 497 | }, 498 | "yaxes": [ 499 | { 500 | "decimals": null, 501 | "format": "short", 502 | "label": "", 503 | "logBase": 1, 504 | "max": null, 505 | "min": null, 506 | "show": true 507 | }, 508 | { 509 | "format": "short", 510 | "label": null, 511 | "logBase": 1, 512 | "max": null, 513 | "min": null, 514 | "show": true 515 | } 516 | ], 517 | "yaxis": { 518 | "align": false, 519 | "alignLevel": null 520 | } 521 | }, 522 | { 523 | "aliasColors": {}, 524 | "bars": false, 525 | "cacheTimeout": null, 526 | "dashLength": 10, 527 | "dashes": false, 528 | "datasource": "Prometheus", 529 | "decimals": 1, 530 | "fill": 1, 531 | "fillGradient": 0, 532 | "gridPos": { 533 | "h": 6, 534 | "w": 12, 535 | "x": 12, 536 | "y": 13 537 | }, 538 | "hiddenSeries": false, 539 | "id": 77, 540 | "interval": "", 541 | "legend": { 542 | "alignAsTable": true, 543 | "avg": true, 544 | 
"current": true, 545 | "max": true, 546 | "min": false, 547 | "show": true, 548 | "total": false, 549 | "values": true 550 | }, 551 | "lines": true, 552 | "linewidth": 1, 553 | "links": [], 554 | "nullPointMode": "null", 555 | "options": { 556 | "dataLinks": [] 557 | }, 558 | "percentage": false, 559 | "pointradius": 2, 560 | "points": false, 561 | "renderer": "flot", 562 | "seriesOverrides": [], 563 | "spaceLength": 10, 564 | "stack": false, 565 | "steppedLine": false, 566 | "targets": [ 567 | { 568 | "expr": "yarn_nodemanager_metrics_containers_launched {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 569 | "legendFormat": "{{instance}}", 570 | "refId": "A" 571 | } 572 | ], 573 | "thresholds": [], 574 | "timeFrom": null, 575 | "timeRegions": [], 576 | "timeShift": null, 577 | "title": "Launched Containers", 578 | "tooltip": { 579 | "shared": true, 580 | "sort": 0, 581 | "value_type": "individual" 582 | }, 583 | "type": "graph", 584 | "xaxis": { 585 | "buckets": null, 586 | "mode": "time", 587 | "name": null, 588 | "show": true, 589 | "values": [] 590 | }, 591 | "yaxes": [ 592 | { 593 | "decimals": null, 594 | "format": "short", 595 | "label": "", 596 | "logBase": 1, 597 | "max": null, 598 | "min": null, 599 | "show": true 600 | }, 601 | { 602 | "format": "short", 603 | "label": null, 604 | "logBase": 1, 605 | "max": null, 606 | "min": null, 607 | "show": true 608 | } 609 | ], 610 | "yaxis": { 611 | "align": false, 612 | "alignLevel": null 613 | } 614 | }, 615 | { 616 | "aliasColors": {}, 617 | "bars": false, 618 | "cacheTimeout": null, 619 | "dashLength": 10, 620 | "dashes": false, 621 | "datasource": "Prometheus", 622 | "decimals": 1, 623 | "fill": 1, 624 | "fillGradient": 0, 625 | "gridPos": { 626 | "h": 6, 627 | "w": 12, 628 | "x": 0, 629 | "y": 19 630 | }, 631 | "hiddenSeries": false, 632 | "id": 78, 633 | "interval": "", 634 | "legend": { 635 | "alignAsTable": true, 636 | "avg": true, 637 | "current": true, 638 | "max": true, 639 | "min": 
false, 640 | "show": true, 641 | "total": false, 642 | "values": true 643 | }, 644 | "lines": true, 645 | "linewidth": 1, 646 | "links": [], 647 | "nullPointMode": "null", 648 | "options": { 649 | "dataLinks": [] 650 | }, 651 | "percentage": false, 652 | "pointradius": 2, 653 | "points": false, 654 | "renderer": "flot", 655 | "seriesOverrides": [], 656 | "spaceLength": 10, 657 | "stack": false, 658 | "steppedLine": false, 659 | "targets": [ 660 | { 661 | "expr": "yarn_nodemanager_metrics_containers_re_initing {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 662 | "legendFormat": "{{instance}}", 663 | "refId": "A" 664 | } 665 | ], 666 | "thresholds": [], 667 | "timeFrom": null, 668 | "timeRegions": [], 669 | "timeShift": null, 670 | "title": "Re-initing Containers", 671 | "tooltip": { 672 | "shared": true, 673 | "sort": 0, 674 | "value_type": "individual" 675 | }, 676 | "type": "graph", 677 | "xaxis": { 678 | "buckets": null, 679 | "mode": "time", 680 | "name": null, 681 | "show": true, 682 | "values": [] 683 | }, 684 | "yaxes": [ 685 | { 686 | "decimals": null, 687 | "format": "short", 688 | "label": "", 689 | "logBase": 1, 690 | "max": null, 691 | "min": null, 692 | "show": true 693 | }, 694 | { 695 | "format": "short", 696 | "label": null, 697 | "logBase": 1, 698 | "max": null, 699 | "min": null, 700 | "show": true 701 | } 702 | ], 703 | "yaxis": { 704 | "align": false, 705 | "alignLevel": null 706 | } 707 | }, 708 | { 709 | "aliasColors": {}, 710 | "bars": false, 711 | "cacheTimeout": null, 712 | "dashLength": 10, 713 | "dashes": false, 714 | "datasource": "Prometheus", 715 | "decimals": 1, 716 | "fill": 1, 717 | "fillGradient": 0, 718 | "gridPos": { 719 | "h": 6, 720 | "w": 12, 721 | "x": 12, 722 | "y": 19 723 | }, 724 | "hiddenSeries": false, 725 | "id": 75, 726 | "interval": "", 727 | "legend": { 728 | "alignAsTable": true, 729 | "avg": true, 730 | "current": true, 731 | "max": true, 732 | "min": false, 733 | "show": true, 734 | "total": false, 
735 | "values": true 736 | }, 737 | "lines": true, 738 | "linewidth": 1, 739 | "links": [], 740 | "nullPointMode": "null", 741 | "options": { 742 | "dataLinks": [] 743 | }, 744 | "percentage": false, 745 | "pointradius": 2, 746 | "points": false, 747 | "renderer": "flot", 748 | "seriesOverrides": [], 749 | "spaceLength": 10, 750 | "stack": false, 751 | "steppedLine": false, 752 | "targets": [ 753 | { 754 | "expr": "yarn_nodemanager_metrics_container_launch_duration_avg_time {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 755 | "legendFormat": "{{instance}}", 756 | "refId": "A" 757 | } 758 | ], 759 | "thresholds": [], 760 | "timeFrom": null, 761 | "timeRegions": [], 762 | "timeShift": null, 763 | "title": "Container Launch Delay", 764 | "tooltip": { 765 | "shared": true, 766 | "sort": 0, 767 | "value_type": "individual" 768 | }, 769 | "type": "graph", 770 | "xaxis": { 771 | "buckets": null, 772 | "mode": "time", 773 | "name": null, 774 | "show": true, 775 | "values": [] 776 | }, 777 | "yaxes": [ 778 | { 779 | "decimals": null, 780 | "format": "ms", 781 | "label": "", 782 | "logBase": 1, 783 | "max": null, 784 | "min": null, 785 | "show": true 786 | }, 787 | { 788 | "format": "short", 789 | "label": null, 790 | "logBase": 1, 791 | "max": null, 792 | "min": null, 793 | "show": true 794 | } 795 | ], 796 | "yaxis": { 797 | "align": false, 798 | "alignLevel": null 799 | } 800 | }, 801 | { 802 | "collapsed": true, 803 | "datasource": "Prometheus", 804 | "gridPos": { 805 | "h": 1, 806 | "w": 24, 807 | "x": 0, 808 | "y": 25 809 | }, 810 | "id": 32, 811 | "panels": [ 812 | { 813 | "aliasColors": {}, 814 | "bars": false, 815 | "cacheTimeout": null, 816 | "dashLength": 10, 817 | "dashes": false, 818 | "datasource": "Prometheus", 819 | "decimals": 1, 820 | "fill": 1, 821 | "fillGradient": 0, 822 | "gridPos": { 823 | "h": 6, 824 | "w": 12, 825 | "x": 0, 826 | "y": 2 827 | }, 828 | "hiddenSeries": false, 829 | "id": 35, 830 | "interval": "", 831 | "legend": { 832 
| "alignAsTable": true, 833 | "avg": true, 834 | "current": true, 835 | "max": true, 836 | "min": false, 837 | "show": true, 838 | "total": false, 839 | "values": true 840 | }, 841 | "lines": true, 842 | "linewidth": 1, 843 | "links": [], 844 | "nullPointMode": "null", 845 | "options": { 846 | "dataLinks": [] 847 | }, 848 | "percentage": false, 849 | "pointradius": 2, 850 | "points": false, 851 | "renderer": "flot", 852 | "seriesOverrides": [], 853 | "spaceLength": 10, 854 | "stack": false, 855 | "steppedLine": false, 856 | "targets": [ 857 | { 858 | "expr": "yarn_nodemanager_metrics_available_vcores {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 859 | "legendFormat": "{{instance}}", 860 | "refId": "A" 861 | } 862 | ], 863 | "thresholds": [], 864 | "timeFrom": null, 865 | "timeRegions": [], 866 | "timeShift": null, 867 | "title": "Available VCores", 868 | "tooltip": { 869 | "shared": true, 870 | "sort": 0, 871 | "value_type": "individual" 872 | }, 873 | "type": "graph", 874 | "xaxis": { 875 | "buckets": null, 876 | "mode": "time", 877 | "name": null, 878 | "show": true, 879 | "values": [] 880 | }, 881 | "yaxes": [ 882 | { 883 | "decimals": null, 884 | "format": "short", 885 | "label": "", 886 | "logBase": 1, 887 | "max": null, 888 | "min": null, 889 | "show": true 890 | }, 891 | { 892 | "format": "short", 893 | "label": null, 894 | "logBase": 1, 895 | "max": null, 896 | "min": null, 897 | "show": true 898 | } 899 | ], 900 | "yaxis": { 901 | "align": false, 902 | "alignLevel": null 903 | } 904 | }, 905 | { 906 | "aliasColors": {}, 907 | "bars": false, 908 | "cacheTimeout": null, 909 | "dashLength": 10, 910 | "dashes": false, 911 | "datasource": "Prometheus", 912 | "decimals": 1, 913 | "fill": 1, 914 | "fillGradient": 0, 915 | "gridPos": { 916 | "h": 6, 917 | "w": 12, 918 | "x": 12, 919 | "y": 2 920 | }, 921 | "hiddenSeries": false, 922 | "id": 36, 923 | "interval": "", 924 | "legend": { 925 | "alignAsTable": true, 926 | "avg": true, 927 | "current": 
true, 928 | "max": true, 929 | "min": false, 930 | "show": true, 931 | "total": false, 932 | "values": true 933 | }, 934 | "lines": true, 935 | "linewidth": 1, 936 | "links": [], 937 | "nullPointMode": "null", 938 | "options": { 939 | "dataLinks": [] 940 | }, 941 | "percentage": false, 942 | "pointradius": 2, 943 | "points": false, 944 | "renderer": "flot", 945 | "seriesOverrides": [], 946 | "spaceLength": 10, 947 | "stack": false, 948 | "steppedLine": false, 949 | "targets": [ 950 | { 951 | "expr": "yarn_nodemanager_metrics_allocated_vcores {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 952 | "legendFormat": "{{instance}}", 953 | "refId": "A" 954 | } 955 | ], 956 | "thresholds": [], 957 | "timeFrom": null, 958 | "timeRegions": [], 959 | "timeShift": null, 960 | "title": "Allocated VCores", 961 | "tooltip": { 962 | "shared": true, 963 | "sort": 0, 964 | "value_type": "individual" 965 | }, 966 | "type": "graph", 967 | "xaxis": { 968 | "buckets": null, 969 | "mode": "time", 970 | "name": null, 971 | "show": true, 972 | "values": [] 973 | }, 974 | "yaxes": [ 975 | { 976 | "decimals": null, 977 | "format": "short", 978 | "label": "", 979 | "logBase": 1, 980 | "max": null, 981 | "min": null, 982 | "show": true 983 | }, 984 | { 985 | "format": "short", 986 | "label": null, 987 | "logBase": 1, 988 | "max": null, 989 | "min": null, 990 | "show": true 991 | } 992 | ], 993 | "yaxis": { 994 | "align": false, 995 | "alignLevel": null 996 | } 997 | } 998 | ], 999 | "title": "VCores Stats", 1000 | "type": "row" 1001 | }, 1002 | { 1003 | "collapsed": true, 1004 | "datasource": "Prometheus", 1005 | "gridPos": { 1006 | "h": 1, 1007 | "w": 24, 1008 | "x": 0, 1009 | "y": 26 1010 | }, 1011 | "id": 17, 1012 | "panels": [ 1013 | { 1014 | "aliasColors": {}, 1015 | "bars": false, 1016 | "cacheTimeout": null, 1017 | "dashLength": 10, 1018 | "dashes": false, 1019 | "datasource": "Prometheus", 1020 | "decimals": 1, 1021 | "fill": 1, 1022 | "fillGradient": 0, 1023 | 
"gridPos": { 1024 | "h": 6, 1025 | "w": 8, 1026 | "x": 0, 1027 | "y": 3 1028 | }, 1029 | "hiddenSeries": false, 1030 | "id": 19, 1031 | "interval": "", 1032 | "legend": { 1033 | "alignAsTable": true, 1034 | "avg": true, 1035 | "current": true, 1036 | "max": true, 1037 | "min": false, 1038 | "show": true, 1039 | "total": false, 1040 | "values": true 1041 | }, 1042 | "lines": true, 1043 | "linewidth": 1, 1044 | "links": [], 1045 | "nullPointMode": "null", 1046 | "options": { 1047 | "dataLinks": [] 1048 | }, 1049 | "percentage": false, 1050 | "pointradius": 2, 1051 | "points": false, 1052 | "renderer": "flot", 1053 | "seriesOverrides": [], 1054 | "spaceLength": 10, 1055 | "stack": false, 1056 | "steppedLine": false, 1057 | "targets": [ 1058 | { 1059 | "expr": "yarn_nodemanager_metrics_available_gb {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 1060 | "legendFormat": "{{instance}}", 1061 | "refId": "A" 1062 | } 1063 | ], 1064 | "thresholds": [], 1065 | "timeFrom": null, 1066 | "timeRegions": [], 1067 | "timeShift": null, 1068 | "title": "Available Memory", 1069 | "tooltip": { 1070 | "shared": true, 1071 | "sort": 0, 1072 | "value_type": "individual" 1073 | }, 1074 | "type": "graph", 1075 | "xaxis": { 1076 | "buckets": null, 1077 | "mode": "time", 1078 | "name": null, 1079 | "show": true, 1080 | "values": [] 1081 | }, 1082 | "yaxes": [ 1083 | { 1084 | "decimals": null, 1085 | "format": "decgbytes", 1086 | "label": "", 1087 | "logBase": 1, 1088 | "max": null, 1089 | "min": null, 1090 | "show": true 1091 | }, 1092 | { 1093 | "format": "short", 1094 | "label": null, 1095 | "logBase": 1, 1096 | "max": null, 1097 | "min": null, 1098 | "show": true 1099 | } 1100 | ], 1101 | "yaxis": { 1102 | "align": false, 1103 | "alignLevel": null 1104 | } 1105 | }, 1106 | { 1107 | "aliasColors": {}, 1108 | "bars": false, 1109 | "cacheTimeout": null, 1110 | "dashLength": 10, 1111 | "dashes": false, 1112 | "datasource": "Prometheus", 1113 | "decimals": 1, 1114 | "fill": 1, 
1115 | "fillGradient": 0, 1116 | "gridPos": { 1117 | "h": 6, 1118 | "w": 8, 1119 | "x": 8, 1120 | "y": 3 1121 | }, 1122 | "hiddenSeries": false, 1123 | "id": 74, 1124 | "interval": "", 1125 | "legend": { 1126 | "alignAsTable": true, 1127 | "avg": true, 1128 | "current": true, 1129 | "max": true, 1130 | "min": false, 1131 | "show": true, 1132 | "total": false, 1133 | "values": true 1134 | }, 1135 | "lines": true, 1136 | "linewidth": 1, 1137 | "links": [], 1138 | "nullPointMode": "null", 1139 | "options": { 1140 | "dataLinks": [] 1141 | }, 1142 | "percentage": false, 1143 | "pointradius": 2, 1144 | "points": false, 1145 | "renderer": "flot", 1146 | "seriesOverrides": [], 1147 | "spaceLength": 10, 1148 | "stack": false, 1149 | "steppedLine": false, 1150 | "targets": [ 1151 | { 1152 | "expr": "yarn_nodemanager_metrics_allocated_gb {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}", 1153 | "legendFormat": "{{instance}}", 1154 | "refId": "A" 1155 | } 1156 | ], 1157 | "thresholds": [], 1158 | "timeFrom": null, 1159 | "timeRegions": [], 1160 | "timeShift": null, 1161 | "title": "Used Memory", 1162 | "tooltip": { 1163 | "shared": true, 1164 | "sort": 0, 1165 | "value_type": "individual" 1166 | }, 1167 | "type": "graph", 1168 | "xaxis": { 1169 | "buckets": null, 1170 | "mode": "time", 1171 | "name": null, 1172 | "show": true, 1173 | "values": [] 1174 | }, 1175 | "yaxes": [ 1176 | { 1177 | "decimals": null, 1178 | "format": "decgbytes", 1179 | "label": "", 1180 | "logBase": 1, 1181 | "max": null, 1182 | "min": null, 1183 | "show": true 1184 | }, 1185 | { 1186 | "format": "short", 1187 | "label": null, 1188 | "logBase": 1, 1189 | "max": null, 1190 | "min": null, 1191 | "show": true 1192 | } 1193 | ], 1194 | "yaxis": { 1195 | "align": false, 1196 | "alignLevel": null 1197 | } 1198 | }, 1199 | { 1200 | "aliasColors": {}, 1201 | "bars": false, 1202 | "cacheTimeout": null, 1203 | "dashLength": 10, 1204 | "dashes": false, 1205 | "datasource": "Prometheus", 1206 | 
"decimals": 1, 1207 | "fill": 1, 1208 | "fillGradient": 0, 1209 | "gridPos": { 1210 | "h": 6, 1211 | "w": 8, 1212 | "x": 16, 1213 | "y": 3 1214 | }, 1215 | "hiddenSeries": false, 1216 | "id": 20, 1217 | "interval": "", 1218 | "legend": { 1219 | "alignAsTable": true, 1220 | "avg": true, 1221 | "current": true, 1222 | "max": true, 1223 | "min": false, 1224 | "show": true, 1225 | "total": false, 1226 | "values": true 1227 | }, 1228 | "lines": true, 1229 | "linewidth": 1, 1230 | "links": [], 1231 | "nullPointMode": "null", 1232 | "options": { 1233 | "dataLinks": [] 1234 | }, 1235 | "percentage": false, 1236 | "pointradius": 2, 1237 | "points": false, 1238 | "renderer": "flot", 1239 | "seriesOverrides": [], 1240 | "spaceLength": 10, 1241 | "stack": false, 1242 | "steppedLine": false, 1243 | "targets": [ 1244 | { 1245 | "expr": "yarn_nodemanager_metrics_allocated_gb {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'}/(yarn_nodemanager_metrics_allocated_gb {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'} + yarn_nodemanager_metrics_available_gb {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'})", 1246 | "legendFormat": "{{instance}}", 1247 | "refId": "A" 1248 | } 1249 | ], 1250 | "thresholds": [], 1251 | "timeFrom": null, 1252 | "timeRegions": [], 1253 | "timeShift": null, 1254 | "title": "Memory Utilization", 1255 | "tooltip": { 1256 | "shared": true, 1257 | "sort": 0, 1258 | "value_type": "individual" 1259 | }, 1260 | "type": "graph", 1261 | "xaxis": { 1262 | "buckets": null, 1263 | "mode": "time", 1264 | "name": null, 1265 | "show": true, 1266 | "values": [] 1267 | }, 1268 | "yaxes": [ 1269 | { 1270 | "decimals": null, 1271 | "format": "percentunit", 1272 | "label": "", 1273 | "logBase": 1, 1274 | "max": null, 1275 | "min": null, 1276 | "show": true 1277 | }, 1278 | { 1279 | "format": "short", 1280 | "label": null, 1281 | "logBase": 1, 1282 | "max": null, 1283 | "min": null, 1284 | "show": true 1285 | } 1286 | ], 1287 | "yaxis": { 1288 
| "align": false, 1289 | "alignLevel": null 1290 | } 1291 | } 1292 | ], 1293 | "title": "Memory Stats", 1294 | "type": "row" 1295 | } 1296 | ], 1297 | "schemaVersion": 22, 1298 | "style": "dark", 1299 | "tags": [ 1300 | "Amazon EMR", 1301 | "YARN", 1302 | "Prometheus" 1303 | ], 1304 | "templating": { 1305 | "list": [ 1306 | { 1307 | "auto": false, 1308 | "auto_count": 30, 1309 | "auto_min": "10s", 1310 | "current": { 1311 | "selected": false, 1312 | "text": "1m", 1313 | "value": "1m" 1314 | }, 1315 | "hide": 0, 1316 | "label": "Interval", 1317 | "name": "interval", 1318 | "options": [ 1319 | { 1320 | "selected": true, 1321 | "text": "1m", 1322 | "value": "1m" 1323 | }, 1324 | { 1325 | "selected": false, 1326 | "text": "5m", 1327 | "value": "5m" 1328 | }, 1329 | { 1330 | "selected": false, 1331 | "text": "10m", 1332 | "value": "10m" 1333 | }, 1334 | { 1335 | "selected": false, 1336 | "text": "30m", 1337 | "value": "30m" 1338 | }, 1339 | { 1340 | "selected": false, 1341 | "text": "1h", 1342 | "value": "1h" 1343 | }, 1344 | { 1345 | "selected": false, 1346 | "text": "6h", 1347 | "value": "6h" 1348 | }, 1349 | { 1350 | "selected": false, 1351 | "text": "12h", 1352 | "value": "12h" 1353 | }, 1354 | { 1355 | "selected": false, 1356 | "text": "1d", 1357 | "value": "1d" 1358 | }, 1359 | { 1360 | "selected": false, 1361 | "text": "7d", 1362 | "value": "7d" 1363 | }, 1364 | { 1365 | "selected": false, 1366 | "text": "14d", 1367 | "value": "14d" 1368 | }, 1369 | { 1370 | "selected": false, 1371 | "text": "30d", 1372 | "value": "30d" 1373 | } 1374 | ], 1375 | "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 1376 | "refresh": 2, 1377 | "skipUrlSync": false, 1378 | "type": "interval" 1379 | }, 1380 | { 1381 | "allValue": null, 1382 | "current": {}, 1383 | "datasource": "Prometheus", 1384 | "definition": "label_values(yarn_nodemanager_metrics_available_vcores, job)", 1385 | "hide": 0, 1386 | "includeAll": false, 1387 | "index": -1, 1388 | "label": "Job", 1389 | "multi": false, 
1390 | "name": "job", 1391 | "options": [], 1392 | "query": "label_values(yarn_nodemanager_metrics_available_vcores, job)", 1393 | "refresh": 1, 1394 | "regex": "", 1395 | "skipUrlSync": false, 1396 | "sort": 1, 1397 | "tagValuesQuery": "", 1398 | "tags": [], 1399 | "tagsQuery": "", 1400 | "type": "query", 1401 | "useTags": false 1402 | }, 1403 | { 1404 | "allValue": null, 1405 | "current": {}, 1406 | "datasource": "Prometheus", 1407 | "definition": "label_values(yarn_nodemanager_metrics_available_vcores{job=~\"$job\"}, cluster_id)", 1408 | "hide": 0, 1409 | "includeAll": true, 1410 | "index": -1, 1411 | "label": "Cluster", 1412 | "multi": true, 1413 | "name": "cluster", 1414 | "options": [], 1415 | "query": "label_values(yarn_nodemanager_metrics_available_vcores{job=~\"$job\"}, cluster_id)", 1416 | "refresh": 1, 1417 | "regex": "", 1418 | "skipUrlSync": false, 1419 | "sort": 0, 1420 | "tagValuesQuery": "", 1421 | "tags": [], 1422 | "tagsQuery": "", 1423 | "type": "query", 1424 | "useTags": false 1425 | }, 1426 | { 1427 | "allValue": null, 1428 | "current": {}, 1429 | "datasource": "Prometheus", 1430 | "definition": "label_values(yarn_nodemanager_metrics_available_vcores{job=~\"$job\", cluster_id=~'$cluster'}, instance)", 1431 | "hide": 0, 1432 | "includeAll": true, 1433 | "index": -1, 1434 | "label": "Instance", 1435 | "multi": true, 1436 | "name": "instance", 1437 | "options": [], 1438 | "query": "label_values(yarn_nodemanager_metrics_available_vcores{job=~\"$job\", cluster_id=~'$cluster'}, instance)", 1439 | "refresh": 1, 1440 | "regex": "", 1441 | "skipUrlSync": false, 1442 | "sort": 0, 1443 | "tagValuesQuery": "", 1444 | "tags": [], 1445 | "tagsQuery": "", 1446 | "type": "query", 1447 | "useTags": false 1448 | } 1449 | ] 1450 | }, 1451 | "time": { 1452 | "from": "now-1h", 1453 | "to": "now" 1454 | }, 1455 | "timepicker": { 1456 | "refresh_intervals": [ 1457 | "5s", 1458 | "10s", 1459 | "30s", 1460 | "1m", 1461 | "5m", 1462 | "15m", 1463 | "30m", 1464 | "1h", 
1465 | "2h", 1466 | "1d" 1467 | ] 1468 | }, 1469 | "timezone": "", 1470 | "title": "YARN - Node Manager", 1471 | "uid": "CRj-lw6Zk", 1472 | "variables": { 1473 | "list": [] 1474 | }, 1475 | "version": 1 1476 | } -------------------------------------------------------------------------------- /grafana-dashboards/HDFS+-+DataNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:26082", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 0, 19 | "iteration": 1595823333619, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "collapsed": false, 24 | "datasource": "Prometheus", 25 | "gridPos": { 26 | "h": 1, 27 | "w": 24, 28 | "x": 0, 29 | "y": 0 30 | }, 31 | "id": 4, 32 | "panels": [], 33 | "title": "Overview", 34 | "type": "row" 35 | }, 36 | { 37 | "cacheTimeout": null, 38 | "colorBackground": false, 39 | "colorValue": true, 40 | "colors": [ 41 | "#d44a3a", 42 | "rgba(237, 129, 40, 0.89)", 43 | "#299c46" 44 | ], 45 | "datasource": "Prometheus", 46 | "decimals": 0, 47 | "format": "short", 48 | "gauge": { 49 | "maxValue": 100, 50 | "minValue": 0, 51 | "show": false, 52 | "thresholdLabels": false, 53 | "thresholdMarkers": true 54 | }, 55 | "gridPos": { 56 | "h": 3, 57 | "w": 2, 58 | "x": 0, 59 | "y": 1 60 | }, 61 | "id": 2, 62 | "interval": null, 63 | "links": [], 64 | "mappingType": 1, 65 | "mappingTypes": [ 66 | { 67 | "name": "value to text", 68 | "value": 1 69 | }, 70 | { 71 | "name": "range to text", 72 | "value": 2 73 | } 74 | ], 75 | "maxDataPoints": 100, 76 | "nullPointMode": "connected", 77 | "nullText": null, 78 | "postfix": "", 79 | "postfixFontSize": "50%", 80 | "prefix": "", 81 | "prefixFontSize": "50%", 82 | "rangeMaps": [ 83 | { 
84 | "from": "null", 85 | "text": "N/A", 86 | "to": "null" 87 | } 88 | ], 89 | "sparkline": { 90 | "fillColor": "rgba(31, 118, 189, 0.18)", 91 | "full": false, 92 | "lineColor": "rgb(31, 120, 193)", 93 | "show": false, 94 | "ymax": null, 95 | "ymin": null 96 | }, 97 | "tableColumn": "", 98 | "targets": [ 99 | { 100 | "expr": "count(up {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'})", 101 | "refId": "A" 102 | } 103 | ], 104 | "thresholds": "0,0", 105 | "timeFrom": null, 106 | "timeShift": null, 107 | "title": "Nodes Up", 108 | "type": "singlestat", 109 | "valueFontSize": "80%", 110 | "valueMaps": [ 111 | { 112 | "op": "=", 113 | "text": "N/A", 114 | "value": "null" 115 | } 116 | ], 117 | "valueName": "current" 118 | }, 119 | { 120 | "cacheTimeout": null, 121 | "colorBackground": false, 122 | "colorValue": true, 123 | "colors": [ 124 | "#d44a3a", 125 | "rgba(237, 129, 40, 0.89)", 126 | "#299c46" 127 | ], 128 | "datasource": "Prometheus", 129 | "format": "decbytes", 130 | "gauge": { 131 | "maxValue": 100, 132 | "minValue": 0, 133 | "show": false, 134 | "thresholdLabels": false, 135 | "thresholdMarkers": true 136 | }, 137 | "gridPos": { 138 | "h": 3, 139 | "w": 2, 140 | "x": 2, 141 | "y": 1 142 | }, 143 | "id": 5, 144 | "interval": null, 145 | "links": [], 146 | "mappingType": 1, 147 | "mappingTypes": [ 148 | { 149 | "name": "value to text", 150 | "value": 1 151 | }, 152 | { 153 | "name": "range to text", 154 | "value": 2 155 | } 156 | ], 157 | "maxDataPoints": 100, 158 | "nullPointMode": "connected", 159 | "nullText": null, 160 | "postfix": "", 161 | "postfixFontSize": "50%", 162 | "prefix": "", 163 | "prefixFontSize": "50%", 164 | "rangeMaps": [ 165 | { 166 | "from": "null", 167 | "text": "N/A", 168 | "to": "null" 169 | } 170 | ], 171 | "sparkline": { 172 | "fillColor": "rgba(31, 118, 189, 0.18)", 173 | "full": false, 174 | "lineColor": "rgb(31, 120, 193)", 175 | "show": false, 176 | "ymax": null, 177 | "ymin": null 178 | }, 179 | "tableColumn": "", 
180 | "targets": [ 181 | { 182 | "expr": "sum(hdfs_datanode_capacity {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\",fsdatasetid=''})", 183 | "legendFormat": "", 184 | "refId": "A" 185 | } 186 | ], 187 | "thresholds": "0,0", 188 | "timeFrom": null, 189 | "timeShift": null, 190 | "title": "HDFS Capacity", 191 | "type": "singlestat", 192 | "valueFontSize": "80%", 193 | "valueMaps": [ 194 | { 195 | "op": "=", 196 | "text": "N/A", 197 | "value": "null" 198 | } 199 | ], 200 | "valueName": "current" 201 | }, 202 | { 203 | "cacheTimeout": null, 204 | "colorBackground": false, 205 | "colorValue": true, 206 | "colors": [ 207 | "#d44a3a", 208 | "rgba(237, 129, 40, 0.89)", 209 | "#299c46" 210 | ], 211 | "datasource": "Prometheus", 212 | "decimals": 0, 213 | "format": "short", 214 | "gauge": { 215 | "maxValue": 100, 216 | "minValue": 0, 217 | "show": false, 218 | "thresholdLabels": false, 219 | "thresholdMarkers": true 220 | }, 221 | "gridPos": { 222 | "h": 3, 223 | "w": 2, 224 | "x": 4, 225 | "y": 1 226 | }, 227 | "id": 37, 228 | "interval": null, 229 | "links": [], 230 | "mappingType": 1, 231 | "mappingTypes": [ 232 | { 233 | "name": "value to text", 234 | "value": 1 235 | }, 236 | { 237 | "name": "range to text", 238 | "value": 2 239 | } 240 | ], 241 | "maxDataPoints": 100, 242 | "nullPointMode": "connected", 243 | "nullText": null, 244 | "postfix": "", 245 | "postfixFontSize": "50%", 246 | "prefix": "", 247 | "prefixFontSize": "50%", 248 | "rangeMaps": [ 249 | { 250 | "from": "null", 251 | "text": "N/A", 252 | "to": "null" 253 | } 254 | ], 255 | "sparkline": { 256 | "fillColor": "rgba(31, 118, 189, 0.18)", 257 | "full": false, 258 | "lineColor": "rgb(31, 120, 193)", 259 | "show": false, 260 | "ymax": null, 261 | "ymin": null 262 | }, 263 | "tableColumn": "", 264 | "targets": [ 265 | { 266 | "expr": "sum(hdfs_datanode_num_failed_volumes {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', role=\"DataNode\", 
kind=\"FSDatasetState\"})", 267 | "legendFormat": "{{instance}}", 268 | "refId": "A" 269 | } 270 | ], 271 | "thresholds": "0,0", 272 | "timeFrom": null, 273 | "timeShift": null, 274 | "title": "Failed Volumes", 275 | "type": "singlestat", 276 | "valueFontSize": "80%", 277 | "valueMaps": [ 278 | { 279 | "op": "=", 280 | "text": "N/A", 281 | "value": "null" 282 | } 283 | ], 284 | "valueName": "current" 285 | }, 286 | { 287 | "aliasColors": {}, 288 | "bars": false, 289 | "dashLength": 10, 290 | "dashes": false, 291 | "datasource": "Prometheus", 292 | "decimals": 1, 293 | "fill": 1, 294 | "fillGradient": 0, 295 | "gridPos": { 296 | "h": 6, 297 | "w": 9, 298 | "x": 6, 299 | "y": 1 300 | }, 301 | "hiddenSeries": false, 302 | "id": 7, 303 | "interval": "", 304 | "legend": { 305 | "alignAsTable": true, 306 | "avg": true, 307 | "current": true, 308 | "max": true, 309 | "min": false, 310 | "show": true, 311 | "total": false, 312 | "values": true 313 | }, 314 | "lines": true, 315 | "linewidth": 1, 316 | "nullPointMode": "null", 317 | "options": { 318 | "dataLinks": [] 319 | }, 320 | "percentage": false, 321 | "pointradius": 2, 322 | "points": false, 323 | "renderer": "flot", 324 | "seriesOverrides": [], 325 | "spaceLength": 10, 326 | "stack": false, 327 | "steppedLine": false, 328 | "targets": [ 329 | { 330 | "expr": "hdfs_datanode_remaining {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}", 331 | "legendFormat": "{{instance}}", 332 | "refId": "A" 333 | } 334 | ], 335 | "thresholds": [], 336 | "timeFrom": null, 337 | "timeRegions": [], 338 | "timeShift": null, 339 | "title": "Remaining Disk Capacity", 340 | "tooltip": { 341 | "shared": true, 342 | "sort": 0, 343 | "value_type": "individual" 344 | }, 345 | "type": "graph", 346 | "xaxis": { 347 | "buckets": null, 348 | "mode": "time", 349 | "name": null, 350 | "show": true, 351 | "values": [] 352 | }, 353 | "yaxes": [ 354 | { 355 | "decimals": null, 356 | "format": 
"decbytes", 357 | "label": "", 358 | "logBase": 1, 359 | "max": null, 360 | "min": null, 361 | "show": true 362 | }, 363 | { 364 | "format": "short", 365 | "label": null, 366 | "logBase": 1, 367 | "max": null, 368 | "min": null, 369 | "show": true 370 | } 371 | ], 372 | "yaxis": { 373 | "align": false, 374 | "alignLevel": null 375 | } 376 | }, 377 | { 378 | "aliasColors": {}, 379 | "bars": false, 380 | "dashLength": 10, 381 | "dashes": false, 382 | "datasource": "Prometheus", 383 | "decimals": 1, 384 | "fill": 1, 385 | "fillGradient": 0, 386 | "gridPos": { 387 | "h": 6, 388 | "w": 9, 389 | "x": 15, 390 | "y": 1 391 | }, 392 | "hiddenSeries": false, 393 | "id": 9, 394 | "interval": "", 395 | "legend": { 396 | "alignAsTable": true, 397 | "avg": true, 398 | "current": true, 399 | "max": true, 400 | "min": false, 401 | "show": true, 402 | "total": false, 403 | "values": true 404 | }, 405 | "lines": true, 406 | "linewidth": 1, 407 | "nullPointMode": "null", 408 | "options": { 409 | "dataLinks": [] 410 | }, 411 | "percentage": false, 412 | "pointradius": 2, 413 | "points": false, 414 | "renderer": "flot", 415 | "seriesOverrides": [], 416 | "spaceLength": 10, 417 | "stack": false, 418 | "steppedLine": false, 419 | "targets": [ 420 | { 421 | "expr": "1-hdfs_datanode_remaining {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}/ hdfs_datanode_capacity {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}", 422 | "legendFormat": "{{instance}}", 423 | "refId": "A" 424 | } 425 | ], 426 | "thresholds": [], 427 | "timeFrom": null, 428 | "timeRegions": [], 429 | "timeShift": null, 430 | "title": "Disk Utilization", 431 | "tooltip": { 432 | "shared": true, 433 | "sort": 0, 434 | "value_type": "individual" 435 | }, 436 | "type": "graph", 437 | "xaxis": { 438 | "buckets": null, 439 | "mode": "time", 440 | "name": null, 441 | "show": true, 442 | "values": [] 443 | }, 444 | "yaxes": [ 
445 | { 446 | "decimals": null, 447 | "format": "percentunit", 448 | "label": null, 449 | "logBase": 1, 450 | "max": null, 451 | "min": null, 452 | "show": true 453 | }, 454 | { 455 | "format": "short", 456 | "label": null, 457 | "logBase": 1, 458 | "max": null, 459 | "min": null, 460 | "show": true 461 | } 462 | ], 463 | "yaxis": { 464 | "align": false, 465 | "alignLevel": null 466 | } 467 | }, 468 | { 469 | "cacheTimeout": null, 470 | "colorBackground": false, 471 | "colorValue": true, 472 | "colors": [ 473 | "#d44a3a", 474 | "rgba(237, 129, 40, 0.89)", 475 | "#299c46" 476 | ], 477 | "datasource": "Prometheus", 478 | "format": "dateTimeAsIso", 479 | "gauge": { 480 | "maxValue": 100, 481 | "minValue": 0, 482 | "show": false, 483 | "thresholdLabels": false, 484 | "thresholdMarkers": true 485 | }, 486 | "gridPos": { 487 | "h": 3, 488 | "w": 6, 489 | "x": 0, 490 | "y": 4 491 | }, 492 | "id": 36, 493 | "interval": null, 494 | "links": [], 495 | "mappingType": 1, 496 | "mappingTypes": [ 497 | { 498 | "name": "value to text", 499 | "value": 1 500 | }, 501 | { 502 | "name": "range to text", 503 | "value": 2 504 | } 505 | ], 506 | "maxDataPoints": 100, 507 | "nullPointMode": "connected", 508 | "nullText": null, 509 | "postfix": "", 510 | "postfixFontSize": "50%", 511 | "prefix": "", 512 | "prefixFontSize": "50%", 513 | "rangeMaps": [ 514 | { 515 | "from": "null", 516 | "text": "N/A", 517 | "to": "null" 518 | } 519 | ], 520 | "sparkline": { 521 | "fillColor": "rgba(31, 118, 189, 0.18)", 522 | "full": false, 523 | "lineColor": "rgb(31, 120, 193)", 524 | "show": false, 525 | "ymax": null, 526 | "ymin": null 527 | }, 528 | "tableColumn": "", 529 | "targets": [ 530 | { 531 | "expr": "max(hdfs_datanode_last_volume_failure_date {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance'})", 532 | "refId": "A" 533 | } 534 | ], 535 | "thresholds": "0,0", 536 | "timeFrom": null, 537 | "timeShift": null, 538 | "title": "Latest Volume Failure", 539 | "type": "singlestat", 540 | 
"valueFontSize": "80%", 541 | "valueMaps": [ 542 | { 543 | "op": "=", 544 | "text": "None", 545 | "value": "1970-01-01 08:00:00" 546 | } 547 | ], 548 | "valueName": "current" 549 | }, 550 | { 551 | "aliasColors": {}, 552 | "bars": false, 553 | "cacheTimeout": null, 554 | "dashLength": 10, 555 | "dashes": false, 556 | "datasource": "Prometheus", 557 | "decimals": 1, 558 | "fill": 1, 559 | "fillGradient": 0, 560 | "gridPos": { 561 | "h": 6, 562 | "w": 8, 563 | "x": 0, 564 | "y": 7 565 | }, 566 | "hiddenSeries": false, 567 | "id": 11, 568 | "interval": "", 569 | "legend": { 570 | "alignAsTable": true, 571 | "avg": true, 572 | "current": true, 573 | "max": true, 574 | "min": false, 575 | "show": true, 576 | "total": false, 577 | "values": true 578 | }, 579 | "lines": true, 580 | "linewidth": 1, 581 | "links": [], 582 | "nullPointMode": "null", 583 | "options": { 584 | "dataLinks": [] 585 | }, 586 | "percentage": false, 587 | "pointradius": 2, 588 | "points": false, 589 | "renderer": "flot", 590 | "seriesOverrides": [], 591 | "spaceLength": 10, 592 | "stack": false, 593 | "steppedLine": false, 594 | "targets": [ 595 | { 596 | "expr": "rate(hdfs_datanode_bytes_written {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 597 | "legendFormat": "{{instance}}", 598 | "refId": "A" 599 | } 600 | ], 601 | "thresholds": [], 602 | "timeFrom": null, 603 | "timeRegions": [], 604 | "timeShift": null, 605 | "title": "Writes", 606 | "tooltip": { 607 | "shared": true, 608 | "sort": 0, 609 | "value_type": "individual" 610 | }, 611 | "type": "graph", 612 | "xaxis": { 613 | "buckets": null, 614 | "mode": "time", 615 | "name": null, 616 | "show": true, 617 | "values": [] 618 | }, 619 | "yaxes": [ 620 | { 621 | "format": "ops", 622 | "label": null, 623 | "logBase": 1, 624 | "max": null, 625 | "min": null, 626 | "show": true 627 | }, 628 | { 629 | "format": "short", 630 | "label": null, 631 | "logBase": 1, 632 | "max": null, 633 | "min": 
null, 634 | "show": true 635 | } 636 | ], 637 | "yaxis": { 638 | "align": false, 639 | "alignLevel": null 640 | } 641 | }, 642 | { 643 | "aliasColors": {}, 644 | "bars": false, 645 | "cacheTimeout": null, 646 | "dashLength": 10, 647 | "dashes": false, 648 | "datasource": "Prometheus", 649 | "decimals": 1, 650 | "fill": 1, 651 | "fillGradient": 0, 652 | "gridPos": { 653 | "h": 6, 654 | "w": 8, 655 | "x": 8, 656 | "y": 7 657 | }, 658 | "hiddenSeries": false, 659 | "id": 10, 660 | "interval": "", 661 | "legend": { 662 | "alignAsTable": true, 663 | "avg": true, 664 | "current": true, 665 | "max": true, 666 | "min": false, 667 | "show": true, 668 | "total": false, 669 | "values": true 670 | }, 671 | "lines": true, 672 | "linewidth": 1, 673 | "links": [], 674 | "nullPointMode": "null", 675 | "options": { 676 | "dataLinks": [] 677 | }, 678 | "percentage": false, 679 | "pointradius": 2, 680 | "points": false, 681 | "renderer": "flot", 682 | "seriesOverrides": [], 683 | "spaceLength": 10, 684 | "stack": false, 685 | "steppedLine": false, 686 | "targets": [ 687 | { 688 | "expr": "rate(hdfs_datanode_bytes_read {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 689 | "legendFormat": "{{instance}}", 690 | "refId": "A" 691 | } 692 | ], 693 | "thresholds": [], 694 | "timeFrom": null, 695 | "timeRegions": [], 696 | "timeShift": null, 697 | "title": "Reads", 698 | "tooltip": { 699 | "shared": true, 700 | "sort": 0, 701 | "value_type": "individual" 702 | }, 703 | "type": "graph", 704 | "xaxis": { 705 | "buckets": null, 706 | "mode": "time", 707 | "name": null, 708 | "show": true, 709 | "values": [] 710 | }, 711 | "yaxes": [ 712 | { 713 | "format": "ops", 714 | "label": null, 715 | "logBase": 1, 716 | "max": null, 717 | "min": null, 718 | "show": true 719 | }, 720 | { 721 | "format": "short", 722 | "label": null, 723 | "logBase": 1, 724 | "max": null, 725 | "min": null, 726 | "show": true 727 | } 728 | ], 729 | "yaxis": { 730 | 
"align": false, 731 | "alignLevel": null 732 | } 733 | }, 734 | { 735 | "aliasColors": {}, 736 | "bars": false, 737 | "cacheTimeout": null, 738 | "dashLength": 10, 739 | "dashes": false, 740 | "datasource": "Prometheus", 741 | "decimals": 1, 742 | "fill": 1, 743 | "fillGradient": 0, 744 | "gridPos": { 745 | "h": 6, 746 | "w": 8, 747 | "x": 16, 748 | "y": 7 749 | }, 750 | "hiddenSeries": false, 751 | "id": 12, 752 | "interval": "", 753 | "legend": { 754 | "alignAsTable": true, 755 | "avg": true, 756 | "current": true, 757 | "max": true, 758 | "min": false, 759 | "show": true, 760 | "total": false, 761 | "values": true 762 | }, 763 | "lines": true, 764 | "linewidth": 1, 765 | "links": [], 766 | "nullPointMode": "null", 767 | "options": { 768 | "dataLinks": [] 769 | }, 770 | "percentage": false, 771 | "pointradius": 2, 772 | "points": false, 773 | "renderer": "flot", 774 | "seriesOverrides": [], 775 | "spaceLength": 10, 776 | "stack": false, 777 | "steppedLine": false, 778 | "targets": [ 779 | { 780 | "expr": "rate(hdfs_datanode_datanode_network_errors {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 781 | "legendFormat": "{{instance}}", 782 | "refId": "A" 783 | } 784 | ], 785 | "thresholds": [], 786 | "timeFrom": null, 787 | "timeRegions": [], 788 | "timeShift": null, 789 | "title": "Network Errors", 790 | "tooltip": { 791 | "shared": true, 792 | "sort": 0, 793 | "value_type": "individual" 794 | }, 795 | "type": "graph", 796 | "xaxis": { 797 | "buckets": null, 798 | "mode": "time", 799 | "name": null, 800 | "show": true, 801 | "values": [] 802 | }, 803 | "yaxes": [ 804 | { 805 | "format": "ops", 806 | "label": null, 807 | "logBase": 1, 808 | "max": null, 809 | "min": null, 810 | "show": true 811 | }, 812 | { 813 | "format": "short", 814 | "label": null, 815 | "logBase": 1, 816 | "max": null, 817 | "min": null, 818 | "show": true 819 | } 820 | ], 821 | "yaxis": { 822 | "align": false, 823 | "alignLevel": null 824 
| } 825 | }, 826 | { 827 | "collapsed": true, 828 | "datasource": "Prometheus", 829 | "gridPos": { 830 | "h": 1, 831 | "w": 24, 832 | "x": 0, 833 | "y": 13 834 | }, 835 | "id": 39, 836 | "panels": [ 837 | { 838 | "aliasColors": {}, 839 | "bars": false, 840 | "dashLength": 10, 841 | "dashes": false, 842 | "datasource": "Prometheus", 843 | "decimals": 1, 844 | "fill": 1, 845 | "fillGradient": 0, 846 | "gridPos": { 847 | "h": 6, 848 | "w": 8, 849 | "x": 0, 850 | "y": 2 851 | }, 852 | "hiddenSeries": false, 853 | "id": 40, 854 | "interval": "", 855 | "legend": { 856 | "alignAsTable": true, 857 | "avg": true, 858 | "current": true, 859 | "max": true, 860 | "min": false, 861 | "show": true, 862 | "total": false, 863 | "values": true 864 | }, 865 | "lines": true, 866 | "linewidth": 1, 867 | "nullPointMode": "null", 868 | "options": { 869 | "dataLinks": [] 870 | }, 871 | "percentage": false, 872 | "pointradius": 2, 873 | "points": false, 874 | "renderer": "flot", 875 | "seriesOverrides": [], 876 | "spaceLength": 10, 877 | "stack": false, 878 | "steppedLine": false, 879 | "targets": [ 880 | { 881 | "expr": "1-hdfs_datanode_remaining {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}/ hdfs_datanode_capacity {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}", 882 | "legendFormat": "{{instance}}", 883 | "refId": "A" 884 | } 885 | ], 886 | "thresholds": [], 887 | "timeFrom": null, 888 | "timeRegions": [], 889 | "timeShift": null, 890 | "title": "Disk Capacity Utilization", 891 | "tooltip": { 892 | "shared": true, 893 | "sort": 0, 894 | "value_type": "individual" 895 | }, 896 | "type": "graph", 897 | "xaxis": { 898 | "buckets": null, 899 | "mode": "time", 900 | "name": null, 901 | "show": true, 902 | "values": [] 903 | }, 904 | "yaxes": [ 905 | { 906 | "format": "percentunit", 907 | "label": null, 908 | "logBase": 1, 909 | "max": null, 910 | "min": null, 911 | "show": true 
912 | }, 913 | { 914 | "format": "short", 915 | "label": null, 916 | "logBase": 1, 917 | "max": null, 918 | "min": null, 919 | "show": true 920 | } 921 | ], 922 | "yaxis": { 923 | "align": false, 924 | "alignLevel": null 925 | } 926 | }, 927 | { 928 | "aliasColors": {}, 929 | "bars": false, 930 | "dashLength": 10, 931 | "dashes": false, 932 | "datasource": "Prometheus", 933 | "decimals": 1, 934 | "fill": 1, 935 | "fillGradient": 0, 936 | "gridPos": { 937 | "h": 6, 938 | "w": 8, 939 | "x": 8, 940 | "y": 2 941 | }, 942 | "hiddenSeries": false, 943 | "id": 42, 944 | "interval": "", 945 | "legend": { 946 | "alignAsTable": true, 947 | "avg": true, 948 | "current": true, 949 | "max": true, 950 | "min": false, 951 | "show": true, 952 | "total": false, 953 | "values": true 954 | }, 955 | "lines": true, 956 | "linewidth": 1, 957 | "nullPointMode": "null", 958 | "options": { 959 | "dataLinks": [] 960 | }, 961 | "percentage": false, 962 | "pointradius": 2, 963 | "points": false, 964 | "renderer": "flot", 965 | "seriesOverrides": [], 966 | "spaceLength": 10, 967 | "stack": false, 968 | "steppedLine": false, 969 | "targets": [ 970 | { 971 | "expr": "hdfs_datanode_dfs_used {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}", 972 | "legendFormat": "{{instance}}", 973 | "refId": "A" 974 | } 975 | ], 976 | "thresholds": [], 977 | "timeFrom": null, 978 | "timeRegions": [], 979 | "timeShift": null, 980 | "title": "Used Disk Capacity", 981 | "tooltip": { 982 | "shared": true, 983 | "sort": 0, 984 | "value_type": "individual" 985 | }, 986 | "type": "graph", 987 | "xaxis": { 988 | "buckets": null, 989 | "mode": "time", 990 | "name": null, 991 | "show": true, 992 | "values": [] 993 | }, 994 | "yaxes": [ 995 | { 996 | "format": "decbytes", 997 | "label": null, 998 | "logBase": 1, 999 | "max": null, 1000 | "min": null, 1001 | "show": true 1002 | }, 1003 | { 1004 | "format": "short", 1005 | "label": null, 1006 | "logBase": 1, 1007 | "max": 
null, 1008 | "min": null, 1009 | "show": true 1010 | } 1011 | ], 1012 | "yaxis": { 1013 | "align": false, 1014 | "alignLevel": null 1015 | } 1016 | }, 1017 | { 1018 | "aliasColors": {}, 1019 | "bars": false, 1020 | "dashLength": 10, 1021 | "dashes": false, 1022 | "datasource": "Prometheus", 1023 | "decimals": 1, 1024 | "fill": 1, 1025 | "fillGradient": 0, 1026 | "gridPos": { 1027 | "h": 6, 1028 | "w": 8, 1029 | "x": 16, 1030 | "y": 2 1031 | }, 1032 | "hiddenSeries": false, 1033 | "id": 41, 1034 | "interval": "", 1035 | "legend": { 1036 | "alignAsTable": true, 1037 | "avg": true, 1038 | "current": true, 1039 | "max": true, 1040 | "min": false, 1041 | "show": true, 1042 | "total": false, 1043 | "values": true 1044 | }, 1045 | "lines": true, 1046 | "linewidth": 1, 1047 | "nullPointMode": "null", 1048 | "options": { 1049 | "dataLinks": [] 1050 | }, 1051 | "percentage": false, 1052 | "pointradius": 2, 1053 | "points": false, 1054 | "renderer": "flot", 1055 | "seriesOverrides": [], 1056 | "spaceLength": 10, 1057 | "stack": false, 1058 | "steppedLine": false, 1059 | "targets": [ 1060 | { 1061 | "expr": "hdfs_datanode_remaining {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}", 1062 | "legendFormat": "{{instance}}", 1063 | "refId": "A" 1064 | } 1065 | ], 1066 | "thresholds": [], 1067 | "timeFrom": null, 1068 | "timeRegions": [], 1069 | "timeShift": null, 1070 | "title": "Remaining Disk Capacity", 1071 | "tooltip": { 1072 | "shared": true, 1073 | "sort": 0, 1074 | "value_type": "individual" 1075 | }, 1076 | "type": "graph", 1077 | "xaxis": { 1078 | "buckets": null, 1079 | "mode": "time", 1080 | "name": null, 1081 | "show": true, 1082 | "values": [] 1083 | }, 1084 | "yaxes": [ 1085 | { 1086 | "format": "decbytes", 1087 | "label": null, 1088 | "logBase": 1, 1089 | "max": null, 1090 | "min": null, 1091 | "show": true 1092 | }, 1093 | { 1094 | "format": "short", 1095 | "label": null, 1096 | "logBase": 1, 1097 | "max": null, 
1098 | "min": null, 1099 | "show": true 1100 | } 1101 | ], 1102 | "yaxis": { 1103 | "align": false, 1104 | "alignLevel": null 1105 | } 1106 | }, 1107 | { 1108 | "aliasColors": {}, 1109 | "bars": false, 1110 | "cacheTimeout": null, 1111 | "dashLength": 10, 1112 | "dashes": false, 1113 | "datasource": "Prometheus", 1114 | "decimals": 1, 1115 | "fill": 1, 1116 | "fillGradient": 0, 1117 | "gridPos": { 1118 | "h": 6, 1119 | "w": 8, 1120 | "x": 0, 1121 | "y": 8 1122 | }, 1123 | "hiddenSeries": false, 1124 | "id": 44, 1125 | "interval": "", 1126 | "legend": { 1127 | "alignAsTable": true, 1128 | "avg": true, 1129 | "current": true, 1130 | "max": true, 1131 | "min": false, 1132 | "show": true, 1133 | "total": false, 1134 | "values": true 1135 | }, 1136 | "lines": true, 1137 | "linewidth": 1, 1138 | "links": [], 1139 | "nullPointMode": "null", 1140 | "options": { 1141 | "dataLinks": [] 1142 | }, 1143 | "percentage": false, 1144 | "pointradius": 2, 1145 | "points": false, 1146 | "renderer": "flot", 1147 | "seriesOverrides": [], 1148 | "spaceLength": 10, 1149 | "stack": false, 1150 | "steppedLine": false, 1151 | "targets": [ 1152 | { 1153 | "expr": "rate(hdfs_datanode_bytes_written {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1154 | "legendFormat": "{{instance}}", 1155 | "refId": "A" 1156 | } 1157 | ], 1158 | "thresholds": [], 1159 | "timeFrom": null, 1160 | "timeRegions": [], 1161 | "timeShift": null, 1162 | "title": "Writes", 1163 | "tooltip": { 1164 | "shared": true, 1165 | "sort": 0, 1166 | "value_type": "individual" 1167 | }, 1168 | "type": "graph", 1169 | "xaxis": { 1170 | "buckets": null, 1171 | "mode": "time", 1172 | "name": null, 1173 | "show": true, 1174 | "values": [] 1175 | }, 1176 | "yaxes": [ 1177 | { 1178 | "format": "ops", 1179 | "label": null, 1180 | "logBase": 1, 1181 | "max": null, 1182 | "min": null, 1183 | "show": true 1184 | }, 1185 | { 1186 | "format": "short", 1187 | "label": null, 1188 | 
"logBase": 1, 1189 | "max": null, 1190 | "min": null, 1191 | "show": true 1192 | } 1193 | ], 1194 | "yaxis": { 1195 | "align": false, 1196 | "alignLevel": null 1197 | } 1198 | }, 1199 | { 1200 | "aliasColors": {}, 1201 | "bars": false, 1202 | "cacheTimeout": null, 1203 | "dashLength": 10, 1204 | "dashes": false, 1205 | "datasource": "Prometheus", 1206 | "decimals": 1, 1207 | "fill": 1, 1208 | "fillGradient": 0, 1209 | "gridPos": { 1210 | "h": 6, 1211 | "w": 8, 1212 | "x": 8, 1213 | "y": 8 1214 | }, 1215 | "hiddenSeries": false, 1216 | "id": 43, 1217 | "interval": "", 1218 | "legend": { 1219 | "alignAsTable": true, 1220 | "avg": true, 1221 | "current": true, 1222 | "max": true, 1223 | "min": false, 1224 | "show": true, 1225 | "total": false, 1226 | "values": true 1227 | }, 1228 | "lines": true, 1229 | "linewidth": 1, 1230 | "links": [], 1231 | "nullPointMode": "null", 1232 | "options": { 1233 | "dataLinks": [] 1234 | }, 1235 | "percentage": false, 1236 | "pointradius": 2, 1237 | "points": false, 1238 | "renderer": "flot", 1239 | "seriesOverrides": [], 1240 | "spaceLength": 10, 1241 | "stack": false, 1242 | "steppedLine": false, 1243 | "targets": [ 1244 | { 1245 | "expr": "rate(hdfs_datanode_bytes_read {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1246 | "legendFormat": "{{instance}}", 1247 | "refId": "A" 1248 | } 1249 | ], 1250 | "thresholds": [], 1251 | "timeFrom": null, 1252 | "timeRegions": [], 1253 | "timeShift": null, 1254 | "title": "Reads", 1255 | "tooltip": { 1256 | "shared": true, 1257 | "sort": 0, 1258 | "value_type": "individual" 1259 | }, 1260 | "type": "graph", 1261 | "xaxis": { 1262 | "buckets": null, 1263 | "mode": "time", 1264 | "name": null, 1265 | "show": true, 1266 | "values": [] 1267 | }, 1268 | "yaxes": [ 1269 | { 1270 | "format": "ops", 1271 | "label": null, 1272 | "logBase": 1, 1273 | "max": null, 1274 | "min": null, 1275 | "show": true 1276 | }, 1277 | { 1278 | "format": "short", 1279 
| "label": null, 1280 | "logBase": 1, 1281 | "max": null, 1282 | "min": null, 1283 | "show": true 1284 | } 1285 | ], 1286 | "yaxis": { 1287 | "align": false, 1288 | "alignLevel": null 1289 | } 1290 | }, 1291 | { 1292 | "aliasColors": {}, 1293 | "bars": false, 1294 | "cacheTimeout": null, 1295 | "dashLength": 10, 1296 | "dashes": false, 1297 | "datasource": "Prometheus", 1298 | "decimals": 1, 1299 | "fill": 1, 1300 | "fillGradient": 0, 1301 | "gridPos": { 1302 | "h": 6, 1303 | "w": 8, 1304 | "x": 16, 1305 | "y": 8 1306 | }, 1307 | "hiddenSeries": false, 1308 | "id": 45, 1309 | "interval": "", 1310 | "legend": { 1311 | "alignAsTable": true, 1312 | "avg": true, 1313 | "current": true, 1314 | "max": true, 1315 | "min": false, 1316 | "show": true, 1317 | "total": false, 1318 | "values": true 1319 | }, 1320 | "lines": true, 1321 | "linewidth": 1, 1322 | "links": [], 1323 | "nullPointMode": "null", 1324 | "options": { 1325 | "dataLinks": [] 1326 | }, 1327 | "percentage": false, 1328 | "pointradius": 2, 1329 | "points": false, 1330 | "renderer": "flot", 1331 | "seriesOverrides": [], 1332 | "spaceLength": 10, 1333 | "stack": false, 1334 | "steppedLine": false, 1335 | "targets": [ 1336 | { 1337 | "expr": "rate(hdfs_datanode_blocks_written {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1338 | "legendFormat": "{{instance}}", 1339 | "refId": "A" 1340 | } 1341 | ], 1342 | "thresholds": [], 1343 | "timeFrom": null, 1344 | "timeRegions": [], 1345 | "timeShift": null, 1346 | "title": "Write Block Rate", 1347 | "tooltip": { 1348 | "shared": true, 1349 | "sort": 0, 1350 | "value_type": "individual" 1351 | }, 1352 | "type": "graph", 1353 | "xaxis": { 1354 | "buckets": null, 1355 | "mode": "time", 1356 | "name": null, 1357 | "show": true, 1358 | "values": [] 1359 | }, 1360 | "yaxes": [ 1361 | { 1362 | "format": "ops", 1363 | "label": null, 1364 | "logBase": 1, 1365 | "max": null, 1366 | "min": null, 1367 | "show": true 1368 | }, 
1369 | { 1370 | "format": "short", 1371 | "label": null, 1372 | "logBase": 1, 1373 | "max": null, 1374 | "min": null, 1375 | "show": true 1376 | } 1377 | ], 1378 | "yaxis": { 1379 | "align": false, 1380 | "alignLevel": null 1381 | } 1382 | }, 1383 | { 1384 | "aliasColors": {}, 1385 | "bars": false, 1386 | "cacheTimeout": null, 1387 | "dashLength": 10, 1388 | "dashes": false, 1389 | "datasource": "Prometheus", 1390 | "decimals": 1, 1391 | "fill": 1, 1392 | "fillGradient": 0, 1393 | "gridPos": { 1394 | "h": 6, 1395 | "w": 8, 1396 | "x": 0, 1397 | "y": 14 1398 | }, 1399 | "hiddenSeries": false, 1400 | "id": 46, 1401 | "interval": "", 1402 | "legend": { 1403 | "alignAsTable": true, 1404 | "avg": true, 1405 | "current": true, 1406 | "max": true, 1407 | "min": false, 1408 | "show": true, 1409 | "total": false, 1410 | "values": true 1411 | }, 1412 | "lines": true, 1413 | "linewidth": 1, 1414 | "links": [], 1415 | "nullPointMode": "null", 1416 | "options": { 1417 | "dataLinks": [] 1418 | }, 1419 | "percentage": false, 1420 | "pointradius": 2, 1421 | "points": false, 1422 | "renderer": "flot", 1423 | "seriesOverrides": [], 1424 | "spaceLength": 10, 1425 | "stack": false, 1426 | "steppedLine": false, 1427 | "targets": [ 1428 | { 1429 | "expr": "rate(hdfs_datanode_blocks_read {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1430 | "legendFormat": "{{instance}}", 1431 | "refId": "A" 1432 | } 1433 | ], 1434 | "thresholds": [], 1435 | "timeFrom": null, 1436 | "timeRegions": [], 1437 | "timeShift": null, 1438 | "title": "Read Block Rate", 1439 | "tooltip": { 1440 | "shared": true, 1441 | "sort": 0, 1442 | "value_type": "individual" 1443 | }, 1444 | "type": "graph", 1445 | "xaxis": { 1446 | "buckets": null, 1447 | "mode": "time", 1448 | "name": null, 1449 | "show": true, 1450 | "values": [] 1451 | }, 1452 | "yaxes": [ 1453 | { 1454 | "format": "ops", 1455 | "label": null, 1456 | "logBase": 1, 1457 | "max": null, 1458 | "min": 
null, 1459 | "show": true 1460 | }, 1461 | { 1462 | "format": "short", 1463 | "label": null, 1464 | "logBase": 1, 1465 | "max": null, 1466 | "min": null, 1467 | "show": true 1468 | } 1469 | ], 1470 | "yaxis": { 1471 | "align": false, 1472 | "alignLevel": null 1473 | } 1474 | }, 1475 | { 1476 | "aliasColors": {}, 1477 | "bars": false, 1478 | "cacheTimeout": null, 1479 | "dashLength": 10, 1480 | "dashes": false, 1481 | "datasource": "Prometheus", 1482 | "decimals": 1, 1483 | "fill": 1, 1484 | "fillGradient": 0, 1485 | "gridPos": { 1486 | "h": 6, 1487 | "w": 8, 1488 | "x": 8, 1489 | "y": 14 1490 | }, 1491 | "hiddenSeries": false, 1492 | "id": 47, 1493 | "interval": "", 1494 | "legend": { 1495 | "alignAsTable": true, 1496 | "avg": true, 1497 | "current": true, 1498 | "max": true, 1499 | "min": false, 1500 | "show": true, 1501 | "total": false, 1502 | "values": true 1503 | }, 1504 | "lines": true, 1505 | "linewidth": 1, 1506 | "links": [], 1507 | "nullPointMode": "null", 1508 | "options": { 1509 | "dataLinks": [] 1510 | }, 1511 | "percentage": false, 1512 | "pointradius": 2, 1513 | "points": false, 1514 | "renderer": "flot", 1515 | "seriesOverrides": [], 1516 | "spaceLength": 10, 1517 | "stack": false, 1518 | "steppedLine": false, 1519 | "targets": [ 1520 | { 1521 | "expr": "rate(hdfs_datanode_blocks_replicated {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1522 | "legendFormat": "{{instance}}", 1523 | "refId": "A" 1524 | } 1525 | ], 1526 | "thresholds": [], 1527 | "timeFrom": null, 1528 | "timeRegions": [], 1529 | "timeShift": null, 1530 | "title": "Replication Block Rate", 1531 | "tooltip": { 1532 | "shared": true, 1533 | "sort": 0, 1534 | "value_type": "individual" 1535 | }, 1536 | "type": "graph", 1537 | "xaxis": { 1538 | "buckets": null, 1539 | "mode": "time", 1540 | "name": null, 1541 | "show": true, 1542 | "values": [] 1543 | }, 1544 | "yaxes": [ 1545 | { 1546 | "format": "ops", 1547 | "label": null, 1548 | 
"logBase": 1, 1549 | "max": null, 1550 | "min": null, 1551 | "show": true 1552 | }, 1553 | { 1554 | "format": "short", 1555 | "label": null, 1556 | "logBase": 1, 1557 | "max": null, 1558 | "min": null, 1559 | "show": true 1560 | } 1561 | ], 1562 | "yaxis": { 1563 | "align": false, 1564 | "alignLevel": null 1565 | } 1566 | }, 1567 | { 1568 | "aliasColors": {}, 1569 | "bars": false, 1570 | "cacheTimeout": null, 1571 | "dashLength": 10, 1572 | "dashes": false, 1573 | "datasource": "Prometheus", 1574 | "decimals": 1, 1575 | "fill": 1, 1576 | "fillGradient": 0, 1577 | "gridPos": { 1578 | "h": 6, 1579 | "w": 8, 1580 | "x": 16, 1581 | "y": 14 1582 | }, 1583 | "hiddenSeries": false, 1584 | "id": 48, 1585 | "interval": "", 1586 | "legend": { 1587 | "alignAsTable": true, 1588 | "avg": true, 1589 | "current": true, 1590 | "max": true, 1591 | "min": false, 1592 | "show": true, 1593 | "total": false, 1594 | "values": true 1595 | }, 1596 | "lines": true, 1597 | "linewidth": 1, 1598 | "links": [], 1599 | "nullPointMode": "null", 1600 | "options": { 1601 | "dataLinks": [] 1602 | }, 1603 | "percentage": false, 1604 | "pointradius": 2, 1605 | "points": false, 1606 | "renderer": "flot", 1607 | "seriesOverrides": [], 1608 | "spaceLength": 10, 1609 | "stack": false, 1610 | "steppedLine": false, 1611 | "targets": [ 1612 | { 1613 | "expr": "rate(hdfs_datanode_blocks_removed {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1614 | "legendFormat": "{{instance}}", 1615 | "refId": "A" 1616 | } 1617 | ], 1618 | "thresholds": [], 1619 | "timeFrom": null, 1620 | "timeRegions": [], 1621 | "timeShift": null, 1622 | "title": "Remove Block Rate", 1623 | "tooltip": { 1624 | "shared": true, 1625 | "sort": 0, 1626 | "value_type": "individual" 1627 | }, 1628 | "type": "graph", 1629 | "xaxis": { 1630 | "buckets": null, 1631 | "mode": "time", 1632 | "name": null, 1633 | "show": true, 1634 | "values": [] 1635 | }, 1636 | "yaxes": [ 1637 | { 1638 | 
"format": "ops", 1639 | "label": null, 1640 | "logBase": 1, 1641 | "max": null, 1642 | "min": null, 1643 | "show": true 1644 | }, 1645 | { 1646 | "format": "short", 1647 | "label": null, 1648 | "logBase": 1, 1649 | "max": null, 1650 | "min": null, 1651 | "show": true 1652 | } 1653 | ], 1654 | "yaxis": { 1655 | "align": false, 1656 | "alignLevel": null 1657 | } 1658 | }, 1659 | { 1660 | "aliasColors": {}, 1661 | "bars": false, 1662 | "dashLength": 10, 1663 | "dashes": false, 1664 | "datasource": "Prometheus", 1665 | "decimals": 1, 1666 | "fill": 1, 1667 | "fillGradient": 0, 1668 | "gridPos": { 1669 | "h": 6, 1670 | "w": 8, 1671 | "x": 0, 1672 | "y": 20 1673 | }, 1674 | "hiddenSeries": false, 1675 | "id": 8, 1676 | "interval": "", 1677 | "legend": { 1678 | "alignAsTable": true, 1679 | "avg": true, 1680 | "current": true, 1681 | "max": true, 1682 | "min": false, 1683 | "show": true, 1684 | "total": false, 1685 | "values": true 1686 | }, 1687 | "lines": true, 1688 | "linewidth": 1, 1689 | "nullPointMode": "null", 1690 | "options": { 1691 | "dataLinks": [] 1692 | }, 1693 | "percentage": false, 1694 | "pointradius": 2, 1695 | "points": false, 1696 | "renderer": "flot", 1697 | "seriesOverrides": [], 1698 | "spaceLength": 10, 1699 | "stack": false, 1700 | "steppedLine": false, 1701 | "targets": [ 1702 | { 1703 | "expr": "hdfs_datanode_num_failed_volumes {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"FSDatasetState\", fsdatasetid=''}", 1704 | "legendFormat": "{{instance}}", 1705 | "refId": "A" 1706 | } 1707 | ], 1708 | "thresholds": [], 1709 | "timeFrom": null, 1710 | "timeRegions": [], 1711 | "timeShift": null, 1712 | "title": "Failed Volumes", 1713 | "tooltip": { 1714 | "shared": true, 1715 | "sort": 0, 1716 | "value_type": "individual" 1717 | }, 1718 | "type": "graph", 1719 | "xaxis": { 1720 | "buckets": null, 1721 | "mode": "time", 1722 | "name": null, 1723 | "show": true, 1724 | "values": [] 1725 | }, 1726 | "yaxes": [ 1727 | { 1728 | "format": 
"short", 1729 | "label": null, 1730 | "logBase": 1, 1731 | "max": null, 1732 | "min": null, 1733 | "show": true 1734 | }, 1735 | { 1736 | "format": "short", 1737 | "label": null, 1738 | "logBase": 1, 1739 | "max": null, 1740 | "min": null, 1741 | "show": true 1742 | } 1743 | ], 1744 | "yaxis": { 1745 | "align": false, 1746 | "alignLevel": null 1747 | } 1748 | } 1749 | ], 1750 | "title": "Storage Stats", 1751 | "type": "row" 1752 | }, 1753 | { 1754 | "collapsed": true, 1755 | "datasource": "Prometheus", 1756 | "gridPos": { 1757 | "h": 1, 1758 | "w": 24, 1759 | "x": 0, 1760 | "y": 14 1761 | }, 1762 | "id": 33, 1763 | "panels": [ 1764 | { 1765 | "aliasColors": {}, 1766 | "bars": false, 1767 | "cacheTimeout": null, 1768 | "dashLength": 10, 1769 | "dashes": false, 1770 | "datasource": "Prometheus", 1771 | "decimals": 1, 1772 | "fill": 1, 1773 | "fillGradient": 0, 1774 | "gridPos": { 1775 | "h": 6, 1776 | "w": 8, 1777 | "x": 0, 1778 | "y": 3 1779 | }, 1780 | "hiddenSeries": false, 1781 | "id": 34, 1782 | "interval": "", 1783 | "legend": { 1784 | "alignAsTable": true, 1785 | "avg": true, 1786 | "current": true, 1787 | "max": true, 1788 | "min": false, 1789 | "show": true, 1790 | "total": false, 1791 | "values": true 1792 | }, 1793 | "lines": true, 1794 | "linewidth": 1, 1795 | "links": [], 1796 | "nullPointMode": "null", 1797 | "options": { 1798 | "dataLinks": [] 1799 | }, 1800 | "percentage": false, 1801 | "pointradius": 2, 1802 | "points": false, 1803 | "renderer": "flot", 1804 | "seriesOverrides": [], 1805 | "spaceLength": 10, 1806 | "stack": false, 1807 | "steppedLine": false, 1808 | "targets": [ 1809 | { 1810 | "expr": "rate(hdfs_datanode_datanode_network_errors {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1811 | "legendFormat": "{{instance}}", 1812 | "refId": "A" 1813 | } 1814 | ], 1815 | "thresholds": [], 1816 | "timeFrom": null, 1817 | "timeRegions": [], 1818 | "timeShift": null, 1819 | "title": "Network 
Errors", 1820 | "tooltip": { 1821 | "shared": true, 1822 | "sort": 0, 1823 | "value_type": "individual" 1824 | }, 1825 | "type": "graph", 1826 | "xaxis": { 1827 | "buckets": null, 1828 | "mode": "time", 1829 | "name": null, 1830 | "show": true, 1831 | "values": [] 1832 | }, 1833 | "yaxes": [ 1834 | { 1835 | "format": "ops", 1836 | "label": null, 1837 | "logBase": 1, 1838 | "max": null, 1839 | "min": null, 1840 | "show": true 1841 | }, 1842 | { 1843 | "format": "short", 1844 | "label": null, 1845 | "logBase": 1, 1846 | "max": null, 1847 | "min": null, 1848 | "show": true 1849 | } 1850 | ], 1851 | "yaxis": { 1852 | "align": false, 1853 | "alignLevel": null 1854 | } 1855 | }, 1856 | { 1857 | "aliasColors": {}, 1858 | "bars": false, 1859 | "cacheTimeout": null, 1860 | "dashLength": 10, 1861 | "dashes": false, 1862 | "datasource": "Prometheus", 1863 | "decimals": 1, 1864 | "fill": 1, 1865 | "fillGradient": 0, 1866 | "gridPos": { 1867 | "h": 6, 1868 | "w": 8, 1869 | "x": 8, 1870 | "y": 3 1871 | }, 1872 | "hiddenSeries": false, 1873 | "id": 13, 1874 | "interval": "", 1875 | "legend": { 1876 | "alignAsTable": true, 1877 | "avg": true, 1878 | "current": true, 1879 | "max": true, 1880 | "min": false, 1881 | "show": true, 1882 | "total": false, 1883 | "values": true 1884 | }, 1885 | "lines": true, 1886 | "linewidth": 1, 1887 | "links": [], 1888 | "nullPointMode": "null", 1889 | "options": { 1890 | "dataLinks": [] 1891 | }, 1892 | "percentage": false, 1893 | "pointradius": 2, 1894 | "points": false, 1895 | "renderer": "flot", 1896 | "seriesOverrides": [], 1897 | "spaceLength": 10, 1898 | "stack": false, 1899 | "steppedLine": false, 1900 | "targets": [ 1901 | { 1902 | "expr": "rate(hdfs_datanode_send_data_packet_blocked_on_network_nanos_num_ops {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', kind=\"DataNodeActivity\"} [$interval])", 1903 | "legendFormat": "{{instance}}", 1904 | "refId": "A" 1905 | } 1906 | ], 1907 | "thresholds": [], 1908 | "timeFrom": null, 1909 
| "timeRegions": [], 1910 | "timeShift": null, 1911 | "title": "Send Data Packets Blocked on Network", 1912 | "tooltip": { 1913 | "shared": true, 1914 | "sort": 0, 1915 | "value_type": "individual" 1916 | }, 1917 | "type": "graph", 1918 | "xaxis": { 1919 | "buckets": null, 1920 | "mode": "time", 1921 | "name": null, 1922 | "show": true, 1923 | "values": [] 1924 | }, 1925 | "yaxes": [ 1926 | { 1927 | "format": "ops", 1928 | "label": null, 1929 | "logBase": 1, 1930 | "max": null, 1931 | "min": null, 1932 | "show": true 1933 | }, 1934 | { 1935 | "format": "short", 1936 | "label": null, 1937 | "logBase": 1, 1938 | "max": null, 1939 | "min": null, 1940 | "show": true 1941 | } 1942 | ], 1943 | "yaxis": { 1944 | "align": false, 1945 | "alignLevel": null 1946 | } 1947 | }, 1948 | { 1949 | "aliasColors": {}, 1950 | "bars": false, 1951 | "cacheTimeout": null, 1952 | "dashLength": 10, 1953 | "dashes": false, 1954 | "datasource": "Prometheus", 1955 | "decimals": 1, 1956 | "fill": 1, 1957 | "fillGradient": 0, 1958 | "gridPos": { 1959 | "h": 6, 1960 | "w": 8, 1961 | "x": 16, 1962 | "y": 3 1963 | }, 1964 | "hiddenSeries": false, 1965 | "id": 31, 1966 | "interval": "", 1967 | "legend": { 1968 | "alignAsTable": true, 1969 | "avg": true, 1970 | "current": true, 1971 | "max": true, 1972 | "min": false, 1973 | "show": true, 1974 | "total": false, 1975 | "values": true 1976 | }, 1977 | "lines": true, 1978 | "linewidth": 1, 1979 | "links": [], 1980 | "nullPointMode": "null", 1981 | "options": { 1982 | "dataLinks": [] 1983 | }, 1984 | "percentage": false, 1985 | "pointradius": 2, 1986 | "points": false, 1987 | "renderer": "flot", 1988 | "seriesOverrides": [], 1989 | "spaceLength": 10, 1990 | "stack": false, 1991 | "steppedLine": false, 1992 | "targets": [ 1993 | { 1994 | "expr": "hdfs_datanode_send_data_packet_blocked_on_network_nanos_avg_time {job=\"$job\", cluster_id=~'$cluster', instance=~'$instance', role=\"DataNode\", kind=\"DataNodeActivity\"}", 1995 | "instant": false, 1996 | 
"legendFormat": "{{instance}}", 1997 | "refId": "A" 1998 | } 1999 | ], 2000 | "thresholds": [], 2001 | "timeFrom": null, 2002 | "timeRegions": [], 2003 | "timeShift": null, 2004 | "title": "Packet Wait Time", 2005 | "tooltip": { 2006 | "shared": true, 2007 | "sort": 0, 2008 | "value_type": "individual" 2009 | }, 2010 | "type": "graph", 2011 | "xaxis": { 2012 | "buckets": null, 2013 | "mode": "time", 2014 | "name": null, 2015 | "show": true, 2016 | "values": [] 2017 | }, 2018 | "yaxes": [ 2019 | { 2020 | "format": "ns", 2021 | "label": null, 2022 | "logBase": 1, 2023 | "max": null, 2024 | "min": null, 2025 | "show": true 2026 | }, 2027 | { 2028 | "format": "short", 2029 | "label": null, 2030 | "logBase": 1, 2031 | "max": null, 2032 | "min": null, 2033 | "show": true 2034 | } 2035 | ], 2036 | "yaxis": { 2037 | "align": false, 2038 | "alignLevel": null 2039 | } 2040 | }, 2041 | { 2042 | "aliasColors": {}, 2043 | "bars": false, 2044 | "cacheTimeout": null, 2045 | "dashLength": 10, 2046 | "dashes": false, 2047 | "datasource": "Prometheus", 2048 | "decimals": 1, 2049 | "fill": 1, 2050 | "fillGradient": 0, 2051 | "gridPos": { 2052 | "h": 6, 2053 | "w": 8, 2054 | "x": 0, 2055 | "y": 9 2056 | }, 2057 | "hiddenSeries": false, 2058 | "id": 35, 2059 | "interval": "", 2060 | "legend": { 2061 | "alignAsTable": true, 2062 | "avg": true, 2063 | "current": true, 2064 | "max": true, 2065 | "min": false, 2066 | "show": true, 2067 | "total": false, 2068 | "values": true 2069 | }, 2070 | "lines": true, 2071 | "linewidth": 1, 2072 | "links": [], 2073 | "nullPointMode": "null", 2074 | "options": { 2075 | "dataLinks": [] 2076 | }, 2077 | "percentage": false, 2078 | "pointradius": 2, 2079 | "points": false, 2080 | "renderer": "flot", 2081 | "seriesOverrides": [], 2082 | "spaceLength": 10, 2083 | "stack": false, 2084 | "steppedLine": false, 2085 | "targets": [ 2086 | { 2087 | "expr": "hdfs_datanode_send_data_packet_transfer_nanos_avg_time {job=\"$job\", cluster_id=~'$cluster', 
instance=~'$instance', role=\"DataNode\", kind=\"DataNodeActivity\"}", 2088 | "instant": false, 2089 | "legendFormat": "{{instance}}", 2090 | "refId": "A" 2091 | } 2092 | ], 2093 | "thresholds": [], 2094 | "timeFrom": null, 2095 | "timeRegions": [], 2096 | "timeShift": null, 2097 | "title": "Packet Transfer Time", 2098 | "tooltip": { 2099 | "shared": true, 2100 | "sort": 0, 2101 | "value_type": "individual" 2102 | }, 2103 | "type": "graph", 2104 | "xaxis": { 2105 | "buckets": null, 2106 | "mode": "time", 2107 | "name": null, 2108 | "show": true, 2109 | "values": [] 2110 | }, 2111 | "yaxes": [ 2112 | { 2113 | "format": "ns", 2114 | "label": null, 2115 | "logBase": 1, 2116 | "max": null, 2117 | "min": null, 2118 | "show": true 2119 | }, 2120 | { 2121 | "format": "short", 2122 | "label": null, 2123 | "logBase": 1, 2124 | "max": null, 2125 | "min": null, 2126 | "show": true 2127 | } 2128 | ], 2129 | "yaxis": { 2130 | "align": false, 2131 | "alignLevel": null 2132 | } 2133 | } 2134 | ], 2135 | "title": "Network Stats", 2136 | "type": "row" 2137 | } 2138 | ], 2139 | "schemaVersion": 22, 2140 | "style": "dark", 2141 | "tags": [ 2142 | "Amazon EMR", 2143 | "HDFS", 2144 | "Prometheus" 2145 | ], 2146 | "templating": { 2147 | "list": [ 2148 | { 2149 | "auto": false, 2150 | "auto_count": 30, 2151 | "auto_min": "10s", 2152 | "current": { 2153 | "selected": false, 2154 | "text": "1m", 2155 | "value": "1m" 2156 | }, 2157 | "hide": 0, 2158 | "label": "Interval", 2159 | "name": "interval", 2160 | "options": [ 2161 | { 2162 | "selected": true, 2163 | "text": "1m", 2164 | "value": "1m" 2165 | }, 2166 | { 2167 | "selected": false, 2168 | "text": "5m", 2169 | "value": "5m" 2170 | }, 2171 | { 2172 | "selected": false, 2173 | "text": "10m", 2174 | "value": "10m" 2175 | }, 2176 | { 2177 | "selected": false, 2178 | "text": "30m", 2179 | "value": "30m" 2180 | }, 2181 | { 2182 | "selected": false, 2183 | "text": "1h", 2184 | "value": "1h" 2185 | }, 2186 | { 2187 | "selected": false, 2188 | 
"text": "6h", 2189 | "value": "6h" 2190 | }, 2191 | { 2192 | "selected": false, 2193 | "text": "12h", 2194 | "value": "12h" 2195 | }, 2196 | { 2197 | "selected": false, 2198 | "text": "1d", 2199 | "value": "1d" 2200 | }, 2201 | { 2202 | "selected": false, 2203 | "text": "7d", 2204 | "value": "7d" 2205 | }, 2206 | { 2207 | "selected": false, 2208 | "text": "14d", 2209 | "value": "14d" 2210 | }, 2211 | { 2212 | "selected": false, 2213 | "text": "30d", 2214 | "value": "30d" 2215 | } 2216 | ], 2217 | "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 2218 | "refresh": 2, 2219 | "skipUrlSync": false, 2220 | "type": "interval" 2221 | }, 2222 | { 2223 | "allValue": null, 2224 | "current": { 2225 | "text": "hadoop_hdfs_datanode", 2226 | "value": "hadoop_hdfs_datanode" 2227 | }, 2228 | "datasource": "Prometheus", 2229 | "definition": "label_values(hdfs_datanode_capacity, job)", 2230 | "hide": 0, 2231 | "includeAll": false, 2232 | "index": -1, 2233 | "label": "Job", 2234 | "multi": false, 2235 | "name": "job", 2236 | "options": [], 2237 | "query": "label_values(hdfs_datanode_capacity, job)", 2238 | "refresh": 1, 2239 | "regex": "", 2240 | "skipUrlSync": false, 2241 | "sort": 1, 2242 | "tagValuesQuery": "", 2243 | "tags": [], 2244 | "tagsQuery": "", 2245 | "type": "query", 2246 | "useTags": false 2247 | }, 2248 | { 2249 | "allValue": null, 2250 | "current": { 2251 | "selected": false, 2252 | "text": "All", 2253 | "value": "$__all" 2254 | }, 2255 | "datasource": "Prometheus", 2256 | "definition": "label_values(hdfs_datanode_capacity{job=~\"$job\"}, cluster_id)", 2257 | "hide": 0, 2258 | "includeAll": true, 2259 | "index": -1, 2260 | "label": "Cluster", 2261 | "multi": true, 2262 | "name": "cluster", 2263 | "options": [], 2264 | "query": "label_values(hdfs_datanode_capacity{job=~\"$job\"}, cluster_id)", 2265 | "refresh": 1, 2266 | "regex": "", 2267 | "skipUrlSync": false, 2268 | "sort": 0, 2269 | "tagValuesQuery": "", 2270 | "tags": [], 2271 | "tagsQuery": "", 2272 | "type": 
"query", 2273 | "useTags": false 2274 | }, 2275 | { 2276 | "allValue": null, 2277 | "current": { 2278 | "selected": false, 2279 | "text": "All", 2280 | "value": "$__all" 2281 | }, 2282 | "datasource": "Prometheus", 2283 | "definition": "label_values(hdfs_datanode_capacity{job=~\"$job\", cluster_id=~'$cluster'}, instance)", 2284 | "hide": 0, 2285 | "includeAll": true, 2286 | "index": -1, 2287 | "label": "Instance", 2288 | "multi": true, 2289 | "name": "instance", 2290 | "options": [], 2291 | "query": "label_values(hdfs_datanode_capacity{job=~\"$job\", cluster_id=~'$cluster'}, instance)", 2292 | "refresh": 1, 2293 | "regex": "", 2294 | "skipUrlSync": false, 2295 | "sort": 1, 2296 | "tagValuesQuery": "", 2297 | "tags": [], 2298 | "tagsQuery": "", 2299 | "type": "query", 2300 | "useTags": false 2301 | } 2302 | ] 2303 | }, 2304 | "time": { 2305 | "from": "now-1h", 2306 | "to": "now" 2307 | }, 2308 | "timepicker": { 2309 | "refresh_intervals": [ 2310 | "5s", 2311 | "10s", 2312 | "30s", 2313 | "1m", 2314 | "5m", 2315 | "15m", 2316 | "30m", 2317 | "1h", 2318 | "2h", 2319 | "1d" 2320 | ] 2321 | }, 2322 | "timezone": "", 2323 | "title": "HDFS - DataNode", 2324 | "uid": "mrjPd1eZz", 2325 | "variables": { 2326 | "list": [] 2327 | }, 2328 | "version": 1 2329 | } --------------------------------------------------------------------------------