├── LICENSE ├── README.md ├── common.py ├── cortex-alertmanager.dashboard.py ├── cortex-blocks.dashboard.py ├── cortex-chunks.dashboard.py ├── cortex-ruler.dashboard.py ├── cortex-services-read.dashboard.py └── cortex-services-write.dashboard.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dashboards for managing Cortex 2 | 3 | These are the dashboards used internally at Weaveworks for our 4 | [Cortex](https://github.com/cortexproject/cortex) instances. 5 | 6 | They are written in Python using 7 | [Grafanalib](https://github.com/weaveworks/grafanalib). 8 | 9 | Some aspects are specific to the Weave Cloud infrastructure, e.g. the 10 | use of DynamoDB for storage. 
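
To render one of the `*.dashboard.py` files into Grafana JSON, grafanalib ships a
`generate-dashboard` command (for example `generate-dashboard -o cortex-chunks.json cortex-chunks.dashboard.py`).
The sketch below does the same thing programmatically; it assumes a current grafanalib
release where the JSON encoder lives at `grafanalib._gen.DashboardEncoder`, and the
input/output file names are examples only.

```python
# Minimal sketch: render one dashboard definition to Grafana JSON.
# Run it from the repository root so the dashboard file can `import common`.
import json
import runpy

from grafanalib._gen import DashboardEncoder  # encoder used by grafanalib's own CLI

# Execute the dashboard file and pick up its module-level `dashboard` object,
# which is also what the `generate-dashboard` command looks for.
namespace = runpy.run_path("cortex-chunks.dashboard.py")
dashboard = namespace["dashboard"]

with open("cortex-chunks.json", "w") as fp:
    json.dump(dashboard.to_json_data(), fp, sort_keys=True, indent=2, cls=DashboardEncoder)
```

The resulting JSON can then be imported into Grafana or pushed via its HTTP API.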
11 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | """Common configuration across dashboards. 2 | 3 | Copyright 2019 Weaveworks Inc 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | """ 17 | 18 | import grafanalib.core as G 19 | import grafanalib.weave as W 20 | from grafanalib import prometheus 21 | """A single Y axis of milliseconds. Use for latency graphs.""" 22 | LATENCY_AXES = G.single_y_axis(format=G.MILLISECONDS_FORMAT) 23 | """A single Y axis counting operations. Use for requests per second, etc.""" 24 | OPS_AXIS = G.single_y_axis(format=G.OPS_FORMAT) 25 | """The name of the data source for our Prometheus service.""" 26 | PROMETHEUS = "$datasource" 27 | 28 | QPS_SERIES_OVERRIDES = [ 29 | { 30 | "alias": "/^1../", 31 | "color": W.YELLOW 32 | }, 33 | { 34 | "alias": "/^2../", 35 | "color": W.GREEN 36 | }, 37 | { 38 | "alias": "/^3../", 39 | "color": W.BLUE 40 | }, 41 | { 42 | "alias": "/^4../", 43 | "color": W.ORANGE 44 | }, 45 | { 46 | "alias": "/^5../", 47 | "color": W.RED 48 | }, 49 | { 50 | "alias": "success", 51 | "color": W.GREEN 52 | }, 53 | { 54 | "alias": "error", 55 | "color": W.RED 56 | }, 57 | ] 58 | 59 | 60 | def PromGraph(*args, **kwargs): 61 | """A graph of data from our Prometheus.""" 62 | 63 | kwargs_with_defaults = dict( 64 | tooltip=G.Tooltip(sort=G.SORT_DESC), 65 | nullPointMode=G.NULL_AS_NULL, 66 | ) 67 | kwargs_with_defaults.update(kwargs) 68 | 69 | return prometheus.PromGraph(data_source=PROMETHEUS, *args, **kwargs_with_defaults) 70 | 71 | 72 | def Dashboard(**kwargs): 73 | """Standard Weave Cloud dashboard. 74 | 75 | Automatically sets panel ids and applies events from Weave Cloud as annotations. 
76 | """ 77 | 78 | defaultTemplates = [G.Template( 79 | label="Datasource", 80 | name="datasource", 81 | type="datasource", 82 | query="prometheus", 83 | )] 84 | 85 | if "templating" in kwargs: 86 | extraTemplates = kwargs["templating"].list 87 | else: 88 | extraTemplates = [] 89 | 90 | kwargs["templating"] = G.Templating(list=defaultTemplates + extraTemplates) 91 | 92 | return G.Dashboard( 93 | refresh='1m', # Override the default of 10s 94 | **kwargs 95 | ).auto_panel_ids() 96 | 97 | 98 | def PercentageAxes(label=None, max=1): 99 | """Y axes that show a percentage based on a unit value.""" 100 | return G.single_y_axis( 101 | format=G.PERCENT_UNIT_FORMAT, 102 | label=label, 103 | logBase=1, 104 | max=max, 105 | min=0, 106 | ) 107 | 108 | 109 | def QPSGraph(namespace, graphName, job, metric_root="request", extra_conditions=""): 110 | expr_template = 'rate({ns}_{mroot}_duration_seconds_count{{job="{job}"{extra}}}[1m])' 111 | return StatusQPSGraph( 112 | data_source=PROMETHEUS, 113 | title='{name} QPS'.format(name=graphName), 114 | expression=expr_template.format(ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 115 | ) 116 | 117 | 118 | def StatusQPSGraph(data_source, title, expression, **kwargs): 119 | """Create a graph of QPS, coloured by status code. 120 | 121 | :param title: Title of the graph. 122 | :param expression: Format and PromQL expression; must sum by label 123 | which is http code like 404 or "success" and "error" 124 | :param kwargs: Passed on to Graph. 125 | """ 126 | return W.stacked( 127 | prometheus.PromGraph( 128 | data_source=data_source, 129 | title=title, 130 | expressions=[('{{status_code}}', 'sum by (status_code)(%s)' % (expression))], 131 | seriesOverrides=QPS_SERIES_OVERRIDES, 132 | legend=G.Legend(hideZero=True), 133 | yAxes=[ 134 | G.YAxis(format=G.OPS_FORMAT), 135 | G.YAxis(format=G.SHORT_FORMAT), 136 | ], 137 | **kwargs 138 | ) 139 | ) 140 | 141 | 142 | def LatencyGraph(namespace, graphName, job, rule_root="job:", metric_root="request", extra_conditions=""): 143 | return PromGraph( 144 | title='{name} Latency'.format(name=graphName), 145 | expressions=[ 146 | ( 147 | '99th centile', '{rroot}{ns}_{mroot}_duration_seconds:99quantile{{job="{job}"{extra}}} * 1e3' 148 | .format(rroot=rule_root, ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 149 | ), 150 | ( 151 | '50th centile', '{rroot}{ns}_{mroot}_duration_seconds:50quantile{{job="{job}"{extra}}} * 1e3' 152 | .format(rroot=rule_root, ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 153 | ), 154 | ( 155 | 'Mean', 156 | 'sum(rate({ns}_{mroot}_duration_seconds_sum{{ws="false",job="{job}"{extra}}}[5m])) * 1e3 / sum(rate({ns}_{mroot}_duration_seconds_count{{ws="false",job="{job}"{extra}}}[5m]))' 157 | .format(ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 158 | ), 159 | ], 160 | yAxes=LATENCY_AXES, 161 | ) 162 | 163 | 164 | def REDRow(namespace, graphName, job, rule_root="job:", metric_root="request", extra_conditions="", collapse=False): 165 | return G.Row( 166 | title='%s QPS & Latency' % (graphName, ), 167 | collapse=collapse, 168 | panels=[ 169 | QPSGraph(namespace, graphName, job, metric_root, extra_conditions), 170 | LatencyGraph(namespace, graphName, job, rule_root, metric_root, extra_conditions), 171 | ] 172 | ) 173 | 174 | -------------------------------------------------------------------------------- /cortex-alertmanager.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; 
python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='am', 11 | title="Cortex > Alertmanager", 12 | rows=[ 13 | G.Row( 14 | title='Operations', 15 | panels=[ 16 | common.PromGraph( 17 | title="Alerts", 18 | expressions=[ 19 | ( 20 | "{{instance}} {{status}}", 21 | 'sum by (instance, status)(rate(alertmanager_alerts_received_total{job="cortex/alertmanager"}[2m]))' 22 | ), 23 | ( 24 | "{{instance}} invalid", 25 | 'sum by (instance, status)(rate(alertmanager_alerts_invalid_total{job="cortex/alertmanager"}[2m]))' 26 | ), 27 | ], 28 | yAxes=common.OPS_AXIS, 29 | ), 30 | common.PromGraph( 31 | title="Notifications", 32 | expressions=[ 33 | ( 34 | "{{integration}}", 35 | 'sum by (integration)(rate(alertmanager_notifications_total{job="cortex/alertmanager"}[2m]))' 36 | ), 37 | ], 38 | yAxes=common.OPS_AXIS, 39 | ), 40 | ] 41 | ), 42 | G.Row( 43 | title='Alertmanager fetching configs', 44 | collapse=True, 45 | panels=[ 46 | common.QPSGraph('cortex_configs', 'Configs', 'cortex/alertmanager'), 47 | common.PromGraph( 48 | title="Configs Latency", 49 | expressions=[ 50 | ( 51 | "99th centile", 52 | 'histogram_quantile(0.99, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/alertmanager"}[2m])) by (le)) * 1e3' 53 | ), 54 | ( 55 | "50th centile", 56 | 'histogram_quantile(0.50, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/alertmanager"}[2m])) by (le)) * 1e3' 57 | ), 58 | ( 59 | "Mean", 60 | 'sum(rate(cortex_configs_request_duration_seconds_sum{job="cortex/alertmanager"}[2m])) / sum(rate(cortex_configs_request_duration_seconds_count{job="cortex/alertmanager"}[2m])) * 1e3' 61 | ), 62 | ], 63 | yAxes=common.LATENCY_AXES, 64 | ), 65 | ] 66 | ), 67 | common.REDRow('cortex', 'Alertmanager', 'cortex/alertmanager'), 68 | G.Row( 69 | [ 70 | common.PromGraph( 71 | title="Known Configurations", 72 | expressions=[ 73 | ("{{instance}}", 'cortex_alertmanager_configs_total{job="cortex/alertmanager"}'), 74 | ], 75 | ), 76 | common.PromGraph( 77 | title="Cluster Members", 78 | expressions=[ 79 | ("{{instance}}", 'sum(alertmanager_cluster_members{job="cortex/alertmanager"}) by (instance)'), 80 | ], 81 | ), 82 | ] 83 | ), 84 | ], 85 | ) 86 | -------------------------------------------------------------------------------- /cortex-blocks.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | import grafanalib.weave as W 5 | 6 | import common 7 | 8 | dashboard = common.Dashboard( 9 | uid='cortex-blocks', 10 | title="Cortex > Blocks", 11 | rows=[ 12 | G.Row( 13 | title="Data", 14 | panels=[ 15 | common.PromGraph( 16 | title="Number of series in memory, in ingesters", 17 | expressions=[ 18 | ('', 'sum(cortex_ingester_memory_series{job="cortex/ingester"})'), 19 | ], 20 | ), 21 | common.PromGraph( 22 | title="Head chunks", 23 | expressions=[ 24 | ('{{instance}}', 'cortex_ingester_tsdb_head_chunks'), 25 | ], 26 | ), 27 | ] 28 | ), 29 | G.Row( 30 | title="Resources", 31 | panels=[ 32 | common.PromGraph( 33 | title="Memory Usage", 34 | expressions=[ 35 | ( 36 | '{{pod}}', 37 | 'sum by(pod)(container_memory_usage_bytes{namespace="cortex",container!="POD",container!=""})' 38 | ), 39 | ], 40 | yAxes=[ 41 | G.YAxis(format=G.BYTES_FORMAT), 42 | G.YAxis(format=G.SHORT_FORMAT), 43 | ], 44 | ), 45 | common.PromGraph( 46 | 
title="Disk space usage", 47 | expressions=[ 48 | ( 49 | '{{persistentvolumeclaim}}', 50 | 'kubelet_volume_stats_used_bytes{namespace="cortex"} / kubelet_volume_stats_capacity_bytes{namespace="cortex"}' 51 | ), 52 | ], 53 | yAxes=common.PercentageAxes(), 54 | ), 55 | ], 56 | ), 57 | G.Row( 58 | title="Last runs", 59 | panels=[ 60 | G.SingleStat( 61 | dataSource=common.PROMETHEUS, 62 | title="Last Successful Compactor Run", 63 | targets=[ 64 | G.Target( 65 | '(time()-cortex_compactor_last_successful_run_timestamp_seconds) / 60', 66 | refId='A', 67 | ), 68 | ], 69 | format='m', # TODO: Add 'MINUTES_FORMAT' to grafanalib 70 | ), 71 | G.SingleStat( 72 | dataSource=common.PROMETHEUS, 73 | title="Last Successful Bucket Index Update", 74 | targets=[ 75 | G.Target( 76 | '(time()-max(cortex_bucket_index_last_successful_update_timestamp_seconds)) / 60', 77 | refId='A', 78 | ), 79 | ], 80 | format='m', # TODO: Add 'MINUTES_FORMAT' to grafanalib 81 | ), 82 | ], 83 | ), 84 | G.Row( 85 | title="Block Operations", 86 | panels=[ 87 | common.PromGraph( 88 | title="Rates", 89 | expressions=[ 90 | ('{{component}} loads', 'sum by(component)(rate(cortex_bucket_store_block_loads_total{}[1m]))'), 91 | ( 92 | '{{component}} errors', 93 | 'sum by(component)(rate(cortex_bucket_store_block_load_failures_total{}[1m])) > 0' 94 | ), 95 | ('Uploads', 'sum(rate(cortex_ingester_shipper_uploads_total[5m]))'), 96 | ('Upload errors', 'sum(rate(cortex_ingester_shipper_upload_failures_total[5m]))'), 97 | ('Dir syncs', 'sum(rate(cortex_ingester_shipper_dir_syncs_total[5m]))'), 98 | ('Dir sync errors', 'sum(rate(cortex_ingester_shipper_dir_sync_failures_total[5m]))'), 99 | ], 100 | yAxes=[ 101 | G.YAxis(format=G.OPS_FORMAT), 102 | G.YAxis(format=G.SHORT_FORMAT), 103 | ], 104 | ), 105 | common.PromGraph( 106 | title="Latency", 107 | expressions=[ 108 | ( 109 | '99th centile', 110 | 'histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{operation="upload"}[5m])) by (le))' 111 | ), 112 | ( 113 | '50th centile', 114 | 'histogram_quantile(0.5, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{operation="upload"}[5m])) by (le))' 115 | ), 116 | ( 117 | 'Mean', 118 | 'sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{operation="upload"}[5m])) / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{operation="upload"}[5m]))' 119 | ), 120 | ], 121 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 122 | ), 123 | ], 124 | ), 125 | G.Row( 126 | title="Compactions", 127 | panels=[ 128 | common.PromGraph( 129 | title="Operations", 130 | expressions=[ 131 | ('Compactions', 'sum(rate(cortex_ingester_tsdb_compactions_total[5m]))'), 132 | ('errors', 'sum(rate(cortex_ingester_tsdb_compactions_failed_total[5m]))'), 133 | ], 134 | ), 135 | common.PromGraph( 136 | title="Latency", 137 | expressions=[ 138 | ( 139 | '99th centile', 140 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{}[5m])) by (le))' 141 | ), 142 | ( 143 | '50th centile', 144 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{}[5m])) by (le))' 145 | ), 146 | ( 147 | 'Mean', 148 | 'sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_sum{}[5m])) / sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_count{}[5m]))' 149 | ), 150 | ], 151 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 152 | ), 153 | ], 154 | ), 155 | G.Row( 156 | title="WAL", 157 | panels=[ 158 | common.PromGraph( 159 | title="Operations", 160 | 
expressions=[ 161 | ('Truncations', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total[5m]))'), 162 | ('Truncation errors', 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]))'), 163 | ('Checkpoint', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total[5m]))'), 164 | ('Checkpoint errors', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]))'), 165 | ('WAL corruptions', 'sum(rate(cortex_ingester_wal_corruptions_total[5m]))'), 166 | ('mmap corruptions', 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total[5m]))'), 167 | ], 168 | ), 169 | common.PromGraph( 170 | title="Latency", 171 | expressions=[ 172 | ( 173 | '99th centile', 174 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_bucket{}[5m])) by (le))' 175 | ), 176 | ( 177 | '50th centile', 178 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_bucket{}[5m])) by (le))' 179 | ), 180 | ( 181 | 'Mean', 182 | 'sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{}[5m])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{}[5m]))' 183 | ), 184 | ], 185 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 186 | ), 187 | ], 188 | ), 189 | G.Row( 190 | title="Bucket Operations", 191 | panels=[ 192 | common.PromGraph( 193 | title="Operations", 194 | expressions=[ 195 | ( 196 | '{{component}}-{{operation}}', 197 | 'sum by(component,operation) (rate(thanos_objstore_bucket_operations_total[5m]))' 198 | ), 199 | ( 200 | 'errors {{component}}-{{operation}}', 201 | 'sum by(component,operation) (rate(thanos_objstore_bucket_operation_failures_total[5m]))' 202 | ), 203 | ], 204 | ), 205 | common.PromGraph( 206 | title="99% Latency", 207 | expressions=[ 208 | ( 209 | '{{component}}-{{operation}}', 210 | 'histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket[5m])) by (le, component, operation)) > 0' 211 | ), 212 | ], 213 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 214 | ), 215 | ], 216 | ), 217 | G.Row( 218 | title="Ring", 219 | collapse=True, 220 | panels=[ 221 | W.stacked( 222 | common.PromGraph( 223 | title="Ingester Ring Ownership", 224 | expressions=[ 225 | ( 226 | '{{ingester}}', 227 | 'max(cortex_ring_ingester_ownership_percent{job="cortex/distributor"}) by (ingester) or label_replace(max(cortex_ring_member_ownership_percent{job="cortex/distributor"}) by (member), "ingester", "$1", "member", "(.*)")' 228 | ), 229 | ], 230 | # Show y-axis slightly above 100% in case series overlap 231 | yAxes=common.PercentageAxes(max=1.2), 232 | ) 233 | ), 234 | W.stacked( 235 | common.PromGraph( 236 | title="Ingesters In Ring", 237 | expressions=[ 238 | ( 239 | '{{state}}', 240 | 'max(cortex_ring_ingesters{job="cortex/distributor"}) by (state) or max(cortex_ring_members{job="cortex/distributor"}) by (state)' 241 | ), 242 | ], 243 | yAxes=[ 244 | G.YAxis(format=G.NO_FORMAT), 245 | G.YAxis(format=G.SHORT_FORMAT), 246 | ], 247 | ) 248 | ), 249 | ] 250 | ), 251 | ], 252 | ) 253 | -------------------------------------------------------------------------------- /cortex-chunks.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent: 2; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | import grafanalib.weave as W 5 | 6 | import sys, os 7 | sys.path.append(os.path.dirname(__file__)) 8 | import common 9 | 10 | dashboard = common.Dashboard( 11 | uid='chunks', 12 | title="Cortex > Chunks", 13 | rows=[ 
14 | G.Row( 15 | panels=[ 16 | common.PromGraph( 17 | title="Number of chunks (in memory, in ingesters)", 18 | expressions=[ 19 | ('', 'sum(cortex_ingester_memory_chunks{job="cortex/ingester"})'), 20 | ], 21 | ), 22 | common.PromGraph( 23 | title="Chunks per series", 24 | expressions=[ 25 | ( 26 | '', 27 | 'sum(cortex_ingester_memory_chunks{job="cortex/ingester"}) / sum(cortex_ingester_memory_series{job="cortex/ingester"})' 28 | ), 29 | ], 30 | ), 31 | ] 32 | ), 33 | G.Row( 34 | panels=[ 35 | common.PromGraph( 36 | title="Chunk Size Bytes (on flush)", 37 | expressions=[ 38 | ( 39 | "99th Percentile", 40 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' 41 | ), 42 | ( 43 | "50th Percentile", 44 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' 45 | ), 46 | ( 47 | "10th Percentile", 48 | 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' 49 | ), 50 | ( 51 | "Mean", 52 | 'sum(rate(cortex_ingester_chunk_size_bytes_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_size_bytes_count{job="cortex/ingester"}[2m]))' 53 | ), 54 | ], 55 | yAxes=[ 56 | G.YAxis(format=G.BYTES_FORMAT), 57 | G.YAxis(format=G.SHORT_FORMAT), 58 | ], 59 | ), 60 | common.PromGraph( 61 | title="Chunk Age (on flush)", 62 | expressions=[ 63 | ( 64 | "99th Percentile", 65 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' 66 | ), 67 | ( 68 | "50th Percentile", 69 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' 70 | ), 71 | ( 72 | "10th Percentile", 73 | 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' 74 | ), 75 | ( 76 | "Mean", 77 | 'sum(rate(cortex_ingester_chunk_age_seconds_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_age_seconds_count{job="cortex/ingester"}[2m]))' 78 | ), 79 | ], 80 | yAxes=[ 81 | G.YAxis(format=G.DURATION_FORMAT), 82 | G.YAxis(format=G.SHORT_FORMAT), 83 | ], 84 | ), 85 | common.PromGraph( 86 | title="Chunk Length (on flush)", 87 | expressions=[ 88 | ( 89 | "99th Percentile", 90 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' 91 | ), 92 | ( 93 | "50th Percentile", 94 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' 95 | ), 96 | ( 97 | "10th Percentile", 98 | 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' 99 | ), 100 | ( 101 | "Mean", 102 | 'sum(rate(cortex_ingester_chunk_length_sum{job=\"cortex/ingester\"}[2m])) / sum(rate(cortex_ingester_chunk_length_count{job=\"cortex/ingester\"}[2m]))' 103 | ), 104 | ], 105 | ), 106 | ] 107 | ), 108 | G.Row( 109 | panels=[ 110 | W.stacked( 111 | common.PromGraph( 112 | title="Series Flush Queue Length", 113 | expressions=[ 114 | ("{{instance}}", 'cortex_ingester_flush_queue_length{job="cortex/ingester"}'), 115 | ], 116 | ) 117 | ), 118 | W.stacked( 119 | common.PromGraph( 120 | title="Chunk Flush Rate (rate[1m])", 121 | expressions=[ 122 | # This is the rate at which chunks are added to the flush queue 123 | ( 124 | "{{reason}}", 125 | 'sum by (reason)(rate(cortex_ingester_flush_reasons[1m]) or rate(cortex_ingester_series_flushed_total[1m]) or 
rate(cortex_ingester_flushing_enqueued_series_total[1m]))' 126 | ), 127 | # This is the rate at which chunks are removed from the flush queue 128 | ( 129 | "Flushed", 130 | 'sum(rate(cortex_ingester_chunks_stored_total[1m]) or rate(cortex_chunk_store_stored_chunks_total[1m]))' 131 | ), 132 | # Chunks dropped for being too small 133 | ("Dropped", 'sum(rate(cortex_ingester_dropped_chunks_total[1m]))'), 134 | ], 135 | # Show flush and dropped rates as a line overlayed on enqueue rates, not stacked and not filled 136 | seriesOverrides=[ 137 | { 138 | "alias": "Flushed", 139 | "fill": 1, 140 | "linewidth": 1, 141 | "stack": False 142 | }, { 143 | "alias": "Dropped", 144 | "fill": 1, 145 | "linewidth": 1, 146 | "stack": False 147 | } 148 | ], 149 | ) 150 | ), 151 | ] 152 | ), 153 | G.Row( 154 | title="DynamoDB", 155 | collapse=True, 156 | panels=[ 157 | common.PromGraph( 158 | title="DynamoDB write capacity consumed [rate1m]", 159 | expressions=[ 160 | ( 161 | '{{table}} consumed', 162 | 'sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0' 163 | ), 164 | ( 165 | '{{table}} provisioned', 166 | 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 167 | ), 168 | ( 169 | '{{table}} provisioned', 170 | 'max(cortex_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 171 | ), 172 | ], 173 | yAxes=common.OPS_AXIS, 174 | ), 175 | common.PromGraph( 176 | title="DynamoDB write errors", 177 | expressions=[ 178 | ( 179 | '{{table}} - {{error}}', 180 | 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 181 | ), 182 | ( 183 | '{{table}} - Throttled', 184 | 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 185 | ), 186 | ], 187 | yAxes=common.OPS_AXIS, 188 | ), 189 | ], 190 | ), 191 | G.Row( 192 | title="Ring", 193 | collapse=True, 194 | panels=[ 195 | W.stacked( 196 | common.PromGraph( 197 | title="Ingester Ring Ownership", 198 | expressions=[ 199 | ( 200 | '{{ingester}}', 201 | 'max(cortex_ring_ingester_ownership_percent{job="cortex/distributor"}) by (ingester) or label_replace(max(cortex_ring_member_ownership_percent{job="cortex/distributor"}) by (member), "ingester", "$1", "member", "(.*)")' 202 | ), 203 | ], 204 | # Show y-axis slightly above 100% in case series overlap 205 | yAxes=common.PercentageAxes(max=1.2), 206 | ) 207 | ), 208 | W.stacked( 209 | common.PromGraph( 210 | title="Ingesters In Ring", 211 | expressions=[ 212 | ( 213 | '{{state}}', 214 | 'max(cortex_ring_ingesters{job="cortex/distributor"}) by (state) or max(cortex_ring_members{job="cortex/distributor"}) by (state)' 215 | ), 216 | ], 217 | yAxes=[ 218 | G.YAxis(format=G.NO_FORMAT), 219 | G.YAxis(format=G.SHORT_FORMAT), 220 | ], 221 | ) 222 | ), 223 | ] 224 | ), 225 | G.Row( 226 | title="Index and Cache", 227 | panels=[ 228 | common.PromGraph( 229 | title="Index entries per chunk", 230 | expressions=[ 231 | ( 232 | '', 233 | 'sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{job="cortex/ingester"}[2m]))' 234 | ), 235 | ], 236 | ), 237 | common.PromGraph( 238 | title="Ingester hit rate", 239 | expressions=[ 240 | ( 241 | '{{name}}', 242 | 'sum(rate(cortex_cache_hits{job="cortex/ingester"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/ingester"}[2m])) by (name)' 243 | ), 244 | ], 
245 | yAxes=common.PercentageAxes(), 246 | ), 247 | ] 248 | ), 249 | ] 250 | ) 251 | -------------------------------------------------------------------------------- /cortex-ruler.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='ruler', 11 | title="Cortex > Recording Rules", 12 | rows=[ 13 | G.Row( 14 | title="Configs", 15 | collapse=True, 16 | panels=[ 17 | common.PromGraph( 18 | title="Known Configurations", 19 | expressions=[ 20 | ("Configurations", 'max(cortex_configs{job="cortex/ruler"})'), 21 | ("{{status}}", 'max by(status)(cortex_alertmanager_configs{job="cortex/alertmanager"})'), 22 | ], 23 | ), 24 | common.QPSGraph('cortex_configs', 'Configs', 'cortex/ruler'), 25 | common.PromGraph( 26 | title="Configs Latency", 27 | expressions=[ 28 | ( 29 | "99th centile", 30 | 'histogram_quantile(0.99, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 31 | ), 32 | ( 33 | "50th centile", 34 | 'histogram_quantile(0.50, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 35 | ), 36 | ( 37 | "Mean", 38 | 'sum(rate(cortex_configs_request_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_configs_request_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' 39 | ), 40 | ], 41 | yAxes=common.LATENCY_AXES, 42 | ), 43 | ] 44 | ), 45 | common.REDRow('cortex', 'Ruler service', 'cortex/ruler', collapse=True), 46 | G.Row( 47 | [ 48 | common.PromGraph( 49 | title="Group Evaluations per Second", 50 | expressions=[ 51 | ( 52 | "Groups per second", 53 | 'sum(rate(cortex_group_evaluation_duration_seconds_count{job="cortex/ruler"}[1m]))' 54 | ), ( 55 | "Groups per second", 56 | 'sum(rate(cortex_prometheus_rule_group_duration_seconds_count{job="cortex/ruler"}[1m]))' 57 | ) 58 | ], 59 | yAxes=common.OPS_AXIS, 60 | ), 61 | common.PromGraph( 62 | title="Group Evaluation Durations", 63 | expressions=[ 64 | ( 65 | "99th centile", 66 | 'histogram_quantile(0.99, sum(rate(cortex_group_evaluation_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 67 | ), 68 | ( 69 | "50th centile", 70 | 'histogram_quantile(0.50, sum(rate(cortex_group_evaluation_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 71 | ), 72 | ( 73 | "Mean", 74 | 'sum(rate(cortex_group_evaluation_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_group_evaluation_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' 75 | ), 76 | ("Mean", 'avg(cortex_prometheus_rule_group_last_duration_seconds)*1e3'), 77 | ( 78 | "{{rule_group}}", 79 | 'max by (rule_group)(cortex_prometheus_rule_group_last_duration_seconds)*1e3 > 500' 80 | ), 81 | ], 82 | yAxes=common.LATENCY_AXES, 83 | ), 84 | common.PromGraph( 85 | title="Group Evaluation Latency", 86 | expressions=[ 87 | ( 88 | "99th centile", 89 | 'histogram_quantile(0.99, sum(rate(cortex_group_evaluation_latency_seconds_bucket[2m])) by (le)) * 1e3' 90 | ), 91 | ( 92 | "50th centile", 93 | 'histogram_quantile(0.50, sum(rate(cortex_group_evaluation_latency_seconds_bucket[2m])) by (le)) * 1e3' 94 | ), 95 | ( 96 | "Mean", 97 | 'sum(rate(cortex_group_evaluation_latency_seconds_sum[2m])) / sum(rate(cortex_group_evaluation_latency_seconds_count[2m])) * 1e3' 98 | ), 99 | ("Mean", 
'avg(time()-(cortex_prometheus_rule_group_last_evaluation_timestamp_seconds>0))*1000'), 100 | ("Max", 'max(time()-(cortex_prometheus_rule_group_last_evaluation_timestamp_seconds>0))*1e3'), 101 | ], 102 | yAxes=common.LATENCY_AXES, 103 | ), 104 | ] 105 | ), 106 | G.Row( 107 | title="Ingester Queries", 108 | panels=[ 109 | common.QPSGraph('cortex_distributor', 'Ingester Query', 'cortex/ruler', metric_root="query"), 110 | common.PromGraph( 111 | title="Ingester Query Latency", 112 | expressions=[ 113 | ( 114 | "99th centile", 115 | 'histogram_quantile(0.99, sum(rate(cortex_distributor_query_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 116 | ), 117 | ( 118 | "50th centile", 119 | 'histogram_quantile(0.50, sum(rate(cortex_distributor_query_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 120 | ), 121 | ( 122 | "Mean", 123 | 'sum(rate(cortex_distributor_query_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_distributor_query_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' 124 | ), 125 | ], 126 | yAxes=common.LATENCY_AXES, 127 | ), 128 | ] 129 | ), 130 | G.Row( 131 | title="Ingester Push", 132 | panels=[ 133 | common.StatusQPSGraph( 134 | common.PROMETHEUS, "Ingester Push", 135 | 'rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/ruler",operation="/cortex.Ingester/Push"}[1m])' 136 | ), 137 | common.PromGraph( 138 | title="Ingester Push Latency", 139 | expressions=[ 140 | ( 141 | "99.7th centile", 142 | 'histogram_quantile(0.997, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 143 | ), 144 | ( 145 | "50th centile", 146 | 'histogram_quantile(0.50, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 147 | ), 148 | ( 149 | "Mean", 150 | 'sum(rate(cortex_ingester_client_request_duration_seconds_sum{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) / sum(rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) * 1e3' 151 | ), 152 | ], 153 | yAxes=common.LATENCY_AXES, 154 | ), 155 | ] 156 | ), 157 | G.Row( 158 | [ 159 | common.PromGraph( 160 | title="Rules per Second", 161 | expressions=[ 162 | ("Rules", 'sum(rate(cortex_rules_processed_total{job="cortex/ruler"}[1m]))'), 163 | ("Rules/sec", 'sum(rate(cortex_prometheus_rule_evaluations_total{job="cortex/ruler"}[1m]))'), 164 | ], 165 | yAxes=common.OPS_AXIS, 166 | ), 167 | common.PromGraph( 168 | title="Ruler DynamoDB errors", 169 | expressions=[ 170 | ( 171 | '{{table}} - {{error}}', 172 | 'sum(rate(cortex_dynamo_failures_total{job="cortex/ruler"}[1m])) by (error, table) > 0' 173 | ), 174 | ], 175 | yAxes=common.OPS_AXIS, 176 | ), 177 | ] 178 | ), 179 | G.Row( 180 | title="Memcache", 181 | panels=[ 182 | common.StatusQPSGraph( 183 | common.PROMETHEUS, "Memcache read QPS", 184 | 'sum by (job,status_code)(rate(cortex_memcache_request_duration_seconds_count{method="Memcache.GetMulti", job="cortex/ruler"}[1m]))' 185 | ), 186 | common.PromGraph( 187 | title="Memcache read latency", 188 | expressions=[ 189 | ( 190 | '99th centile', 191 | 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) by (le)) * 1e3' 192 | ), 193 | ( 194 | '50th centile', 195 | 'histogram_quantile(0.5, 
sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) by (le)) * 1e3' 196 | ), 197 | ( 198 | 'Mean', 199 | 'sum(rate(cortex_memcache_request_duration_seconds_sum{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) * 1e3 / sum(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ruler",method="Memcache.GetMulti"}[2m]))' 200 | ), 201 | ], 202 | yAxes=common.LATENCY_AXES, 203 | ), 204 | ], 205 | ), 206 | ], 207 | ) 208 | -------------------------------------------------------------------------------- /cortex-services-read.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='reads', 11 | title="Cortex > Services (Reads)", 12 | rows=[ 13 | common.REDRow( 14 | 'cortex', 15 | 'Query Frontend read', 16 | 'cortex/query-frontend', 17 | rule_root="job_route:", 18 | extra_conditions=",route=\"api_prom_api_v1_query_range\"" 19 | ), 20 | common.REDRow('cortex', 'Querier read', 'cortex/querier'), 21 | G.Row( 22 | title="Ingester", 23 | panels=[ 24 | common.PromGraph( 25 | title="Ingester read QPS", 26 | expressions=[ 27 | ( 28 | '{{route}}: {{status_code}}', 29 | 'sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route!="/cortex.Ingester/Push"}[1m])) by (route, status_code)' 30 | ), 31 | ], 32 | yAxes=common.OPS_AXIS, 33 | ), 34 | common.PromGraph( 35 | title="Ingester read latency", 36 | expressions=[ 37 | ( 38 | '{{route}}: 99th centile', 39 | 'job_route:cortex_request_duration_seconds:99quantile{job="cortex/ingester", route!="/cortex.Ingester/Push"} * 1e3' 40 | ), 41 | ( 42 | '{{route}}: 50th centile', 43 | 'job_route:cortex_request_duration_seconds:50quantile{job="cortex/ingester", route!="/cortex.Ingester/Push"} * 1e3' 44 | ), 45 | ( 46 | '{{route}}: Mean', 47 | 'sum(rate(cortex_request_duration_seconds_sum{job="cortex/ingester", route!="/cortex.Ingester/Push"}[2m])) by (route) * 1e3 / sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route!="/cortex.Ingester/Push"}[2m])) by (route)' 48 | ), 49 | ], 50 | yAxes=common.LATENCY_AXES, 51 | ), 52 | ], 53 | ), 54 | G.Row( 55 | title="DynamoDB", 56 | panels=[ 57 | common.PromGraph( 58 | title="DynamoDB read QPS", 59 | expressions=[ 60 | ( 61 | 'QueryPages {{job}}: {{status_code}}', 62 | 'sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[1m])) by (job, status_code)' 63 | ), 64 | ], 65 | ), 66 | common.PromGraph( 67 | title="DynamoDB read latency", 68 | expressions=[ 69 | ( 70 | 'QueryPages: 99th', 71 | 'histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) by (le)) * 1e3' 72 | ), ( 73 | 'QueryPages: 50th', 74 | 'histogram_quantile(0.5, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) by (le)) * 1e3' 75 | ), ( 76 | 'QueryPages: Mean', 77 | 'sum(rate(cortex_dynamo_request_duration_seconds_sum{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) * 1e3 / sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m]))' 78 | ) 79 | ], 80 | yAxes=common.LATENCY_AXES, 81 | ), 82 | ], 83 | ), 84 | G.Row( 85 | title="DynamoDB", 86 | panels=[ 87 | 
common.PromGraph( 88 | title="DynamoDB read capacity consumed [rate1m]", 89 | expressions=[ 90 | ( 91 | '{{table}} consumed', 92 | 'sum(rate(cortex_dynamo_consumed_capacity_total{job=~"cortex/.*",operation!~".*Write.*"}[1m])) by (table) > 0' 93 | ), 94 | ( 95 | '{{table}} provisioned', 96 | 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="read"}) by (table) > 0' 97 | ), 98 | ( 99 | '{{table}} provisioned', 100 | 'max(cortex_table_capacity_units{job="cortex/table-manager", op="read"}) by (table) > 0' 101 | ), 102 | ], 103 | yAxes=common.OPS_AXIS, 104 | ), 105 | common.PromGraph( 106 | title="DynamoDB read errors", 107 | expressions=[ 108 | ( 109 | '{{job}} - {{table}} - {{error}}', 110 | 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation!~".*Write.*"}[1m])) by (job, error, table) > 0' 111 | ), 112 | ( 113 | '{{job}} - {{table}} - Throttled', 114 | 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation!~".*Write.*"}[1m])) by (job, error, table) > 0' 115 | ), 116 | ], 117 | yAxes=common.OPS_AXIS, 118 | ), 119 | ], 120 | ), 121 | G.Row( 122 | title="Memcache (blocks)", 123 | panels=[ 124 | common.PromGraph( 125 | title="Memcache read QPS (blocks)", 126 | expressions=[ 127 | ( 128 | '{{name}} {{operation}}', 129 | 'sum(rate(thanos_memcached_operation_duration_seconds_count{kubernetes_namespace="cortex"}[1m])) by (name, operation)' 130 | ), 131 | ( 132 | '{{name}} {{operation}} {{reason}}', 133 | 'sum(rate(thanos_memcached_operation_failures_total{kubernetes_namespace="cortex"}[1m])) by (name, operation, reason) > 0' 134 | ), 135 | ], 136 | yAxes=G.single_y_axis(format=G.OPS_FORMAT), 137 | ), 138 | common.PromGraph( 139 | title="Memcache read latency (blocks)", 140 | expressions=[ 141 | ( 142 | '99% {{name}}', 143 | 'histogram_quantile(0.99, sum(rate(thanos_memcached_operation_duration_seconds_bucket{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m])) by (le, name))' 144 | ), 145 | ( 146 | 'Mean', 147 | 'sum(rate(thanos_memcached_operation_duration_seconds_sum{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m])) / sum(rate(thanos_memcached_operation_duration_seconds_count{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m]))' 148 | ), 149 | ], 150 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 151 | ), 152 | ], 153 | ), 154 | G.Row( 155 | title="Memcache (chunks)", 156 | panels=[ 157 | common.StatusQPSGraph( 158 | common.PROMETHEUS, "Memcache read QPS (chunks)", 159 | 'sum by (job,status_code)(rate(cortex_memcache_request_duration_seconds_count{method="Memcache.GetMulti", job=~"cortex/querier|cortex/query-frontend"}[1m]))' 160 | ), 161 | common.PromGraph( 162 | title="Memcache read latency (chunks)", 163 | expressions=[ 164 | ( 165 | '99% {{name}}', 166 | 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m])) by (le, name))' 167 | ), 168 | ( 169 | 'Mean', 170 | 'sum(rate(cortex_memcache_request_duration_seconds_sum{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m])) / sum(rate(cortex_memcache_request_duration_seconds_count{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m]))' 171 | ), 172 | ], 173 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 174 | ), 175 | ], 176 | ), 177 | G.Row( 178 | title="Cache", 179 | panels=[ 180 | common.PromGraph( 181 | title="Querier Cache hit rate", 182 | expressions=[ 183 | ( 184 | '{{name}}', 185 | 
'sum(rate(cortex_cache_hits{job="cortex/querier"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/querier"}[2m])) by (name)' 186 | ), 187 | ], 188 | yAxes=common.PercentageAxes(), 189 | ), 190 | common.PromGraph( 191 | title="Query-frontend cache hit rate", 192 | expressions=[ 193 | ( 194 | '{{name}}', 195 | 'sum(rate(cortex_cache_hits{job="cortex/query-frontend"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/query-frontend"}[2m])) by (name)' 196 | ), 197 | ], 198 | yAxes=common.PercentageAxes(), 199 | ), 200 | ], 201 | ), 202 | G.Row( 203 | title="S3", 204 | collapse=True, 205 | panels=[ 206 | common.StatusQPSGraph( 207 | common.PROMETHEUS, "S3 read QPS", 208 | 'rate(cortex_s3_request_duration_seconds_count{operation="S3.GetObject", job=~"cortex/.*"}[1m])' 209 | ), 210 | common.PromGraph( 211 | title="S3 read latency", 212 | expressions=[ 213 | ( 214 | '99th centile', 215 | 'histogram_quantile(0.99, sum(rate(cortex_s3_request_duration_seconds_bucket{job=~"cortex/.*", operation="S3.GetObject"}[2m])) by (le)) * 1e3' 216 | ), 217 | ( 218 | '50th centile', 219 | 'histogram_quantile(0.5, sum(rate(cortex_s3_request_duration_seconds_bucket{job=~"cortex/.*", operation="S3.GetObject"}[2m])) by (le)) * 1e3' 220 | ), 221 | ( 222 | 'Mean', 223 | 'sum(rate(cortex_s3_request_duration_seconds_sum{job=~"cortex/.*", operation="S3.PutObject"}[2m])) * 1e3/ sum(rate(cortex_s3_request_duration_seconds_count{job=~"cortex/.*", operation="S3.GetObject"}[2m]))' 224 | ), 225 | ], 226 | yAxes=common.LATENCY_AXES, 227 | ), 228 | ], 229 | ), 230 | ], 231 | ) 232 | -------------------------------------------------------------------------------- /cortex-services-write.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='writes', 11 | title="Cortex > Services (Writes)", 12 | rows=[ 13 | G.Row( 14 | title="Retrieval Stats", 15 | collapse=True, 16 | panels=[ 17 | common.PromGraph( 18 | title="Retrieval sent batches", 19 | expressions=[ 20 | ( 21 | '{{url}}', 22 | 'sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count[1m])) by (url)' 23 | ), 24 | ], 25 | ), 26 | common.PromGraph( 27 | title="Retrieval batch latency", 28 | expressions=[ 29 | ( 30 | '{{url}} 99th', 31 | 'histogram_quantile(0.99, sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket[2m])) by (url, le)) * 1e3' 32 | ), 33 | ( 34 | '{{url}} 50th', 35 | 'histogram_quantile(0.50, sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket[2m])) by (url, le)) * 1e3' 36 | ), 37 | ( 38 | '{{url}} mean', 39 | '(sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum[2m])) by (url) / sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count[2m])) by (url)) * 1e3' 40 | ), 41 | ], 42 | yAxes=common.LATENCY_AXES, 43 | ), 44 | common.PromGraph( 45 | title="Retrieval sent samples", 46 | expressions=[ 47 | ( 48 | '{{url}} success', 49 | 'sum(rate(prometheus_remote_storage_succeeded_samples_total[1m])) by (url)' 50 | ), 51 | ('{{url}} dropped', 'sum(rate(prometheus_remote_storage_dropped_samples_total[1m])) by (url)'), 52 | ('{{url}} retried', 'sum(rate(prometheus_remote_storage_retried_samples_total[1m])) by (url)'), 53 | ('{{url}} failure', 'sum(rate(prometheus_remote_storage_failed_samples_total[1m])) by (url)'), 
54 | ], 55 | ), 56 | common.PromGraph( 57 | title="Queue", 58 | expressions=[ 59 | ('{{url}}: queue length', 'sum(prometheus_remote_storage_pending_samples) by (url)'), 60 | ( 61 | '{{url}}: lag', 62 | 'max(time()-prometheus_remote_storage_queue_highest_sent_timestamp_seconds) by (url)' 63 | ), 64 | ('{{url}}: shards', 'max(prometheus_remote_storage_shards) by (url)'), 65 | ], 66 | ), 67 | ], 68 | ), 69 | G.Row( 70 | title="Distributor", 71 | panels=[ 72 | common.StatusQPSGraph( 73 | common.PROMETHEUS, "Distributor write QPS", 74 | 'rate(cortex_request_duration_seconds_count{job="cortex/distributor"}[1m])' 75 | ), 76 | common.LatencyGraph("cortex", "Distributor Write", "cortex/distributor"), 77 | ], 78 | ), 79 | G.Row( 80 | title="Distributor breakdown", 81 | collapse=True, 82 | panels=[ 83 | common.PromGraph( 84 | title="Distributor Error Rate", 85 | expressions=[ 86 | ( 87 | '{{instance}}', 88 | 'sum by (instance)(rate(cortex_request_duration_seconds_count{job="cortex/distributor", status_code =~ "5.."}[1m]))' 89 | ), 90 | ], 91 | ), 92 | common.PromGraph( 93 | title="Distributor write latency", 94 | expressions=[ 95 | ( 96 | '99th centile {{instance}}', 97 | 'histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{job="cortex/distributor"}[2m])) by (instance,le)) * 1e3' 98 | ), 99 | ], 100 | yAxes=common.LATENCY_AXES, 101 | ), 102 | ], 103 | ), 104 | G.Row( 105 | title="Distributor sends", 106 | collapse=True, 107 | panels=[ 108 | common.StatusQPSGraph( 109 | common.PROMETHEUS, "Distributor send QPS", 110 | 'rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/distributor",operation="/cortex.Ingester/Push"}[1m])' 111 | ), 112 | common.PromGraph( 113 | title="Distributor send latency", 114 | expressions=[ 115 | ( 116 | '99th centile', 117 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 118 | ), 119 | ( 120 | '50th centile', 121 | 'histogram_quantile(0.50, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 122 | ), 123 | ( 124 | 'Mean', 125 | 'sum(rate(cortex_ingester_client_request_duration_seconds_sum{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) * 1e3 / sum(rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m]))' 126 | ), 127 | ], 128 | yAxes=common.LATENCY_AXES, 129 | ), 130 | ], 131 | ), 132 | G.Row( 133 | title="Samples", 134 | collapse=True, 135 | panels=[ 136 | common.PromGraph( 137 | title="Push sample ingest rate by instance (>1%)", 138 | expressions=[ 139 | ( 140 | '{{user}}', 141 | 'sum by (user)(rate(cortex_distributor_received_samples_total{job="cortex/distributor"}[1m])) > ignoring(user) group_left() (sum(rate(cortex_distributor_received_samples_total{job="cortex/distributor"}[1m]))/100)' 142 | ), 143 | ], 144 | legend=G.Legend(show=False), 145 | yAxes=common.OPS_AXIS, 146 | ), 147 | common.PromGraph( 148 | title="Rule sample ingest rate by instance", 149 | expressions=[ 150 | ( 151 | '{{user}}', 152 | # '> 1' is to exclude instances which are not connected and simply alerting on absent metrics 153 | 'sum by (user)(rate(cortex_distributor_received_samples_total{job="cortex/ruler"}[1m])) > 1' 154 | ), 155 | ], 156 | legend=G.Legend(show=False), 157 | yAxes=common.OPS_AXIS, 158 | ), 159 | common.PromGraph( 160 | title="Sample discard 
rate by instance ID & reason", 161 | expressions=[ 162 | ( 163 | '{{user}} - {{reason}} ', 164 | 'sum by (user, reason) (rate(cortex_discarded_samples_total{reason!="duplicate-sample"}[1m])) > 0' 165 | ), 166 | ], 167 | yAxes=common.OPS_AXIS, 168 | ), 169 | ], 170 | ), 171 | G.Row( 172 | title="Ingester", 173 | panels=[ 174 | common.StatusQPSGraph( 175 | common.PROMETHEUS, "Ingester write QPS", 176 | 'rate(cortex_request_duration_seconds_count{job="cortex/ingester"}[1m])' 177 | ), 178 | common.PromGraph( 179 | title="Ingester write latency", 180 | expressions=[ 181 | ( 182 | '99th centile', 183 | 'job_route:cortex_request_duration_seconds:99quantile{job="cortex/ingester", route="/cortex.Ingester/Push"} * 1e3' 184 | ), 185 | ( 186 | '50th centile', 187 | 'job_route:cortex_request_duration_seconds:50quantile{job="cortex/ingester", route="/cortex.Ingester/Push"} * 1e3' 188 | ), 189 | ( 190 | 'Mean', 191 | 'sum(rate(cortex_request_duration_seconds_sum{job="cortex/ingester", route="/cortex.Ingester/Push"}[2m])) * 1e3 / sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route="/cortex.Ingester/Push"}[2m]))' 192 | ), 193 | ], 194 | yAxes=common.LATENCY_AXES, 195 | ), 196 | ], 197 | ), 198 | G.Row( 199 | title="DynamoDB", 200 | panels=[ 201 | common.PromGraph( 202 | title="DynamoDB write QPS", 203 | expressions=[ 204 | ( 205 | 'BatchWriteItem {{job}}: {{status_code}}', 206 | 'sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[1m])) by (job, status_code)' 207 | ), 208 | ], 209 | ), 210 | common.PromGraph( 211 | title="DynamoDB write latency", 212 | expressions=[ 213 | ( 214 | 'BatchWriteItem: 99th', 215 | 'histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) by (le)) * 1e3' 216 | ), ( 217 | 'BatchWriteItem: 50th', 218 | 'histogram_quantile(0.5, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) by (le)) * 1e3' 219 | ), ( 220 | 'BatchWriteItem: Mean', 221 | 'sum(rate(cortex_dynamo_request_duration_seconds_sum{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) * 1e3 / sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m]))' 222 | ) 223 | ], 224 | yAxes=common.LATENCY_AXES, 225 | ), 226 | ], 227 | ), 228 | G.Row( 229 | title="DynamoDB", 230 | panels=[ 231 | common.PromGraph( 232 | title="DynamoDB write capacity consumed [rate1m]", 233 | expressions=[ 234 | ( 235 | '{{table}} consumed', 236 | 'sum(rate(cortex_dynamo_consumed_capacity_total{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0' 237 | ), 238 | ( 239 | '{{table}} provisioned', 240 | 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 241 | ), 242 | ( 243 | '{{table}} provisioned', 244 | 'max(cortex_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 245 | ), 246 | ], 247 | yAxes=common.OPS_AXIS, 248 | ), 249 | common.PromGraph( 250 | title="DynamoDB write errors", 251 | expressions=[ 252 | ( 253 | '{{table}} - {{error}}', 254 | 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 255 | ), 256 | ( 257 | '{{table}} - Throttled', 258 | 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 259 | ), 260 | ], 261 | yAxes=common.OPS_AXIS, 
262 | ), 263 | ], 264 | ), 265 | G.Row( 266 | title="Memcache", 267 | panels=[ 268 | common.PromGraph( 269 | title="Ingester hit rate", 270 | expressions=[ 271 | ( 272 | '{{name}}', 273 | 'sum(rate(cortex_cache_hits{job="cortex/ingester"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/ingester"}[2m])) by (name)' 274 | ), 275 | ], 276 | yAxes=common.PercentageAxes(), 277 | ), 278 | common.PromGraph( 279 | title="Memcache QPS", 280 | expressions=[ 281 | ( 282 | '{{method}} {{status_code}}', 283 | 'sum(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ingester"}[1m])) by (method,status_code)' 284 | ), 285 | ], 286 | ), 287 | common.PromGraph( 288 | title="Memcache latency", 289 | expressions=[ 290 | ( 291 | '{{method}} 99th centile', 292 | 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ingester"}[2m])) by (le,method)) * 1e3' 293 | ), 294 | ( 295 | '{{method}} 50th centile', 296 | 'histogram_quantile(0.5, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ingester"}[2m])) by (le,method)) * 1e3' 297 | ), 298 | ( 299 | '{{method}} Mean', 300 | 'sum by (method)(rate(cortex_memcache_request_duration_seconds_sum{job="cortex/ingester"}[2m])) * 1e3 / sum by (method)(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ingester"}[2m]))' 301 | ), 302 | ], 303 | yAxes=common.LATENCY_AXES, 304 | ), 305 | ], 306 | ), 307 | ], 308 | ) 309 | --------------------------------------------------------------------------------