├── LICENSE ├── README.md ├── common.py ├── cortex-alertmanager.dashboard.py ├── cortex-blocks.dashboard.py ├── cortex-chunks.dashboard.py ├── cortex-ruler.dashboard.py ├── cortex-services-read.dashboard.py └── cortex-services-write.dashboard.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dashboards for managing Cortex 2 | 3 | These are the dashboards used internally at Weaveworks for our 4 | [Cortex](https://github.com/cortexproject/cortex) instances. 5 | 6 | They are written in Python using 7 | [Grafanalib](https://github.com/weaveworks/grafanalib). 8 | 9 | Some aspects are specific to the Weave Cloud infrastructure, e.g. the 10 | use of DynamoDB for storage. 
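
To render one of the `*.dashboard.py` files into Grafana JSON, grafanalib ships a
`generate-dashboard` command (for example `generate-dashboard -o cortex-chunks.json cortex-chunks.dashboard.py`).
The sketch below does the same thing programmatically; it assumes a current grafanalib
release where the JSON encoder lives at `grafanalib._gen.DashboardEncoder`, and the
input/output file names are examples only.

```python
# Minimal sketch: render one dashboard definition to Grafana JSON.
# Run it from the repository root so the dashboard file can `import common`.
import json
import runpy

from grafanalib._gen import DashboardEncoder  # encoder used by grafanalib's own CLI

# Execute the dashboard file and pick up its module-level `dashboard` object,
# which is also what the `generate-dashboard` command looks for.
namespace = runpy.run_path("cortex-chunks.dashboard.py")
dashboard = namespace["dashboard"]

with open("cortex-chunks.json", "w") as fp:
    json.dump(dashboard.to_json_data(), fp, sort_keys=True, indent=2, cls=DashboardEncoder)
```

The resulting JSON can then be imported into Grafana or pushed via its HTTP API.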
11 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | """Common configuration across dashboards. 2 | 3 | Copyright 2019 Weaveworks Inc 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | """ 17 | 18 | import grafanalib.core as G 19 | import grafanalib.weave as W 20 | from grafanalib import prometheus 21 | """A single Y axis of milliseconds. Use for latency graphs.""" 22 | LATENCY_AXES = G.single_y_axis(format=G.MILLISECONDS_FORMAT) 23 | """A single Y axis counting operations. Use for requests per second, etc.""" 24 | OPS_AXIS = G.single_y_axis(format=G.OPS_FORMAT) 25 | """The name of the data source for our Prometheus service.""" 26 | PROMETHEUS = "$datasource" 27 | 28 | QPS_SERIES_OVERRIDES = [ 29 | { 30 | "alias": "/^1../", 31 | "color": W.YELLOW 32 | }, 33 | { 34 | "alias": "/^2../", 35 | "color": W.GREEN 36 | }, 37 | { 38 | "alias": "/^3../", 39 | "color": W.BLUE 40 | }, 41 | { 42 | "alias": "/^4../", 43 | "color": W.ORANGE 44 | }, 45 | { 46 | "alias": "/^5../", 47 | "color": W.RED 48 | }, 49 | { 50 | "alias": "success", 51 | "color": W.GREEN 52 | }, 53 | { 54 | "alias": "error", 55 | "color": W.RED 56 | }, 57 | ] 58 | 59 | 60 | def PromGraph(*args, **kwargs): 61 | """A graph of data from our Prometheus.""" 62 | 63 | kwargs_with_defaults = dict( 64 | tooltip=G.Tooltip(sort=G.SORT_DESC), 65 | nullPointMode=G.NULL_AS_NULL, 66 | ) 67 | kwargs_with_defaults.update(kwargs) 68 | 69 | return prometheus.PromGraph(data_source=PROMETHEUS, *args, **kwargs_with_defaults) 70 | 71 | 72 | def Dashboard(**kwargs): 73 | """Standard Weave Cloud dashboard. 74 | 75 | Automatically sets panel ids and applies events from Weave Cloud as annotations. 
76 | """ 77 | 78 | defaultTemplates = [G.Template( 79 | label="Datasource", 80 | name="datasource", 81 | type="datasource", 82 | query="prometheus", 83 | )] 84 | 85 | if "templating" in kwargs: 86 | extraTemplates = kwargs["templating"].list 87 | else: 88 | extraTemplates = [] 89 | 90 | kwargs["templating"] = G.Templating(list=defaultTemplates + extraTemplates) 91 | 92 | return G.Dashboard( 93 | refresh='1m', # Override the default of 10s 94 | **kwargs 95 | ).auto_panel_ids() 96 | 97 | 98 | def PercentageAxes(label=None, max=1): 99 | """Y axes that show a percentage based on a unit value.""" 100 | return G.single_y_axis( 101 | format=G.PERCENT_UNIT_FORMAT, 102 | label=label, 103 | logBase=1, 104 | max=max, 105 | min=0, 106 | ) 107 | 108 | 109 | def QPSGraph(namespace, graphName, job, metric_root="request", extra_conditions=""): 110 | expr_template = 'rate({ns}_{mroot}_duration_seconds_count{{job="{job}"{extra}}}[1m])' 111 | return StatusQPSGraph( 112 | data_source=PROMETHEUS, 113 | title='{name} QPS'.format(name=graphName), 114 | expression=expr_template.format(ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 115 | ) 116 | 117 | 118 | def StatusQPSGraph(data_source, title, expression, **kwargs): 119 | """Create a graph of QPS, coloured by status code. 120 | 121 | :param title: Title of the graph. 122 | :param expression: Format and PromQL expression; must sum by label 123 | which is http code like 404 or "success" and "error" 124 | :param kwargs: Passed on to Graph. 125 | """ 126 | return W.stacked( 127 | prometheus.PromGraph( 128 | data_source=data_source, 129 | title=title, 130 | expressions=[('{{status_code}}', 'sum by (status_code)(%s)' % (expression))], 131 | seriesOverrides=QPS_SERIES_OVERRIDES, 132 | legend=G.Legend(hideZero=True), 133 | yAxes=[ 134 | G.YAxis(format=G.OPS_FORMAT), 135 | G.YAxis(format=G.SHORT_FORMAT), 136 | ], 137 | **kwargs 138 | ) 139 | ) 140 | 141 | 142 | def LatencyGraph(namespace, graphName, job, rule_root="job:", metric_root="request", extra_conditions=""): 143 | return PromGraph( 144 | title='{name} Latency'.format(name=graphName), 145 | expressions=[ 146 | ( 147 | '99th centile', '{rroot}{ns}_{mroot}_duration_seconds:99quantile{{job="{job}"{extra}}} * 1e3' 148 | .format(rroot=rule_root, ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 149 | ), 150 | ( 151 | '50th centile', '{rroot}{ns}_{mroot}_duration_seconds:50quantile{{job="{job}"{extra}}} * 1e3' 152 | .format(rroot=rule_root, ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 153 | ), 154 | ( 155 | 'Mean', 156 | 'sum(rate({ns}_{mroot}_duration_seconds_sum{{ws="false",job="{job}"{extra}}}[5m])) * 1e3 / sum(rate({ns}_{mroot}_duration_seconds_count{{ws="false",job="{job}"{extra}}}[5m]))' 157 | .format(ns=namespace, mroot=metric_root, job=job, extra=extra_conditions) 158 | ), 159 | ], 160 | yAxes=LATENCY_AXES, 161 | ) 162 | 163 | 164 | def REDRow(namespace, graphName, job, rule_root="job:", metric_root="request", extra_conditions="", collapse=False): 165 | return G.Row( 166 | title='%s QPS & Latency' % (graphName, ), 167 | collapse=collapse, 168 | panels=[ 169 | QPSGraph(namespace, graphName, job, metric_root, extra_conditions), 170 | LatencyGraph(namespace, graphName, job, rule_root, metric_root, extra_conditions), 171 | ] 172 | ) 173 | 174 | -------------------------------------------------------------------------------- /cortex-alertmanager.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; 
python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='am', 11 | title="Cortex > Alertmanager", 12 | rows=[ 13 | G.Row( 14 | title='Operations', 15 | panels=[ 16 | common.PromGraph( 17 | title="Alerts", 18 | expressions=[ 19 | ( 20 | "{{instance}} {{status}}", 21 | 'sum by (instance, status)(rate(alertmanager_alerts_received_total{job="cortex/alertmanager"}[2m]))' 22 | ), 23 | ( 24 | "{{instance}} invalid", 25 | 'sum by (instance, status)(rate(alertmanager_alerts_invalid_total{job="cortex/alertmanager"}[2m]))' 26 | ), 27 | ], 28 | yAxes=common.OPS_AXIS, 29 | ), 30 | common.PromGraph( 31 | title="Notifications", 32 | expressions=[ 33 | ( 34 | "{{integration}}", 35 | 'sum by (integration)(rate(alertmanager_notifications_total{job="cortex/alertmanager"}[2m]))' 36 | ), 37 | ], 38 | yAxes=common.OPS_AXIS, 39 | ), 40 | ] 41 | ), 42 | G.Row( 43 | title='Alertmanager fetching configs', 44 | collapse=True, 45 | panels=[ 46 | common.QPSGraph('cortex_configs', 'Configs', 'cortex/alertmanager'), 47 | common.PromGraph( 48 | title="Configs Latency", 49 | expressions=[ 50 | ( 51 | "99th centile", 52 | 'histogram_quantile(0.99, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/alertmanager"}[2m])) by (le)) * 1e3' 53 | ), 54 | ( 55 | "50th centile", 56 | 'histogram_quantile(0.50, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/alertmanager"}[2m])) by (le)) * 1e3' 57 | ), 58 | ( 59 | "Mean", 60 | 'sum(rate(cortex_configs_request_duration_seconds_sum{job="cortex/alertmanager"}[2m])) / sum(rate(cortex_configs_request_duration_seconds_count{job="cortex/alertmanager"}[2m])) * 1e3' 61 | ), 62 | ], 63 | yAxes=common.LATENCY_AXES, 64 | ), 65 | ] 66 | ), 67 | common.REDRow('cortex', 'Alertmanager', 'cortex/alertmanager'), 68 | G.Row( 69 | [ 70 | common.PromGraph( 71 | title="Known Configurations", 72 | expressions=[ 73 | ("{{instance}}", 'cortex_alertmanager_configs_total{job="cortex/alertmanager"}'), 74 | ], 75 | ), 76 | common.PromGraph( 77 | title="Cluster Members", 78 | expressions=[ 79 | ("{{instance}}", 'sum(alertmanager_cluster_members{job="cortex/alertmanager"}) by (instance)'), 80 | ], 81 | ), 82 | ] 83 | ), 84 | ], 85 | ) 86 | -------------------------------------------------------------------------------- /cortex-blocks.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | import grafanalib.weave as W 5 | 6 | import common 7 | 8 | dashboard = common.Dashboard( 9 | uid='cortex-blocks', 10 | title="Cortex > Blocks", 11 | rows=[ 12 | G.Row( 13 | title="Data", 14 | panels=[ 15 | common.PromGraph( 16 | title="Number of series in memory, in ingesters", 17 | expressions=[ 18 | ('', 'sum(cortex_ingester_memory_series{job="cortex/ingester"})'), 19 | ], 20 | ), 21 | common.PromGraph( 22 | title="Head chunks", 23 | expressions=[ 24 | ('{{instance}}', 'cortex_ingester_tsdb_head_chunks'), 25 | ], 26 | ), 27 | ] 28 | ), 29 | G.Row( 30 | title="Resources", 31 | panels=[ 32 | common.PromGraph( 33 | title="Memory Usage", 34 | expressions=[ 35 | ( 36 | '{{pod}}', 37 | 'sum by(pod)(container_memory_usage_bytes{namespace="cortex",container!="POD",container!=""})' 38 | ), 39 | ], 40 | yAxes=[ 41 | G.YAxis(format=G.BYTES_FORMAT), 42 | G.YAxis(format=G.SHORT_FORMAT), 43 | ], 44 | ), 45 | common.PromGraph( 46 | 
title="Disk space usage", 47 | expressions=[ 48 | ( 49 | '{{persistentvolumeclaim}}', 50 | 'kubelet_volume_stats_used_bytes{namespace="cortex"} / kubelet_volume_stats_capacity_bytes{namespace="cortex"}' 51 | ), 52 | ], 53 | yAxes=common.PercentageAxes(), 54 | ), 55 | ], 56 | ), 57 | G.Row( 58 | title="Last runs", 59 | panels=[ 60 | G.SingleStat( 61 | dataSource=common.PROMETHEUS, 62 | title="Last Successful Compactor Run", 63 | targets=[ 64 | G.Target( 65 | '(time()-cortex_compactor_last_successful_run_timestamp_seconds) / 60', 66 | refId='A', 67 | ), 68 | ], 69 | format='m', # TODO: Add 'MINUTES_FORMAT' to grafanalib 70 | ), 71 | G.SingleStat( 72 | dataSource=common.PROMETHEUS, 73 | title="Last Successful Bucket Index Update", 74 | targets=[ 75 | G.Target( 76 | '(time()-max(cortex_bucket_index_last_successful_update_timestamp_seconds)) / 60', 77 | refId='A', 78 | ), 79 | ], 80 | format='m', # TODO: Add 'MINUTES_FORMAT' to grafanalib 81 | ), 82 | ], 83 | ), 84 | G.Row( 85 | title="Block Operations", 86 | panels=[ 87 | common.PromGraph( 88 | title="Rates", 89 | expressions=[ 90 | ('{{component}} loads', 'sum by(component)(rate(cortex_bucket_store_block_loads_total{}[1m]))'), 91 | ( 92 | '{{component}} errors', 93 | 'sum by(component)(rate(cortex_bucket_store_block_load_failures_total{}[1m])) > 0' 94 | ), 95 | ('Uploads', 'sum(rate(cortex_ingester_shipper_uploads_total[5m]))'), 96 | ('Upload errors', 'sum(rate(cortex_ingester_shipper_upload_failures_total[5m]))'), 97 | ('Dir syncs', 'sum(rate(cortex_ingester_shipper_dir_syncs_total[5m]))'), 98 | ('Dir sync errors', 'sum(rate(cortex_ingester_shipper_dir_sync_failures_total[5m]))'), 99 | ], 100 | yAxes=[ 101 | G.YAxis(format=G.OPS_FORMAT), 102 | G.YAxis(format=G.SHORT_FORMAT), 103 | ], 104 | ), 105 | common.PromGraph( 106 | title="Latency", 107 | expressions=[ 108 | ( 109 | '99th centile', 110 | 'histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{operation="upload"}[5m])) by (le))' 111 | ), 112 | ( 113 | '50th centile', 114 | 'histogram_quantile(0.5, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{operation="upload"}[5m])) by (le))' 115 | ), 116 | ( 117 | 'Mean', 118 | 'sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{operation="upload"}[5m])) / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{operation="upload"}[5m]))' 119 | ), 120 | ], 121 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 122 | ), 123 | ], 124 | ), 125 | G.Row( 126 | title="Compactions", 127 | panels=[ 128 | common.PromGraph( 129 | title="Operations", 130 | expressions=[ 131 | ('Compactions', 'sum(rate(cortex_ingester_tsdb_compactions_total[5m]))'), 132 | ('errors', 'sum(rate(cortex_ingester_tsdb_compactions_failed_total[5m]))'), 133 | ], 134 | ), 135 | common.PromGraph( 136 | title="Latency", 137 | expressions=[ 138 | ( 139 | '99th centile', 140 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{}[5m])) by (le))' 141 | ), 142 | ( 143 | '50th centile', 144 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{}[5m])) by (le))' 145 | ), 146 | ( 147 | 'Mean', 148 | 'sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_sum{}[5m])) / sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_count{}[5m]))' 149 | ), 150 | ], 151 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 152 | ), 153 | ], 154 | ), 155 | G.Row( 156 | title="WAL", 157 | panels=[ 158 | common.PromGraph( 159 | title="Operations", 160 | 
expressions=[ 161 | ('Truncations', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total[5m]))'), 162 | ('Truncation errors', 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]))'), 163 | ('Checkpoint', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total[5m]))'), 164 | ('Checkpoint errors', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]))'), 165 | ('WAL corruptions', 'sum(rate(cortex_ingester_wal_corruptions_total[5m]))'), 166 | ('mmap corruptions', 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total[5m]))'), 167 | ], 168 | ), 169 | common.PromGraph( 170 | title="Latency", 171 | expressions=[ 172 | ( 173 | '99th centile', 174 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_bucket{}[5m])) by (le))' 175 | ), 176 | ( 177 | '50th centile', 178 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_bucket{}[5m])) by (le))' 179 | ), 180 | ( 181 | 'Mean', 182 | 'sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{}[5m])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{}[5m]))' 183 | ), 184 | ], 185 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 186 | ), 187 | ], 188 | ), 189 | G.Row( 190 | title="Bucket Operations", 191 | panels=[ 192 | common.PromGraph( 193 | title="Operations", 194 | expressions=[ 195 | ( 196 | '{{component}}-{{operation}}', 197 | 'sum by(component,operation) (rate(thanos_objstore_bucket_operations_total[5m]))' 198 | ), 199 | ( 200 | 'errors {{component}}-{{operation}}', 201 | 'sum by(component,operation) (rate(thanos_objstore_bucket_operation_failures_total[5m]))' 202 | ), 203 | ], 204 | ), 205 | common.PromGraph( 206 | title="99% Latency", 207 | expressions=[ 208 | ( 209 | '{{component}}-{{operation}}', 210 | 'histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket[5m])) by (le, component, operation)) > 0' 211 | ), 212 | ], 213 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 214 | ), 215 | ], 216 | ), 217 | G.Row( 218 | title="Ring", 219 | collapse=True, 220 | panels=[ 221 | W.stacked( 222 | common.PromGraph( 223 | title="Ingester Ring Ownership", 224 | expressions=[ 225 | ( 226 | '{{ingester}}', 227 | 'max(cortex_ring_ingester_ownership_percent{job="cortex/distributor"}) by (ingester) or label_replace(max(cortex_ring_member_ownership_percent{job="cortex/distributor"}) by (member), "ingester", "$1", "member", "(.*)")' 228 | ), 229 | ], 230 | # Show y-axis slightly above 100% in case series overlap 231 | yAxes=common.PercentageAxes(max=1.2), 232 | ) 233 | ), 234 | W.stacked( 235 | common.PromGraph( 236 | title="Ingesters In Ring", 237 | expressions=[ 238 | ( 239 | '{{state}}', 240 | 'max(cortex_ring_ingesters{job="cortex/distributor"}) by (state) or max(cortex_ring_members{job="cortex/distributor"}) by (state)' 241 | ), 242 | ], 243 | yAxes=[ 244 | G.YAxis(format=G.NO_FORMAT), 245 | G.YAxis(format=G.SHORT_FORMAT), 246 | ], 247 | ) 248 | ), 249 | ] 250 | ), 251 | ], 252 | ) 253 | -------------------------------------------------------------------------------- /cortex-chunks.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent: 2; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | import grafanalib.weave as W 5 | 6 | import sys, os 7 | sys.path.append(os.path.dirname(__file__)) 8 | import common 9 | 10 | dashboard = common.Dashboard( 11 | uid='chunks', 12 | title="Cortex > Chunks", 13 | rows=[ 
14 | G.Row( 15 | panels=[ 16 | common.PromGraph( 17 | title="Number of chunks (in memory, in ingesters)", 18 | expressions=[ 19 | ('', 'sum(cortex_ingester_memory_chunks{job="cortex/ingester"})'), 20 | ], 21 | ), 22 | common.PromGraph( 23 | title="Chunks per series", 24 | expressions=[ 25 | ( 26 | '', 27 | 'sum(cortex_ingester_memory_chunks{job="cortex/ingester"}) / sum(cortex_ingester_memory_series{job="cortex/ingester"})' 28 | ), 29 | ], 30 | ), 31 | ] 32 | ), 33 | G.Row( 34 | panels=[ 35 | common.PromGraph( 36 | title="Chunk Size Bytes (on flush)", 37 | expressions=[ 38 | ( 39 | "99th Percentile", 40 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' 41 | ), 42 | ( 43 | "50th Percentile", 44 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' 45 | ), 46 | ( 47 | "10th Percentile", 48 | 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' 49 | ), 50 | ( 51 | "Mean", 52 | 'sum(rate(cortex_ingester_chunk_size_bytes_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_size_bytes_count{job="cortex/ingester"}[2m]))' 53 | ), 54 | ], 55 | yAxes=[ 56 | G.YAxis(format=G.BYTES_FORMAT), 57 | G.YAxis(format=G.SHORT_FORMAT), 58 | ], 59 | ), 60 | common.PromGraph( 61 | title="Chunk Age (on flush)", 62 | expressions=[ 63 | ( 64 | "99th Percentile", 65 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' 66 | ), 67 | ( 68 | "50th Percentile", 69 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' 70 | ), 71 | ( 72 | "10th Percentile", 73 | 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' 74 | ), 75 | ( 76 | "Mean", 77 | 'sum(rate(cortex_ingester_chunk_age_seconds_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_age_seconds_count{job="cortex/ingester"}[2m]))' 78 | ), 79 | ], 80 | yAxes=[ 81 | G.YAxis(format=G.DURATION_FORMAT), 82 | G.YAxis(format=G.SHORT_FORMAT), 83 | ], 84 | ), 85 | common.PromGraph( 86 | title="Chunk Length (on flush)", 87 | expressions=[ 88 | ( 89 | "99th Percentile", 90 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' 91 | ), 92 | ( 93 | "50th Percentile", 94 | 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' 95 | ), 96 | ( 97 | "10th Percentile", 98 | 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' 99 | ), 100 | ( 101 | "Mean", 102 | 'sum(rate(cortex_ingester_chunk_length_sum{job=\"cortex/ingester\"}[2m])) / sum(rate(cortex_ingester_chunk_length_count{job=\"cortex/ingester\"}[2m]))' 103 | ), 104 | ], 105 | ), 106 | ] 107 | ), 108 | G.Row( 109 | panels=[ 110 | W.stacked( 111 | common.PromGraph( 112 | title="Series Flush Queue Length", 113 | expressions=[ 114 | ("{{instance}}", 'cortex_ingester_flush_queue_length{job="cortex/ingester"}'), 115 | ], 116 | ) 117 | ), 118 | W.stacked( 119 | common.PromGraph( 120 | title="Chunk Flush Rate (rate[1m])", 121 | expressions=[ 122 | # This is the rate at which chunks are added to the flush queue 123 | ( 124 | "{{reason}}", 125 | 'sum by (reason)(rate(cortex_ingester_flush_reasons[1m]) or rate(cortex_ingester_series_flushed_total[1m]) or 
rate(cortex_ingester_flushing_enqueued_series_total[1m]))' 126 | ), 127 | # This is the rate at which chunks are removed from the flush queue 128 | ( 129 | "Flushed", 130 | 'sum(rate(cortex_ingester_chunks_stored_total[1m]) or rate(cortex_chunk_store_stored_chunks_total[1m]))' 131 | ), 132 | # Chunks dropped for being too small 133 | ("Dropped", 'sum(rate(cortex_ingester_dropped_chunks_total[1m]))'), 134 | ], 135 | # Show flush and dropped rates as a line overlayed on enqueue rates, not stacked and not filled 136 | seriesOverrides=[ 137 | { 138 | "alias": "Flushed", 139 | "fill": 1, 140 | "linewidth": 1, 141 | "stack": False 142 | }, { 143 | "alias": "Dropped", 144 | "fill": 1, 145 | "linewidth": 1, 146 | "stack": False 147 | } 148 | ], 149 | ) 150 | ), 151 | ] 152 | ), 153 | G.Row( 154 | title="DynamoDB", 155 | collapse=True, 156 | panels=[ 157 | common.PromGraph( 158 | title="DynamoDB write capacity consumed [rate1m]", 159 | expressions=[ 160 | ( 161 | '{{table}} consumed', 162 | 'sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0' 163 | ), 164 | ( 165 | '{{table}} provisioned', 166 | 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 167 | ), 168 | ( 169 | '{{table}} provisioned', 170 | 'max(cortex_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 171 | ), 172 | ], 173 | yAxes=common.OPS_AXIS, 174 | ), 175 | common.PromGraph( 176 | title="DynamoDB write errors", 177 | expressions=[ 178 | ( 179 | '{{table}} - {{error}}', 180 | 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 181 | ), 182 | ( 183 | '{{table}} - Throttled', 184 | 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 185 | ), 186 | ], 187 | yAxes=common.OPS_AXIS, 188 | ), 189 | ], 190 | ), 191 | G.Row( 192 | title="Ring", 193 | collapse=True, 194 | panels=[ 195 | W.stacked( 196 | common.PromGraph( 197 | title="Ingester Ring Ownership", 198 | expressions=[ 199 | ( 200 | '{{ingester}}', 201 | 'max(cortex_ring_ingester_ownership_percent{job="cortex/distributor"}) by (ingester) or label_replace(max(cortex_ring_member_ownership_percent{job="cortex/distributor"}) by (member), "ingester", "$1", "member", "(.*)")' 202 | ), 203 | ], 204 | # Show y-axis slightly above 100% in case series overlap 205 | yAxes=common.PercentageAxes(max=1.2), 206 | ) 207 | ), 208 | W.stacked( 209 | common.PromGraph( 210 | title="Ingesters In Ring", 211 | expressions=[ 212 | ( 213 | '{{state}}', 214 | 'max(cortex_ring_ingesters{job="cortex/distributor"}) by (state) or max(cortex_ring_members{job="cortex/distributor"}) by (state)' 215 | ), 216 | ], 217 | yAxes=[ 218 | G.YAxis(format=G.NO_FORMAT), 219 | G.YAxis(format=G.SHORT_FORMAT), 220 | ], 221 | ) 222 | ), 223 | ] 224 | ), 225 | G.Row( 226 | title="Index and Cache", 227 | panels=[ 228 | common.PromGraph( 229 | title="Index entries per chunk", 230 | expressions=[ 231 | ( 232 | '', 233 | 'sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{job="cortex/ingester"}[2m]))' 234 | ), 235 | ], 236 | ), 237 | common.PromGraph( 238 | title="Ingester hit rate", 239 | expressions=[ 240 | ( 241 | '{{name}}', 242 | 'sum(rate(cortex_cache_hits{job="cortex/ingester"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/ingester"}[2m])) by (name)' 243 | ), 244 | ], 
245 | yAxes=common.PercentageAxes(), 246 | ), 247 | ] 248 | ), 249 | ] 250 | ) 251 | -------------------------------------------------------------------------------- /cortex-ruler.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='ruler', 11 | title="Cortex > Recording Rules", 12 | rows=[ 13 | G.Row( 14 | title="Configs", 15 | collapse=True, 16 | panels=[ 17 | common.PromGraph( 18 | title="Known Configurations", 19 | expressions=[ 20 | ("Configurations", 'max(cortex_configs{job="cortex/ruler"})'), 21 | ("{{status}}", 'max by(status)(cortex_alertmanager_configs{job="cortex/alertmanager"})'), 22 | ], 23 | ), 24 | common.QPSGraph('cortex_configs', 'Configs', 'cortex/ruler'), 25 | common.PromGraph( 26 | title="Configs Latency", 27 | expressions=[ 28 | ( 29 | "99th centile", 30 | 'histogram_quantile(0.99, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 31 | ), 32 | ( 33 | "50th centile", 34 | 'histogram_quantile(0.50, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 35 | ), 36 | ( 37 | "Mean", 38 | 'sum(rate(cortex_configs_request_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_configs_request_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' 39 | ), 40 | ], 41 | yAxes=common.LATENCY_AXES, 42 | ), 43 | ] 44 | ), 45 | common.REDRow('cortex', 'Ruler service', 'cortex/ruler', collapse=True), 46 | G.Row( 47 | [ 48 | common.PromGraph( 49 | title="Group Evaluations per Second", 50 | expressions=[ 51 | ( 52 | "Groups per second", 53 | 'sum(rate(cortex_group_evaluation_duration_seconds_count{job="cortex/ruler"}[1m]))' 54 | ), ( 55 | "Groups per second", 56 | 'sum(rate(cortex_prometheus_rule_group_duration_seconds_count{job="cortex/ruler"}[1m]))' 57 | ) 58 | ], 59 | yAxes=common.OPS_AXIS, 60 | ), 61 | common.PromGraph( 62 | title="Group Evaluation Durations", 63 | expressions=[ 64 | ( 65 | "99th centile", 66 | 'histogram_quantile(0.99, sum(rate(cortex_group_evaluation_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 67 | ), 68 | ( 69 | "50th centile", 70 | 'histogram_quantile(0.50, sum(rate(cortex_group_evaluation_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 71 | ), 72 | ( 73 | "Mean", 74 | 'sum(rate(cortex_group_evaluation_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_group_evaluation_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' 75 | ), 76 | ("Mean", 'avg(cortex_prometheus_rule_group_last_duration_seconds)*1e3'), 77 | ( 78 | "{{rule_group}}", 79 | 'max by (rule_group)(cortex_prometheus_rule_group_last_duration_seconds)*1e3 > 500' 80 | ), 81 | ], 82 | yAxes=common.LATENCY_AXES, 83 | ), 84 | common.PromGraph( 85 | title="Group Evaluation Latency", 86 | expressions=[ 87 | ( 88 | "99th centile", 89 | 'histogram_quantile(0.99, sum(rate(cortex_group_evaluation_latency_seconds_bucket[2m])) by (le)) * 1e3' 90 | ), 91 | ( 92 | "50th centile", 93 | 'histogram_quantile(0.50, sum(rate(cortex_group_evaluation_latency_seconds_bucket[2m])) by (le)) * 1e3' 94 | ), 95 | ( 96 | "Mean", 97 | 'sum(rate(cortex_group_evaluation_latency_seconds_sum[2m])) / sum(rate(cortex_group_evaluation_latency_seconds_count[2m])) * 1e3' 98 | ), 99 | ("Mean", 
'avg(time()-(cortex_prometheus_rule_group_last_evaluation_timestamp_seconds>0))*1000'), 100 | ("Max", 'max(time()-(cortex_prometheus_rule_group_last_evaluation_timestamp_seconds>0))*1e3'), 101 | ], 102 | yAxes=common.LATENCY_AXES, 103 | ), 104 | ] 105 | ), 106 | G.Row( 107 | title="Ingester Queries", 108 | panels=[ 109 | common.QPSGraph('cortex_distributor', 'Ingester Query', 'cortex/ruler', metric_root="query"), 110 | common.PromGraph( 111 | title="Ingester Query Latency", 112 | expressions=[ 113 | ( 114 | "99th centile", 115 | 'histogram_quantile(0.99, sum(rate(cortex_distributor_query_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 116 | ), 117 | ( 118 | "50th centile", 119 | 'histogram_quantile(0.50, sum(rate(cortex_distributor_query_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' 120 | ), 121 | ( 122 | "Mean", 123 | 'sum(rate(cortex_distributor_query_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_distributor_query_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' 124 | ), 125 | ], 126 | yAxes=common.LATENCY_AXES, 127 | ), 128 | ] 129 | ), 130 | G.Row( 131 | title="Ingester Push", 132 | panels=[ 133 | common.StatusQPSGraph( 134 | common.PROMETHEUS, "Ingester Push", 135 | 'rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/ruler",operation="/cortex.Ingester/Push"}[1m])' 136 | ), 137 | common.PromGraph( 138 | title="Ingester Push Latency", 139 | expressions=[ 140 | ( 141 | "99.7th centile", 142 | 'histogram_quantile(0.997, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 143 | ), 144 | ( 145 | "50th centile", 146 | 'histogram_quantile(0.50, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 147 | ), 148 | ( 149 | "Mean", 150 | 'sum(rate(cortex_ingester_client_request_duration_seconds_sum{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) / sum(rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) * 1e3' 151 | ), 152 | ], 153 | yAxes=common.LATENCY_AXES, 154 | ), 155 | ] 156 | ), 157 | G.Row( 158 | [ 159 | common.PromGraph( 160 | title="Rules per Second", 161 | expressions=[ 162 | ("Rules", 'sum(rate(cortex_rules_processed_total{job="cortex/ruler"}[1m]))'), 163 | ("Rules/sec", 'sum(rate(cortex_prometheus_rule_evaluations_total{job="cortex/ruler"}[1m]))'), 164 | ], 165 | yAxes=common.OPS_AXIS, 166 | ), 167 | common.PromGraph( 168 | title="Ruler DynamoDB errors", 169 | expressions=[ 170 | ( 171 | '{{table}} - {{error}}', 172 | 'sum(rate(cortex_dynamo_failures_total{job="cortex/ruler"}[1m])) by (error, table) > 0' 173 | ), 174 | ], 175 | yAxes=common.OPS_AXIS, 176 | ), 177 | ] 178 | ), 179 | G.Row( 180 | title="Memcache", 181 | panels=[ 182 | common.StatusQPSGraph( 183 | common.PROMETHEUS, "Memcache read QPS", 184 | 'sum by (job,status_code)(rate(cortex_memcache_request_duration_seconds_count{method="Memcache.GetMulti", job="cortex/ruler"}[1m]))' 185 | ), 186 | common.PromGraph( 187 | title="Memcache read latency", 188 | expressions=[ 189 | ( 190 | '99th centile', 191 | 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) by (le)) * 1e3' 192 | ), 193 | ( 194 | '50th centile', 195 | 'histogram_quantile(0.5, 
sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) by (le)) * 1e3' 196 | ), 197 | ( 198 | 'Mean', 199 | 'sum(rate(cortex_memcache_request_duration_seconds_sum{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) * 1e3 / sum(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ruler",method="Memcache.GetMulti"}[2m]))' 200 | ), 201 | ], 202 | yAxes=common.LATENCY_AXES, 203 | ), 204 | ], 205 | ), 206 | ], 207 | ) 208 | -------------------------------------------------------------------------------- /cortex-services-read.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='reads', 11 | title="Cortex > Services (Reads)", 12 | rows=[ 13 | common.REDRow( 14 | 'cortex', 15 | 'Query Frontend read', 16 | 'cortex/query-frontend', 17 | rule_root="job_route:", 18 | extra_conditions=",route=\"api_prom_api_v1_query_range\"" 19 | ), 20 | common.REDRow('cortex', 'Querier read', 'cortex/querier'), 21 | G.Row( 22 | title="Ingester", 23 | panels=[ 24 | common.PromGraph( 25 | title="Ingester read QPS", 26 | expressions=[ 27 | ( 28 | '{{route}}: {{status_code}}', 29 | 'sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route!="/cortex.Ingester/Push"}[1m])) by (route, status_code)' 30 | ), 31 | ], 32 | yAxes=common.OPS_AXIS, 33 | ), 34 | common.PromGraph( 35 | title="Ingester read latency", 36 | expressions=[ 37 | ( 38 | '{{route}}: 99th centile', 39 | 'job_route:cortex_request_duration_seconds:99quantile{job="cortex/ingester", route!="/cortex.Ingester/Push"} * 1e3' 40 | ), 41 | ( 42 | '{{route}}: 50th centile', 43 | 'job_route:cortex_request_duration_seconds:50quantile{job="cortex/ingester", route!="/cortex.Ingester/Push"} * 1e3' 44 | ), 45 | ( 46 | '{{route}}: Mean', 47 | 'sum(rate(cortex_request_duration_seconds_sum{job="cortex/ingester", route!="/cortex.Ingester/Push"}[2m])) by (route) * 1e3 / sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route!="/cortex.Ingester/Push"}[2m])) by (route)' 48 | ), 49 | ], 50 | yAxes=common.LATENCY_AXES, 51 | ), 52 | ], 53 | ), 54 | G.Row( 55 | title="DynamoDB", 56 | panels=[ 57 | common.PromGraph( 58 | title="DynamoDB read QPS", 59 | expressions=[ 60 | ( 61 | 'QueryPages {{job}}: {{status_code}}', 62 | 'sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[1m])) by (job, status_code)' 63 | ), 64 | ], 65 | ), 66 | common.PromGraph( 67 | title="DynamoDB read latency", 68 | expressions=[ 69 | ( 70 | 'QueryPages: 99th', 71 | 'histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) by (le)) * 1e3' 72 | ), ( 73 | 'QueryPages: 50th', 74 | 'histogram_quantile(0.5, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) by (le)) * 1e3' 75 | ), ( 76 | 'QueryPages: Mean', 77 | 'sum(rate(cortex_dynamo_request_duration_seconds_sum{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) * 1e3 / sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m]))' 78 | ) 79 | ], 80 | yAxes=common.LATENCY_AXES, 81 | ), 82 | ], 83 | ), 84 | G.Row( 85 | title="DynamoDB", 86 | panels=[ 87 | 
common.PromGraph( 88 | title="DynamoDB read capacity consumed [rate1m]", 89 | expressions=[ 90 | ( 91 | '{{table}} consumed', 92 | 'sum(rate(cortex_dynamo_consumed_capacity_total{job=~"cortex/.*",operation!~".*Write.*"}[1m])) by (table) > 0' 93 | ), 94 | ( 95 | '{{table}} provisioned', 96 | 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="read"}) by (table) > 0' 97 | ), 98 | ( 99 | '{{table}} provisioned', 100 | 'max(cortex_table_capacity_units{job="cortex/table-manager", op="read"}) by (table) > 0' 101 | ), 102 | ], 103 | yAxes=common.OPS_AXIS, 104 | ), 105 | common.PromGraph( 106 | title="DynamoDB read errors", 107 | expressions=[ 108 | ( 109 | '{{job}} - {{table}} - {{error}}', 110 | 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation!~".*Write.*"}[1m])) by (job, error, table) > 0' 111 | ), 112 | ( 113 | '{{job}} - {{table}} - Throttled', 114 | 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation!~".*Write.*"}[1m])) by (job, error, table) > 0' 115 | ), 116 | ], 117 | yAxes=common.OPS_AXIS, 118 | ), 119 | ], 120 | ), 121 | G.Row( 122 | title="Memcache (blocks)", 123 | panels=[ 124 | common.PromGraph( 125 | title="Memcache read QPS (blocks)", 126 | expressions=[ 127 | ( 128 | '{{name}} {{operation}}', 129 | 'sum(rate(thanos_memcached_operation_duration_seconds_count{kubernetes_namespace="cortex"}[1m])) by (name, operation)' 130 | ), 131 | ( 132 | '{{name}} {{operation}} {{reason}}', 133 | 'sum(rate(thanos_memcached_operation_failures_total{kubernetes_namespace="cortex"}[1m])) by (name, operation, reason) > 0' 134 | ), 135 | ], 136 | yAxes=G.single_y_axis(format=G.OPS_FORMAT), 137 | ), 138 | common.PromGraph( 139 | title="Memcache read latency (blocks)", 140 | expressions=[ 141 | ( 142 | '99% {{name}}', 143 | 'histogram_quantile(0.99, sum(rate(thanos_memcached_operation_duration_seconds_bucket{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m])) by (le, name))' 144 | ), 145 | ( 146 | 'Mean', 147 | 'sum(rate(thanos_memcached_operation_duration_seconds_sum{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m])) / sum(rate(thanos_memcached_operation_duration_seconds_count{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m]))' 148 | ), 149 | ], 150 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 151 | ), 152 | ], 153 | ), 154 | G.Row( 155 | title="Memcache (chunks)", 156 | panels=[ 157 | common.StatusQPSGraph( 158 | common.PROMETHEUS, "Memcache read QPS (chunks)", 159 | 'sum by (job,status_code)(rate(cortex_memcache_request_duration_seconds_count{method="Memcache.GetMulti", job=~"cortex/querier|cortex/query-frontend"}[1m]))' 160 | ), 161 | common.PromGraph( 162 | title="Memcache read latency (chunks)", 163 | expressions=[ 164 | ( 165 | '99% {{name}}', 166 | 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m])) by (le, name))' 167 | ), 168 | ( 169 | 'Mean', 170 | 'sum(rate(cortex_memcache_request_duration_seconds_sum{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m])) / sum(rate(cortex_memcache_request_duration_seconds_count{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m]))' 171 | ), 172 | ], 173 | yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), 174 | ), 175 | ], 176 | ), 177 | G.Row( 178 | title="Cache", 179 | panels=[ 180 | common.PromGraph( 181 | title="Querier Cache hit rate", 182 | expressions=[ 183 | ( 184 | '{{name}}', 185 | 
'sum(rate(cortex_cache_hits{job="cortex/querier"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/querier"}[2m])) by (name)' 186 | ), 187 | ], 188 | yAxes=common.PercentageAxes(), 189 | ), 190 | common.PromGraph( 191 | title="Query-frontend cache hit rate", 192 | expressions=[ 193 | ( 194 | '{{name}}', 195 | 'sum(rate(cortex_cache_hits{job="cortex/query-frontend"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/query-frontend"}[2m])) by (name)' 196 | ), 197 | ], 198 | yAxes=common.PercentageAxes(), 199 | ), 200 | ], 201 | ), 202 | G.Row( 203 | title="S3", 204 | collapse=True, 205 | panels=[ 206 | common.StatusQPSGraph( 207 | common.PROMETHEUS, "S3 read QPS", 208 | 'rate(cortex_s3_request_duration_seconds_count{operation="S3.GetObject", job=~"cortex/.*"}[1m])' 209 | ), 210 | common.PromGraph( 211 | title="S3 read latency", 212 | expressions=[ 213 | ( 214 | '99th centile', 215 | 'histogram_quantile(0.99, sum(rate(cortex_s3_request_duration_seconds_bucket{job=~"cortex/.*", operation="S3.GetObject"}[2m])) by (le)) * 1e3' 216 | ), 217 | ( 218 | '50th centile', 219 | 'histogram_quantile(0.5, sum(rate(cortex_s3_request_duration_seconds_bucket{job=~"cortex/.*", operation="S3.GetObject"}[2m])) by (le)) * 1e3' 220 | ), 221 | ( 222 | 'Mean', 223 | 'sum(rate(cortex_s3_request_duration_seconds_sum{job=~"cortex/.*", operation="S3.PutObject"}[2m])) * 1e3/ sum(rate(cortex_s3_request_duration_seconds_count{job=~"cortex/.*", operation="S3.GetObject"}[2m]))' 224 | ), 225 | ], 226 | yAxes=common.LATENCY_AXES, 227 | ), 228 | ], 229 | ), 230 | ], 231 | ) 232 | -------------------------------------------------------------------------------- /cortex-services-write.dashboard.py: -------------------------------------------------------------------------------- 1 | # -*- mode: python; python-indent-offset: 2 -*- 2 | 3 | import grafanalib.core as G 4 | 5 | import sys, os 6 | sys.path.append(os.path.dirname(__file__)) 7 | import common 8 | 9 | dashboard = common.Dashboard( 10 | uid='writes', 11 | title="Cortex > Services (Writes)", 12 | rows=[ 13 | G.Row( 14 | title="Retrieval Stats", 15 | collapse=True, 16 | panels=[ 17 | common.PromGraph( 18 | title="Retrieval sent batches", 19 | expressions=[ 20 | ( 21 | '{{url}}', 22 | 'sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count[1m])) by (url)' 23 | ), 24 | ], 25 | ), 26 | common.PromGraph( 27 | title="Retrieval batch latency", 28 | expressions=[ 29 | ( 30 | '{{url}} 99th', 31 | 'histogram_quantile(0.99, sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket[2m])) by (url, le)) * 1e3' 32 | ), 33 | ( 34 | '{{url}} 50th', 35 | 'histogram_quantile(0.50, sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket[2m])) by (url, le)) * 1e3' 36 | ), 37 | ( 38 | '{{url}} mean', 39 | '(sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum[2m])) by (url) / sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count[2m])) by (url)) * 1e3' 40 | ), 41 | ], 42 | yAxes=common.LATENCY_AXES, 43 | ), 44 | common.PromGraph( 45 | title="Retrieval sent samples", 46 | expressions=[ 47 | ( 48 | '{{url}} success', 49 | 'sum(rate(prometheus_remote_storage_succeeded_samples_total[1m])) by (url)' 50 | ), 51 | ('{{url}} dropped', 'sum(rate(prometheus_remote_storage_dropped_samples_total[1m])) by (url)'), 52 | ('{{url}} retried', 'sum(rate(prometheus_remote_storage_retried_samples_total[1m])) by (url)'), 53 | ('{{url}} failure', 'sum(rate(prometheus_remote_storage_failed_samples_total[1m])) by (url)'), 
54 | ], 55 | ), 56 | common.PromGraph( 57 | title="Queue", 58 | expressions=[ 59 | ('{{url}}: queue length', 'sum(prometheus_remote_storage_pending_samples) by (url)'), 60 | ( 61 | '{{url}}: lag', 62 | 'max(time()-prometheus_remote_storage_queue_highest_sent_timestamp_seconds) by (url)' 63 | ), 64 | ('{{url}}: shards', 'max(prometheus_remote_storage_shards) by (url)'), 65 | ], 66 | ), 67 | ], 68 | ), 69 | G.Row( 70 | title="Distributor", 71 | panels=[ 72 | common.StatusQPSGraph( 73 | common.PROMETHEUS, "Distributor write QPS", 74 | 'rate(cortex_request_duration_seconds_count{job="cortex/distributor"}[1m])' 75 | ), 76 | common.LatencyGraph("cortex", "Distributor Write", "cortex/distributor"), 77 | ], 78 | ), 79 | G.Row( 80 | title="Distributor breakdown", 81 | collapse=True, 82 | panels=[ 83 | common.PromGraph( 84 | title="Distributor Error Rate", 85 | expressions=[ 86 | ( 87 | '{{instance}}', 88 | 'sum by (instance)(rate(cortex_request_duration_seconds_count{job="cortex/distributor", status_code =~ "5.."}[1m]))' 89 | ), 90 | ], 91 | ), 92 | common.PromGraph( 93 | title="Distributor write latency", 94 | expressions=[ 95 | ( 96 | '99th centile {{instance}}', 97 | 'histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{job="cortex/distributor"}[2m])) by (instance,le)) * 1e3' 98 | ), 99 | ], 100 | yAxes=common.LATENCY_AXES, 101 | ), 102 | ], 103 | ), 104 | G.Row( 105 | title="Distributor sends", 106 | collapse=True, 107 | panels=[ 108 | common.StatusQPSGraph( 109 | common.PROMETHEUS, "Distributor send QPS", 110 | 'rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/distributor",operation="/cortex.Ingester/Push"}[1m])' 111 | ), 112 | common.PromGraph( 113 | title="Distributor send latency", 114 | expressions=[ 115 | ( 116 | '99th centile', 117 | 'histogram_quantile(0.99, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 118 | ), 119 | ( 120 | '50th centile', 121 | 'histogram_quantile(0.50, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' 122 | ), 123 | ( 124 | 'Mean', 125 | 'sum(rate(cortex_ingester_client_request_duration_seconds_sum{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) * 1e3 / sum(rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m]))' 126 | ), 127 | ], 128 | yAxes=common.LATENCY_AXES, 129 | ), 130 | ], 131 | ), 132 | G.Row( 133 | title="Samples", 134 | collapse=True, 135 | panels=[ 136 | common.PromGraph( 137 | title="Push sample ingest rate by instance (>1%)", 138 | expressions=[ 139 | ( 140 | '{{user}}', 141 | 'sum by (user)(rate(cortex_distributor_received_samples_total{job="cortex/distributor"}[1m])) > ignoring(user) group_left() (sum(rate(cortex_distributor_received_samples_total{job="cortex/distributor"}[1m]))/100)' 142 | ), 143 | ], 144 | legend=G.Legend(show=False), 145 | yAxes=common.OPS_AXIS, 146 | ), 147 | common.PromGraph( 148 | title="Rule sample ingest rate by instance", 149 | expressions=[ 150 | ( 151 | '{{user}}', 152 | # '> 1' is to exclude instances which are not connected and simply alerting on absent metrics 153 | 'sum by (user)(rate(cortex_distributor_received_samples_total{job="cortex/ruler"}[1m])) > 1' 154 | ), 155 | ], 156 | legend=G.Legend(show=False), 157 | yAxes=common.OPS_AXIS, 158 | ), 159 | common.PromGraph( 160 | title="Sample discard 
rate by instance ID & reason", 161 | expressions=[ 162 | ( 163 | '{{user}} - {{reason}} ', 164 | 'sum by (user, reason) (rate(cortex_discarded_samples_total{reason!="duplicate-sample"}[1m])) > 0' 165 | ), 166 | ], 167 | yAxes=common.OPS_AXIS, 168 | ), 169 | ], 170 | ), 171 | G.Row( 172 | title="Ingester", 173 | panels=[ 174 | common.StatusQPSGraph( 175 | common.PROMETHEUS, "Ingester write QPS", 176 | 'rate(cortex_request_duration_seconds_count{job="cortex/ingester"}[1m])' 177 | ), 178 | common.PromGraph( 179 | title="Ingester write latency", 180 | expressions=[ 181 | ( 182 | '99th centile', 183 | 'job_route:cortex_request_duration_seconds:99quantile{job="cortex/ingester", route="/cortex.Ingester/Push"} * 1e3' 184 | ), 185 | ( 186 | '50th centile', 187 | 'job_route:cortex_request_duration_seconds:50quantile{job="cortex/ingester", route="/cortex.Ingester/Push"} * 1e3' 188 | ), 189 | ( 190 | 'Mean', 191 | 'sum(rate(cortex_request_duration_seconds_sum{job="cortex/ingester", route="/cortex.Ingester/Push"}[2m])) * 1e3 / sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route="/cortex.Ingester/Push"}[2m]))' 192 | ), 193 | ], 194 | yAxes=common.LATENCY_AXES, 195 | ), 196 | ], 197 | ), 198 | G.Row( 199 | title="DynamoDB", 200 | panels=[ 201 | common.PromGraph( 202 | title="DynamoDB write QPS", 203 | expressions=[ 204 | ( 205 | 'BatchWriteItem {{job}}: {{status_code}}', 206 | 'sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[1m])) by (job, status_code)' 207 | ), 208 | ], 209 | ), 210 | common.PromGraph( 211 | title="DynamoDB write latency", 212 | expressions=[ 213 | ( 214 | 'BatchWriteItem: 99th', 215 | 'histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) by (le)) * 1e3' 216 | ), ( 217 | 'BatchWriteItem: 50th', 218 | 'histogram_quantile(0.5, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) by (le)) * 1e3' 219 | ), ( 220 | 'BatchWriteItem: Mean', 221 | 'sum(rate(cortex_dynamo_request_duration_seconds_sum{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) * 1e3 / sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m]))' 222 | ) 223 | ], 224 | yAxes=common.LATENCY_AXES, 225 | ), 226 | ], 227 | ), 228 | G.Row( 229 | title="DynamoDB", 230 | panels=[ 231 | common.PromGraph( 232 | title="DynamoDB write capacity consumed [rate1m]", 233 | expressions=[ 234 | ( 235 | '{{table}} consumed', 236 | 'sum(rate(cortex_dynamo_consumed_capacity_total{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0' 237 | ), 238 | ( 239 | '{{table}} provisioned', 240 | 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 241 | ), 242 | ( 243 | '{{table}} provisioned', 244 | 'max(cortex_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' 245 | ), 246 | ], 247 | yAxes=common.OPS_AXIS, 248 | ), 249 | common.PromGraph( 250 | title="DynamoDB write errors", 251 | expressions=[ 252 | ( 253 | '{{table}} - {{error}}', 254 | 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 255 | ), 256 | ( 257 | '{{table}} - Throttled', 258 | 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' 259 | ), 260 | ], 261 | yAxes=common.OPS_AXIS, 
262 | ), 263 | ], 264 | ), 265 | G.Row( 266 | title="Memcache", 267 | panels=[ 268 | common.PromGraph( 269 | title="Ingester hit rate", 270 | expressions=[ 271 | ( 272 | '{{name}}', 273 | 'sum(rate(cortex_cache_hits{job="cortex/ingester"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/ingester"}[2m])) by (name)' 274 | ), 275 | ], 276 | yAxes=common.PercentageAxes(), 277 | ), 278 | common.PromGraph( 279 | title="Memcache QPS", 280 | expressions=[ 281 | ( 282 | '{{method}} {{status_code}}', 283 | 'sum(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ingester"}[1m])) by (method,status_code)' 284 | ), 285 | ], 286 | ), 287 | common.PromGraph( 288 | title="Memcache latency", 289 | expressions=[ 290 | ( 291 | '{{method}} 99th centile', 292 | 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ingester"}[2m])) by (le,method)) * 1e3' 293 | ), 294 | ( 295 | '{{method}} 50th centile', 296 | 'histogram_quantile(0.5, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ingester"}[2m])) by (le,method)) * 1e3' 297 | ), 298 | ( 299 | '{{method}} Mean', 300 | 'sum by (method)(rate(cortex_memcache_request_duration_seconds_sum{job="cortex/ingester"}[2m])) * 1e3 / sum by (method)(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ingester"}[2m]))' 301 | ), 302 | ], 303 | yAxes=common.LATENCY_AXES, 304 | ), 305 | ], 306 | ), 307 | ], 308 | ) 309 | --------------------------------------------------------------------------------