├── .github └── workflows │ ├── cron.yaml │ └── tests.yaml ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── assets ├── MSSQL │ ├── alerts.yaml │ ├── dashboards │ │ ├── mssql-overview.json │ │ └── mssql-pages.json │ └── rules.yaml ├── aerospike │ ├── alerts.yaml │ ├── dashboards │ │ ├── aerospike-instance-overview.json │ │ ├── aerospike-logs.json │ │ ├── aerospike-namespace-overview.json │ │ └── aerospike-overview.json │ └── rules.yaml ├── alertmanager │ ├── alerts.yaml │ ├── dashboards │ │ └── alertmanager-overview.json │ └── rules.yaml ├── apache-activemq │ ├── alerts.yaml │ ├── dashboards │ │ ├── apache-activemq-cluster-overview.json │ │ ├── apache-activemq-instance-overview.json │ │ ├── apache-activemq-logs.json │ │ ├── apache-activemq-queue-overview.json │ │ └── apache-activemq-topic-overview.json │ └── rules.yaml ├── apache-airflow │ ├── alerts.yaml │ ├── dashboards │ │ └── apache-airflow-overview.json │ └── rules.yaml ├── apache-camel │ ├── alerts.yaml │ ├── dashboards │ │ └── apache-camel-micrometer.json │ └── rules.yaml ├── apache-cassandra │ ├── alerts.yaml │ ├── dashboards │ │ ├── cassandra-keyspaces.json │ │ ├── cassandra-nodes.json │ │ └── cassandra-overview.json │ └── rules.yaml ├── apache-couchdb │ ├── alerts.yaml │ ├── dashboards │ │ ├── couchdb-nodes.json │ │ └── couchdb-overview.json │ └── rules.yaml ├── apache-hadoop │ ├── alerts.yaml │ ├── dashboards │ │ ├── apache-hadoop-datanode-overview.json │ │ ├── apache-hadoop-namenode-overview.json │ │ ├── apache-hadoop-nodemanager-overview.json │ │ └── apache-hadoop-resourcemanager-overview.json │ └── rules.yaml ├── apache-hbase │ ├── alerts.yaml │ ├── dashboards │ │ ├── apache-hbase-cluster-overview.json │ │ ├── apache-hbase-logs.json │ │ └── apache-hbase-regionserver-overview.json │ └── rules.yaml ├── apache-http │ ├── alerts.yaml │ ├── dashboards │ │ └── apache-http.json │ └── rules.yaml ├── apache-mesos │ ├── alerts.yaml │ ├── dashboards │ │ └── apache-mesos-overview.json │ └── rules.yaml ├── apache-solr │ ├── alerts.yaml │ ├── dashboards │ │ ├── apache-solr-cluster-overview.json │ │ ├── apache-solr-logs-overview.json │ │ ├── apache-solr-query-performance.json │ │ └── apache-solr-resource-monitoring.json │ └── rules.yaml ├── apache-tomcat │ ├── alerts.yaml │ ├── dashboards │ │ ├── apache-tomcat-hosts.json │ │ └── apache-tomcat-overview.json │ └── rules.yaml ├── argo-cd-2 │ ├── alerts.yaml │ ├── dashboards │ │ ├── argo-cd-application-overview.json │ │ ├── argo-cd-notifications-overview.json │ │ └── argo-cd-operational-overview.json │ └── rules.yaml ├── argocd │ ├── alerts.yaml │ ├── dashboards │ │ └── argocd-overview.json │ └── rules.yaml ├── asterisk │ ├── alerts.yaml │ ├── dashboards │ │ ├── asterisk-logs.json │ │ └── asterisk-overview.json │ └── rules.yaml ├── awx │ ├── alerts.yaml │ ├── dashboards │ │ └── awx.json │ └── rules.yaml ├── blackbox_exporter │ ├── alerts.yaml │ ├── dashboards │ │ └── blackbox-exporter.json │ └── rules.yaml ├── caddy │ ├── alerts.yaml │ ├── dashboards │ │ └── caddy-overview.json │ └── rules.yaml ├── celery │ ├── alerts.yaml │ ├── dashboards │ │ ├── celery-tasks-by-task.json │ │ └── celery-tasks-overview.json │ └── rules.yaml ├── ceph │ ├── alerts.yaml │ ├── dashboards │ │ ├── ceph-cluster-advanced.json │ │ ├── ceph-nvmeof-performance.json │ │ ├── ceph-nvmeof.json │ │ ├── cephfs-overview.json │ │ ├── cephfsdashboard.json │ │ ├── host-details.json │ │ ├── hosts-overview.json │ │ ├── multi-cluster-overview.json │ │ ├── osd-device-details.json │ │ ├── osds-overview.json │ │ 
├── pool-detail.json │ │ ├── pool-overview.json │ │ ├── radosgw-detail.json │ │ ├── radosgw-overview.json │ │ ├── radosgw-sync-overview.json │ │ ├── rbd-details.json │ │ ├── rbd-overview.json │ │ ├── rgw-s3-analytics.json │ │ └── smb-overview.json │ └── rules.yaml ├── cert-manager │ ├── alerts.yaml │ ├── dashboards │ │ └── overview.json │ └── rules.yaml ├── cilium-enterprise │ ├── alerts.yaml │ ├── dashboards │ │ ├── cilium-L3-policy.json │ │ ├── cilium-L7-proxy.json │ │ ├── cilium-agent-overview.json │ │ ├── cilium-agent.json │ │ ├── cilium-api.json │ │ ├── cilium-bpf.json │ │ ├── cilium-conntrack.json │ │ ├── cilium-datapath.json │ │ ├── cilium-external-fqdn-proxy.json │ │ ├── cilium-fqdn-proxy.json │ │ ├── cilium-identities.json │ │ ├── cilium-kubernetes.json │ │ ├── cilium-network.json │ │ ├── cilium-nodes.json │ │ ├── cilium-operator.json │ │ ├── cilium-overview.json │ │ ├── cilium-policy.json │ │ ├── cilium-resource-utilization.json │ │ ├── hubble-overview.json │ │ └── hubble-timescape.json │ └── rules.yaml ├── clickhouse │ ├── alerts.yaml │ ├── dashboards │ │ ├── clickhouse-latency.json │ │ ├── clickhouse-logs.json │ │ ├── clickhouse-overview.json │ │ └── clickhouse-replica.json │ └── rules.yaml ├── cloudflare │ ├── alerts.yaml │ ├── dashboards │ │ ├── cloudflare-geomap-overview.json │ │ ├── cloudflare-worker-overview.json │ │ └── cloudflare-zone-overview.json │ └── rules.yaml ├── confluent-kafka │ ├── alerts.yaml │ ├── dashboards │ │ └── confluent-kafka-overview.json │ └── rules.yaml ├── consul │ ├── alerts.yaml │ ├── dashboards │ │ └── consul-overview.json │ └── rules.yaml ├── coredns │ ├── alerts.yaml │ ├── dashboards │ │ └── coredns.json │ └── rules.yaml ├── cortex │ ├── alerts.yaml │ ├── dashboards │ │ ├── alertmanager.json │ │ ├── cortex-compactor-resources.json │ │ ├── cortex-compactor.json │ │ ├── cortex-config.json │ │ ├── cortex-object-store.json │ │ ├── cortex-queries.json │ │ ├── cortex-reads.json │ │ ├── cortex-rollout-progress.json │ │ ├── cortex-scaling.json │ │ ├── cortex-slow-queries.json │ │ ├── cortex-writes.json │ │ └── ruler.json │ └── rules.yaml ├── couchbase │ ├── alerts.yaml │ ├── dashboards │ │ ├── couchbase-bucket-overview.json │ │ ├── couchbase-cluster-overview.json │ │ └── couchbase-node-overview.json │ └── rules.yaml ├── discourse │ ├── alerts.yaml │ ├── dashboards │ │ ├── discourse-jobs.json │ │ └── discourse-overview.json │ └── rules.yaml ├── django │ ├── alerts.yaml │ ├── dashboards │ │ ├── django-overview.json │ │ ├── django-requests-by-view.json │ │ └── django-requests-overview.json │ └── rules.yaml ├── docker │ ├── alerts.yaml │ ├── dashboards │ │ └── docker.json │ └── rules.yaml ├── elasticsearch │ ├── alerts.yaml │ └── rules.yaml ├── envoy │ ├── alerts.yaml │ ├── dashboards │ │ └── envoy-overview.json │ └── rules.yaml ├── etcd │ ├── alerts.yaml │ ├── dashboards │ │ └── etcd.json │ └── rules.yaml ├── f5-bigip │ ├── alerts.yaml │ ├── dashboards │ │ ├── bigip-cluster-overview.json │ │ ├── bigip-node-overview.json │ │ ├── bigip-pool-overview.json │ │ └── bigip-virtual-server-overview.json │ └── rules.yaml ├── gitea │ ├── alerts.yaml │ ├── dashboards │ │ └── gitea-overview.json │ └── rules.yaml ├── gitlab │ ├── alerts.yaml │ ├── dashboards │ │ └── gitlab-overview.json │ └── rules.yaml ├── gluster │ ├── alerts.yaml │ ├── dashboards │ │ └── k8s-storage-resources-glusterfs-pv.json │ └── rules.yaml ├── go-runtime │ ├── alerts.yaml │ ├── dashboards │ │ └── go-runtime.json │ └── rules.yaml ├── grafana │ ├── alerts.yaml │ ├── dashboards │ │ └── 
grafana-overview.json │ └── rules.yaml ├── haproxy │ ├── alerts.yaml │ ├── dashboards │ │ ├── haproxy-backend.json │ │ ├── haproxy-frontend.json │ │ ├── haproxy-overview.json │ │ └── haproxy-server.json │ └── rules.yaml ├── harbor │ ├── alerts.yaml │ ├── dashboards │ │ └── harbor-overview.json │ └── rules.yaml ├── hass │ ├── alerts.yaml │ ├── dashboards │ │ └── hass.json │ └── rules.yaml ├── ibm-mq │ ├── alerts.yaml │ ├── dashboards │ │ ├── ibm-mq-cluster-overview.json │ │ ├── ibm-mq-queue-manager-overview.json │ │ ├── ibm-mq-queue-overview.json │ │ └── ibm-mq-topics-overview.json │ └── rules.yaml ├── influxdb │ ├── alerts.yaml │ ├── dashboards │ │ ├── influxdb-cluster-overview.json │ │ ├── influxdb-instance-overview.json │ │ └── influxdb-logs.json │ └── rules.yaml ├── ingress-nginx-mixin │ ├── alerts.yaml │ ├── dashboards │ │ ├── ingress-nginx-overview.json │ │ └── ingress-nginx-request-handling-performance.json │ └── rules.yaml ├── istio │ ├── alerts.yaml │ ├── dashboards │ │ ├── logs │ │ ├── overview │ │ ├── servicesOverview │ │ └── workloadsOverview │ └── rules.yaml ├── jaeger │ ├── alerts.yaml │ ├── dashboards │ │ ├── jaeger-read.json │ │ └── jaeger-write.json │ └── rules.yaml ├── jenkins │ ├── alerts.yaml │ ├── dashboards │ │ └── jenkins.json │ └── rules.yaml ├── jira │ ├── alerts.yaml │ ├── dashboards │ │ └── jira-overview.json │ └── rules.yaml ├── jvm │ ├── alerts.yaml │ ├── dashboards │ │ └── jvm-dashboard.json │ └── rules.yaml ├── kafka │ ├── alerts.yaml │ ├── dashboards │ │ ├── connect-overview.json │ │ ├── kafka-ksqldb-overview.json │ │ ├── kafka-overview-dashboard.json │ │ ├── kafka-topic-dashboard.json │ │ ├── schema-registry-overview.json │ │ └── zookeeper-overview.json │ └── rules.yaml ├── kube-cockroachdb │ ├── alerts.yaml │ └── rules.yaml ├── kube-state-metrics │ ├── alerts.yaml │ └── rules.yaml ├── kubernetes-autoscaling │ ├── alerts.yaml │ ├── dashboards │ │ ├── kubernetes-autoscaling-mixin-ca.json │ │ ├── kubernetes-autoscaling-mixin-hpa.json │ │ ├── kubernetes-autoscaling-mixin-karpenter-act.json │ │ ├── kubernetes-autoscaling-mixin-karpenter-over.json │ │ ├── kubernetes-autoscaling-mixin-karpenter-perf.json │ │ ├── kubernetes-autoscaling-mixin-pdb.json │ │ └── kubernetes-autoscaling-mixin-vpa.json │ └── rules.yaml ├── kubernetes │ ├── alerts.yaml │ ├── dashboards │ │ ├── apiserver.json │ │ ├── cluster-total.json │ │ ├── controller-manager.json │ │ ├── k8s-resources-cluster.json │ │ ├── k8s-resources-namespace.json │ │ ├── k8s-resources-node.json │ │ ├── k8s-resources-pod.json │ │ ├── k8s-resources-windows-cluster.json │ │ ├── k8s-resources-windows-namespace.json │ │ ├── k8s-resources-windows-pod.json │ │ ├── k8s-resources-workload.json │ │ ├── k8s-resources-workloads-namespace.json │ │ ├── k8s-windows-cluster-rsrc-use.json │ │ ├── k8s-windows-node-rsrc-use.json │ │ ├── kubelet.json │ │ ├── namespace-by-pod.json │ │ ├── namespace-by-workload.json │ │ ├── persistentvolumesusage.json │ │ ├── pod-total.json │ │ ├── proxy.json │ │ ├── scheduler.json │ │ └── workload-total.json │ └── rules.yaml ├── loki │ ├── alerts.yaml │ ├── dashboards │ │ ├── loki-bloom-build.json │ │ ├── loki-bloom-gateway.json │ │ ├── loki-chunks.json │ │ ├── loki-deletion.json │ │ ├── loki-logs.json │ │ ├── loki-mixin-recording-rules.json │ │ ├── loki-operational.json │ │ ├── loki-reads-resources.json │ │ ├── loki-reads.json │ │ ├── loki-retention.json │ │ ├── loki-writes-resources.json │ │ ├── loki-writes.json │ │ └── loki_thanos_object_storage.json │ └── rules.yaml ├── memcached │ ├── alerts.yaml │ 
├── dashboards │ │ └── memcached-overview.json │ └── rules.yaml ├── microsoft-iis │ ├── alerts.yaml │ ├── dashboards │ │ ├── microsoft-iis-applications.json │ │ └── microsoft-iis-overview.json │ └── rules.yaml ├── mongodb-atlas │ ├── alerts.yaml │ ├── dashboards │ │ ├── mongodb-atlas-cluster-overview.json │ │ ├── mongodb-atlas-elections-overview.json │ │ ├── mongodb-atlas-operations-overview.json │ │ └── mongodb-atlas-performance-overview.json │ └── rules.yaml ├── mongodb │ ├── alerts.yaml │ ├── dashboards │ │ ├── MongoDB_Cluster.json │ │ ├── MongoDB_Instance.json │ │ └── MongoDB_ReplicaSet.json │ └── rules.yaml ├── mysql │ ├── alerts.yaml │ ├── dashboards │ │ └── mysql-overview.json │ └── rules.yaml ├── nginx │ ├── alerts.yaml │ ├── dashboards │ │ ├── nginx-logs.json │ │ └── nginx-metrics.json │ └── rules.yaml ├── node-exporter │ ├── alerts.yaml │ ├── dashboards │ │ ├── node-cluster-rsrc-use.json │ │ ├── node-rsrc-use.json │ │ ├── nodes-aix.json │ │ ├── nodes-darwin.json │ │ └── nodes.json │ └── rules.yaml ├── nodejs │ ├── alerts.yaml │ ├── dashboards │ │ └── nodejs-overview.json │ └── rules.yaml ├── nomad │ ├── alerts.yaml │ ├── dashboards │ │ ├── nomad-cluster.json │ │ └── nomad-jobs.json │ └── rules.yaml ├── nsq │ ├── alerts.yaml │ ├── dashboards │ │ ├── nsq-instances.json │ │ └── nsq-topics.json │ └── rules.yaml ├── openldap │ ├── alerts.yaml │ ├── dashboards │ │ ├── logs │ │ └── overview │ └── rules.yaml ├── opensearch │ ├── alerts.yaml │ ├── dashboards │ │ ├── node-overview.json │ │ ├── opensearch-cluster-overview.json │ │ └── search-and-index-overview.json │ └── rules.yaml ├── openstack │ ├── alerts.yaml │ ├── dashboards │ │ ├── cinder │ │ ├── logs │ │ ├── neutron │ │ ├── nova │ │ └── overview │ └── rules.yaml ├── oracledb │ ├── alerts.yaml │ ├── dashboards │ │ └── oracledb-overview.json │ └── rules.yaml ├── pgbouncer │ ├── alerts.yaml │ ├── dashboards │ │ ├── clusterOverview │ │ ├── logs │ │ └── overview │ └── rules.yaml ├── postgres-exporter │ ├── alerts.yaml │ ├── dashboards │ │ └── postgres-overview.json │ └── rules.yaml ├── presto │ ├── alerts.yaml │ ├── dashboards │ │ ├── presto-coordinator.json │ │ ├── presto-logs.json │ │ ├── presto-overview.json │ │ └── presto-worker.json │ └── rules.yaml ├── prometheus-operator │ ├── alerts.yaml │ └── rules.yaml ├── prometheus │ ├── alerts.yaml │ ├── dashboards │ │ ├── prometheus-remote-write.json │ │ └── prometheus.json │ └── rules.yaml ├── promscale │ ├── alerts.yaml │ ├── dashboards │ │ ├── apm-dependencies.json │ │ ├── apm-home.json │ │ ├── apm-service-dependencies-downstream.json │ │ ├── apm-service-dependencies-upstream.json │ │ ├── apm-service-overview.json │ │ └── promscale.json │ └── rules.yaml ├── promtail │ ├── alerts.yaml │ ├── dashboards │ │ └── promtail.json │ └── rules.yaml ├── python-runtime │ ├── alerts.yaml │ ├── dashboards │ │ └── python-runtime.json │ └── rules.yaml ├── rabbitmq │ ├── alerts.yaml │ ├── dashboards │ │ ├── erlang-memory-allocators.json │ │ └── rabbitmq-overview.json │ └── rules.yaml ├── rclone │ ├── alerts.yaml │ ├── dashboards │ │ └── rclone.json │ └── rules.yaml ├── redis │ ├── alerts.yaml │ ├── dashboards │ │ └── redis_overview │ └── rules.yaml ├── ruby │ ├── alerts.yaml │ ├── dashboards │ │ └── ruby-overview.json │ └── rules.yaml ├── sap-hana │ ├── alerts.yaml │ ├── dashboards │ │ ├── sap-hana-instance-overview.json │ │ └── sap-hana-system-overview.json │ └── rules.yaml ├── sealed-secrets │ ├── alerts.yaml │ ├── dashboards │ │ └── sealed-secrets-controller.json │ └── rules.yaml ├── snmp │ ├── 
alerts.yaml │ ├── dashboards │ │ ├── snmp-fleet.json │ │ ├── snmp-logs.json │ │ └── snmp-overview.json │ └── rules.yaml ├── spark │ ├── alerts.yaml │ ├── dashboards │ │ └── spark-metrics.json │ └── rules.yaml ├── spinnaker │ ├── alerts.yaml │ ├── dashboards │ │ ├── clouddriver.json │ │ ├── deck.json │ │ ├── echo.json │ │ ├── fiat.json │ │ ├── front50.json │ │ ├── gate.json │ │ ├── igor.json │ │ ├── orca.json │ │ ├── rosco.json │ │ ├── spinnaker-application-details.json │ │ ├── spinnaker-aws-platform.json │ │ ├── spinnaker-google-platform.json │ │ ├── spinnaker-key-metrics.json │ │ ├── spinnaker-kubernetes-platform.json │ │ └── spinnaker-minimalist.json │ └── rules.yaml ├── spring-boot │ ├── alerts.yaml │ ├── dashboards │ │ └── jvm-dashboard.json │ └── rules.yaml ├── squid │ ├── alerts.yaml │ ├── dashboards │ │ └── squid-overview.json │ └── rules.yaml ├── supabase │ ├── alerts.yaml │ ├── dashboards │ │ └── supabase.json │ └── rules.yaml ├── tensorflow │ ├── alerts.yaml │ ├── dashboards │ │ └── tensorflow-overview.json │ └── rules.yaml ├── thanos │ ├── alerts.yaml │ ├── dashboards │ │ ├── bucket-replicate.json │ │ ├── compact.json │ │ ├── overview.json │ │ ├── query-frontend.json │ │ ├── query.json │ │ ├── receive.json │ │ ├── rule.json │ │ ├── sidecar.json │ │ └── store.json │ └── rules.yaml ├── traefik │ ├── alerts.yaml │ ├── dashboards │ │ └── traefikdash.json │ └── rules.yaml ├── ubnt-edgerouter │ ├── alerts.yaml │ ├── dashboards │ │ └── ubnt-edgrouterx-overview.json │ └── rules.yaml ├── varnish │ ├── alerts.yaml │ ├── dashboards │ │ └── varnish-overview.json │ └── rules.yaml ├── vault │ ├── alerts.yaml │ ├── dashboards │ │ └── vault.json │ └── rules.yaml ├── velero │ ├── alerts.yaml │ ├── dashboards │ │ ├── clusterOverview │ │ ├── logs │ │ └── overview │ └── rules.yaml ├── wildfly │ ├── alerts.yaml │ ├── dashboards │ │ ├── wildfly-datasource.json │ │ └── wildfly-overview.json │ └── rules.yaml ├── windows-active-directory │ ├── alerts.yaml │ ├── dashboards │ │ ├── activedirectory │ │ └── logs │ └── rules.yaml ├── windows │ ├── alerts.yaml │ ├── dashboards │ │ ├── disks │ │ ├── fleet │ │ ├── logs │ │ ├── overview │ │ └── system │ └── rules.yaml ├── wso2-enterprise-integrator │ ├── alerts.yaml │ ├── dashboards │ │ ├── API_Metrics.json │ │ ├── Cluster_Metrics.json │ │ ├── Inbound_Endpoint_Metrics.json │ │ ├── Node_Metrics.json │ │ └── Proxy_Service_Metrics.json │ └── rules.yaml └── wso2-streaming-integrator │ ├── alerts.yaml │ ├── dashboards │ ├── Siddhi_aggregation.json │ ├── Siddhi_ondemandquery.json │ ├── Siddhi_overall.json │ ├── Siddhi_query.json │ ├── Siddhi_server.json │ ├── Siddhi_sink.json │ ├── Siddhi_source.json │ ├── Siddhi_stream.json │ ├── Siddhi_table.json │ ├── StreamingIntegrator_apps.json │ └── StreamingIntegrator_overall.json │ └── rules.yaml ├── hack ├── generate.sh ├── go.mod ├── go.sum └── tools.go ├── mixins.json ├── netlify.toml └── site ├── config.yaml ├── content ├── MSSQL │ └── _index.md ├── _index.md ├── aerospike │ └── _index.md ├── alertmanager │ └── _index.md ├── apache-activemq │ └── _index.md ├── apache-airflow │ └── _index.md ├── apache-camel │ └── _index.md ├── apache-cassandra │ └── _index.md ├── apache-couchdb │ └── _index.md ├── apache-hadoop │ └── _index.md ├── apache-hbase │ └── _index.md ├── apache-http │ └── _index.md ├── apache-mesos │ └── _index.md ├── apache-solr │ └── _index.md ├── apache-tomcat │ └── _index.md ├── argo-cd-2 │ └── _index.md ├── argocd │ └── _index.md ├── asterisk │ └── _index.md ├── awx │ └── _index.md ├── blackbox_exporter │ 
└── _index.md ├── caddy │ └── _index.md ├── celery │ └── _index.md ├── ceph │ └── _index.md ├── cert-manager │ └── _index.md ├── cilium-enterprise │ └── _index.md ├── clickhouse │ └── _index.md ├── cloudflare │ └── _index.md ├── confluent-kafka │ └── _index.md ├── consul │ └── _index.md ├── coredns │ └── _index.md ├── cortex │ └── _index.md ├── couchbase │ └── _index.md ├── discourse │ └── _index.md ├── django │ └── _index.md ├── docker │ └── _index.md ├── elasticsearch │ └── _index.md ├── envoy │ └── _index.md ├── etcd │ └── _index.md ├── f5-bigip │ └── _index.md ├── gitea │ └── _index.md ├── gitlab │ └── _index.md ├── gluster │ └── _index.md ├── go-runtime │ └── _index.md ├── grafana │ └── _index.md ├── haproxy │ └── _index.md ├── harbor │ └── _index.md ├── hass │ └── _index.md ├── ibm-mq │ └── _index.md ├── influxdb │ └── _index.md ├── ingress-nginx-mixin │ └── _index.md ├── istio │ └── _index.md ├── jaeger │ └── _index.md ├── jenkins │ └── _index.md ├── jira │ └── _index.md ├── jvm │ └── _index.md ├── kafka │ └── _index.md ├── kube-cockroachdb │ └── _index.md ├── kube-state-metrics │ └── _index.md ├── kubernetes-autoscaling │ └── _index.md ├── kubernetes │ └── _index.md ├── loki │ └── _index.md ├── memcached │ └── _index.md ├── microsoft-iis │ └── _index.md ├── mongodb-atlas │ └── _index.md ├── mongodb │ └── _index.md ├── mysql │ └── _index.md ├── nginx │ └── _index.md ├── node-exporter │ └── _index.md ├── nodejs │ └── _index.md ├── nomad │ └── _index.md ├── nsq │ └── _index.md ├── openldap │ └── _index.md ├── opensearch │ └── _index.md ├── openstack │ └── _index.md ├── oracledb │ └── _index.md ├── pgbouncer │ └── _index.md ├── postgres-exporter │ └── _index.md ├── presto │ └── _index.md ├── prometheus-operator │ └── _index.md ├── prometheus │ └── _index.md ├── promscale │ └── _index.md ├── promtail │ └── _index.md ├── python-runtime │ └── _index.md ├── rabbitmq │ └── _index.md ├── rclone │ └── _index.md ├── redis │ └── _index.md ├── ruby │ └── _index.md ├── sap-hana │ └── _index.md ├── sealed-secrets │ └── _index.md ├── snmp │ └── _index.md ├── spark │ └── _index.md ├── spinnaker │ └── _index.md ├── spring-boot │ └── _index.md ├── squid │ └── _index.md ├── supabase │ └── _index.md ├── tensorflow │ └── _index.md ├── thanos │ └── _index.md ├── traefik │ └── _index.md ├── ubnt-edgerouter │ └── _index.md ├── varnish │ └── _index.md ├── vault │ └── _index.md ├── velero │ └── _index.md ├── wildfly │ └── _index.md ├── windows-active-directory │ └── _index.md ├── windows │ └── _index.md ├── wso2-enterprise-integrator │ └── _index.md └── wso2-streaming-integrator │ └── _index.md ├── layouts └── _default │ └── baseof.html └── static └── mixins.json /.github/workflows/cron.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Daily content regeneration 3 | 4 | on: 5 | schedule: 6 | - cron: '3 3 * * *' 7 | 8 | jobs: 9 | update: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-go@v5 14 | with: 15 | go-version: '^1.23' 16 | - run: make generate 17 | - uses: EndBug/add-and-commit@v4 18 | with: 19 | add: 'assets/ site/content/' 20 | author_name: "github-actions[bot]" 21 | author_email: "github-actions@users.noreply.github.com" 22 | message: 'assets,site/content: daily assets regeneration' 23 | env: 24 | # This is necessary in order to push a commit to the repo 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | 28 | 29 | 
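The cron workflow above and the pull-request workflow below both run `make generate`, so the published assets can be reproduced locally before opening a pull request. A minimal sketch, assuming a POSIX shell, the requirements listed in the README further down (jq, make, git, golang), and a Go toolchain matching the `go-version: '^1.23'` pin used in the workflows; the repository URL and clone directory are placeholders:

    # clone the site together with its theme submodule (declared in .gitmodules below)
    git clone --recurse-submodules <repository-url> mixins-site   # <repository-url> is a placeholder
    cd mixins-site
    # `make generate` (also the default `all` target) builds jb, gojsontoyaml and jsonnet
    # into tmp/bin, then runs ./hack/generate.sh to regenerate assets/ and site/content/
    make generate

A clean local run should leave `git status` free of unexpected changes under assets/ and site/content/, mirroring what the daily cron job commits.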
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Pull request CI workflow
 3 |
 4 | on:
 5 |   pull_request:
 6 |     branches:
 7 |       - master
 8 |   push:
 9 |     branches:
10 |       - master
11 |
12 | jobs:
13 |   generate:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: actions/checkout@v2
17 |       - uses: actions/setup-go@v5
18 |         with:
19 |           go-version: '^1.23'
20 |       - run: make generate
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | tmp/
 2 | site/public
 3 | site/resources
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "site/themes/ace-documentation"]
 2 | 	path = site/themes/ace-documentation
 3 | 	url = https://github.com/vantagedesign/ace-documentation.git
 4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/bash -o pipefail
 2 |
 3 | BIN_DIR?=$(shell pwd)/tmp/bin
 4 |
 5 | JB_BIN=$(BIN_DIR)/jb
 6 | GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml
 7 | JSONNET_BIN=$(BIN_DIR)/jsonnet
 8 | TOOLING=$(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN)
 9 |
10 | .PHONY: all
11 | all: generate
12 |
13 | .PHONY: generate
14 | generate: $(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN)
15 | 	./hack/generate.sh
16 |
17 | $(BIN_DIR):
18 | 	mkdir -p $(BIN_DIR)
19 |
20 | $(TOOLING): $(BIN_DIR)
21 | 	@echo Installing tools from hack/tools.go
22 | 	@cd hack && go list -mod=mod -tags tools -e -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Monitoring Mixins site
 2 |
 3 | ## Adding a new mixin
 4 |
 5 | 0. Install the [required software](#requirements)
 6 | 1. Add the new mixin to the [mixins.json](mixins.json) file
 7 | 2. Run `make`
 8 |
 9 | ## Requirements
10 |
11 | - jq
12 | - make
13 | - git
14 | - golang
15 |
--------------------------------------------------------------------------------
/assets/MSSQL/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: MSSQLAlerts
 3 |   rules:
 4 |   - alert: MSSQLHighNumberOfDeadlocks
 5 |     annotations:
 6 |       description: '{{ printf "%.2f" $value }} deadlocks have occurred over the last
 7 |         5 minutes on {{$labels.instance}}, which is above threshold of 10 deadlocks.'
 8 |       summary: There are deadlocks occurring in the database.
 9 |     expr: |
10 |       increase(mssql_deadlocks_total{}[5m]) > 10
11 |     for: 5m
12 |     labels:
13 |       severity: warning
14 |   - alert: MSSQLModerateReadStallTime
15 |     annotations:
16 |       description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on
17 |         {{$labels.instance}}, which is above threshold of 200ms.'
18 |       summary: There is a moderate amount of IO stall for database reads.
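    # The mssql_io_stall_seconds_total counter is measured in seconds (per its
    # _seconds_total naming); the read/write stall expressions below multiply its 5m
    # increase by 1000 so the value compared against the 200ms/400ms thresholds (and
    # rendered in the annotations) is in milliseconds.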
19 | expr: | 20 | 1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 200 21 | for: 5m 22 | labels: 23 | severity: warning 24 | - alert: MSSQLHighReadStallTime 25 | annotations: 26 | description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on 27 | {{$labels.instance}}, which is above threshold of 400ms.' 28 | summary: There is a high amount of IO stall for database reads. 29 | expr: | 30 | 1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 400 31 | for: 5m 32 | labels: 33 | severity: critical 34 | - alert: MSSQLModerateWriteStallTime 35 | annotations: 36 | description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on 37 | {{$labels.instance}}, which is above threshold of 200ms.' 38 | summary: There is a moderate amount of IO stall for database writes. 39 | expr: | 40 | 1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 200 41 | for: 5m 42 | labels: 43 | severity: warning 44 | - alert: MSSQLHighWriteStallTime 45 | annotations: 46 | description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on 47 | {{$labels.instance}}, which is above threshold of 400ms.' 48 | summary: There is a high amount of IO stall for database writes. 49 | expr: | 50 | 1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 400 51 | for: 5m 52 | labels: 53 | severity: critical 54 | -------------------------------------------------------------------------------- /assets/MSSQL/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/aerospike/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/alertmanager/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-activemq/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: apache-activemq-alerts 3 | rules: 4 | - alert: ApacheActiveMQHighTopicMemoryUsage 5 | annotations: 6 | description: '{{ printf "%.0f" $value }} percent of memory used by topics on 7 | {{$labels.instance}} in cluster {{$labels.activemq_cluster}}, which is above 8 | the threshold of 70 percent.' 9 | summary: Topic destination memory usage is high, which may result in a reduction 10 | of the rate at which producers send messages. 11 | expr: | 12 | sum without (destination) (activemq_topic_memory_percent_usage{destination!~"ActiveMQ.Advisory.*"}) > 70 13 | for: 5m 14 | labels: 15 | severity: warning 16 | - alert: ApacheActiveMQHighQueueMemoryUsage 17 | annotations: 18 | description: '{{ printf "%.0f" $value }} percent of memory used by queues on 19 | {{$labels.instance}} in cluster {{$labels.activemq_cluster}}, which is above 20 | the threshold of 70 percent.' 21 | summary: Queue destination memory usage is high, which may result in a reduction 22 | of the rate at which producers send messages. 
23 | expr: | 24 | sum without (destination) (activemq_queue_memory_percent_usage) > 70 25 | for: 5m 26 | labels: 27 | severity: warning 28 | - alert: ApacheActiveMQHighStoreMemoryUsage 29 | annotations: 30 | description: '{{ printf "%.0f" $value }} percent of store memory used on {{$labels.instance}} 31 | in cluster {{$labels.activemq_cluster}}, which is above the threshold of 70 32 | percent.' 33 | summary: Store memory usage is high, which may result in producers unable to 34 | send messages. 35 | expr: | 36 | activemq_store_usage_ratio > 70 37 | for: 5m 38 | labels: 39 | severity: warning 40 | - alert: ApacheActiveMQHighTemporaryMemoryUsage 41 | annotations: 42 | description: '{{ printf "%.0f" $value }} percent of temporary memory used on 43 | {{$labels.instance}} in cluster {{$labels.activemq_cluster}}, which is above 44 | the threshold of 70 percent.' 45 | summary: Temporary memory usage is high, which may result in saturation of messaging 46 | throughput. 47 | expr: | 48 | activemq_temp_usage_ratio > 70 49 | for: 5m 50 | labels: 51 | severity: warning 52 | -------------------------------------------------------------------------------- /assets/apache-activemq/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-airflow/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: apache-airflow 3 | rules: 4 | - alert: ApacheAirflowStarvingPoolTasks 5 | annotations: 6 | description: | 7 | The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0. 8 | summary: There are starved tasks detected in the Apache Airflow pool. 9 | expr: | 10 | airflow_pool_starving_tasks > 0 11 | for: 5m 12 | labels: 13 | severity: critical 14 | - alert: ApacheAirflowDAGScheduleDelayWarningLevel 15 | annotations: 16 | description: | 17 | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10. 18 | summary: The delay in DAG schedule time to DAG run time has reached the warning 19 | threshold. 20 | expr: | 21 | increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10 22 | for: 1m 23 | labels: 24 | severity: warning 25 | - alert: ApacheAirflowDAGScheduleDelayCriticalLevel 26 | annotations: 27 | description: | 28 | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60. 29 | summary: The delay in DAG schedule time to DAG run time has reached the critical 30 | threshold. 31 | expr: | 32 | increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60 33 | for: 1m 34 | labels: 35 | severity: critical 36 | - alert: ApacheAirflowDAGFailures 37 | annotations: 38 | description: | 39 | The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0. 40 | summary: There have been DAG failures detected. 
41 | expr: | 42 | increase(airflow_dagrun_duration_failed_count[5m]) > 0 43 | for: 1m 44 | labels: 45 | severity: critical 46 | -------------------------------------------------------------------------------- /assets/apache-airflow/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-camel/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-camel/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-cassandra/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-couchdb/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-hadoop/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-hbase/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: apache-hbase-alerts 3 | rules: 4 | - alert: HBaseHighHeapMemUsage 5 | annotations: 6 | description: The heap memory usage for the JVM on instance {{$labels.instance}} 7 | in cluster {{$labels.hbase_cluster}} is {{printf "%.0f" $value}} percent, 8 | which is above the threshold of 80 percent 9 | summary: There is a limited amount of heap memory available to the JVM. 10 | expr: | 11 | 100 * sum without(context, hostname, processname) (jvm_metrics_mem_heap_used_m{job="integrations/apache-hbase"} / clamp_min(jvm_metrics_mem_heap_committed_m{job="integrations/apache-hbase"}, 1)) > 80 12 | for: 5m 13 | labels: 14 | severity: warning 15 | - alert: HBaseDeadRegionServer 16 | annotations: 17 | description: '{{$value}} RegionServer(s) in cluster {{$labels.hbase_cluster}} 18 | are unresponsive, which is above the threshold of 0. The name(s) of the dead 19 | RegionServer(s) are {{$labels.deadregionservers}}' 20 | summary: One or more RegionServer(s) has become unresponsive. 21 | expr: | 22 | server_num_dead_region_servers > 0 23 | for: 5m 24 | labels: 25 | severity: warning 26 | - alert: HBaseOldRegionsInTransition 27 | annotations: 28 | description: '{{printf "%.0f" $value}} percent of RegionServers in transition 29 | in cluster {{$labels.hbase_cluster}} are transitioning for longer than expected, 30 | which is above the threshold of 50 percent' 31 | summary: RegionServers are in transition for longer than expected. 32 | expr: | 33 | 100 * assignment_manager_rit_count_over_threshold / clamp_min(assignment_manager_rit_count, 1) > 50 34 | for: 5m 35 | labels: 36 | severity: warning 37 | - alert: HBaseHighMasterAuthFailRate 38 | annotations: 39 | description: '{{printf "%.0f" $value}} percent of authentication attempts to 40 | the master are failing in cluster {{$labels.hbase_cluster}}, which is above 41 | the threshold of 35 percent' 42 | summary: A high percentage of authentication attempts to the master are failing. 
43 | expr: | 44 | 100 * rate(master_authentication_failures[5m]) / (clamp_min(rate(master_authentication_successes[5m]), 1) + clamp_min(rate(master_authentication_failures[5m]), 1)) > 35 45 | for: 5m 46 | labels: 47 | severity: warning 48 | - alert: HBaseHighRSAuthFailRate 49 | annotations: 50 | description: '{{printf "%.0f" $value}} percent of authentication attempts to 51 | the RegionServer {{$labels.instance}} are failing in cluster {{$labels.hbase_cluster}}, 52 | which is above the threshold of 35 percent' 53 | summary: A high percentage of authentication attempts to a RegionServer are 54 | failing. 55 | expr: | 56 | 100 * rate(region_server_authentication_failures[5m]) / (clamp_min(rate(region_server_authentication_successes[5m]), 1) + clamp_min(rate(region_server_authentication_failures[5m]), 1)) > 35 57 | for: 5m 58 | labels: 59 | severity: warning 60 | -------------------------------------------------------------------------------- /assets/apache-hbase/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-http/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: apache-http 3 | rules: 4 | - alert: ApacheDown 5 | annotations: 6 | description: Apache is down on {{ $labels.instance }}. 7 | summary: Apache is down. 8 | expr: apache_up == 0 9 | for: 5m 10 | labels: 11 | severity: warning 12 | - alert: ApacheRestart 13 | annotations: 14 | description: Apache has just been restarted on {{ $labels.instance }}. 15 | summary: Apache restart. 16 | expr: apache_uptime_seconds_total / 60 < 1 17 | for: "0" 18 | labels: 19 | severity: info 20 | - alert: ApacheWorkersLoad 21 | annotations: 22 | description: | 23 | Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}. 24 | The current value is {{ $value }}%. 25 | summary: Apache workers load is too high. 26 | expr: | 27 | (sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 28 | for: 15m 29 | labels: 30 | severity: warning 31 | - alert: ApacheResponseTimeTooHigh 32 | annotations: 33 | description: | 34 | Apache average response time is above the threshold of 5000 ms on {{ $labels.instance }}. 35 | The current value is {{ $value }} ms. 36 | summary: Apache response time is too high. 37 | expr: | 38 | increase(apache_duration_ms_total[5m])/increase(apache_accesses_total[5m]) > 5000 39 | for: 15m 40 | labels: 41 | severity: warning 42 | -------------------------------------------------------------------------------- /assets/apache-http/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-mesos/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: apache-mesos 3 | rules: 4 | - alert: ApacheMesosHighMemoryUsage 5 | annotations: 6 | description: '{{ printf "%.0f" $value }} percent memory usage on {{$labels.mesos_cluster}}, 7 | which is above the threshold of 90.' 8 | summary: There is a high memory usage for the cluster. 
9 | expr: | 10 | min without(instance, job, type) (mesos_master_mem{type="percent"}) > 90 11 | for: 5m 12 | labels: 13 | severity: warning 14 | - alert: ApacheMesosHighDiskUsage 15 | annotations: 16 | description: '{{ printf "%.0f" $value }} percent disk usage on {{$labels.mesos_cluster}}, 17 | which is above the threshold of 90.' 18 | summary: There is a high disk usage for the cluster. 19 | expr: | 20 | min without(instance, job, type) (mesos_master_disk{type="percent"}) > 90 21 | for: 5m 22 | labels: 23 | severity: critical 24 | - alert: ApacheMesosUnreachableTasks 25 | annotations: 26 | description: '{{ printf "%.0f" $value }} unreachable tasks on {{$labels.mesos_cluster}}, 27 | which is above the threshold of 3.' 28 | summary: There are an unusually high number of unreachable tasks. 29 | expr: | 30 | max without(instance, job, state) (mesos_master_task_states_current{state="unreachable"}) > 3 31 | for: 5m 32 | labels: 33 | severity: warning 34 | - alert: ApacheMesosNoLeaderElected 35 | annotations: 36 | description: There is no cluster coordinator on {{$labels.mesos_cluster}}. 37 | summary: There is currently no cluster coordinator. 38 | expr: | 39 | max without(instance, job) (mesos_master_elected) == 0 40 | for: 1m 41 | labels: 42 | severity: critical 43 | - alert: ApacheMesosInactiveAgents 44 | annotations: 45 | description: '{{ printf "%.0f" $value }} inactive agent clients over the last 46 | 5m which is above the threshold of 1.' 47 | summary: There are currently inactive agent clients. 48 | expr: | 49 | max without(instance, job, state) (mesos_master_slaves_state{state=~"connected_inactive|disconnected_inactive"}) > 1 50 | for: 5m 51 | labels: 52 | severity: warning 53 | -------------------------------------------------------------------------------- /assets/apache-mesos/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-solr/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/apache-tomcat/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: ApacheTomcatAlerts 3 | rules: 4 | - alert: ApacheTomcatAlertsHighCpuUsage 5 | annotations: 6 | description: The CPU usage has been at {{ printf "%.0f" $value }} percent over 7 | the last 5 minutes on {{$labels.instance}}, which is above the threshold of 8 | 80 percent. 9 | summary: The instance has a CPU usage higher than the configured threshold. 10 | expr: | 11 | sum by (job,instance) (jvm_process_cpu_load{job="integrations/tomcat"}) > 80 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: ApacheTomcatAlertsHighMemoryUsage 16 | annotations: 17 | description: The memory usage has been at {{ printf "%.0f" $value }} percent 18 | over the last 5 minutes on {{$labels.instance}}, which is above the threshold 19 | of 80 percent. 20 | summary: The instance has a higher memory usage than the configured threshold. 
21 | expr: | 22 | sum(jvm_memory_usage_used_bytes{job="integrations/tomcat"}) by (job,instance) / sum(jvm_physical_memory_bytes{job="integrations/tomcat"}) by (job,instance) * 100 > 80 23 | for: 5m 24 | labels: 25 | severity: critical 26 | - alert: ApacheTomcatAlertsHighRequestErrorPercent 27 | annotations: 28 | description: The percentage of request errors has been at {{ printf "%.0f" $value 29 | }} percent over the last 5 minutes on {{$labels.instance}}, which is above 30 | the threshold of 5 percent. 31 | summary: There are a high number of request errors. 32 | expr: | 33 | sum by (job,instance) (increase(tomcat_errorcount_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m]) * 100) > 5 34 | for: 5m 35 | labels: 36 | severity: critical 37 | - alert: ApacheTomcatAlertsModeratelyHighProcessingTime 38 | annotations: 39 | description: The processing time has been at {{ printf "%.0f" $value }}ms over 40 | the last 5 minutes on {{$labels.instance}}, which is above the threshold of 41 | 300ms. 42 | summary: The processing time has been moderately high. 43 | expr: | 44 | sum by (job,instance) (increase(tomcat_processingtime_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m])) > 300 45 | for: 5m 46 | labels: 47 | severity: warning 48 | -------------------------------------------------------------------------------- /assets/apache-tomcat/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/argo-cd-2/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/argocd/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: ArgoCD 3 | rules: 4 | - alert: ArgoAppOutOfSync 5 | annotations: 6 | description: Application {{ $labels.name }} has sync status as {{ $labels.sync_status 7 | }}. 8 | summary: Application is OutOfSync. 9 | expr: argocd_app_info{sync_status="OutOfSync"} == 1 10 | for: 1m 11 | labels: 12 | severity: warning 13 | - alert: ArgoAppSyncFailed 14 | annotations: 15 | description: Application {{ $labels.name }} has sync phase as {{ $labels.phase 16 | }}. 17 | summary: Application Sync Failed. 18 | expr: argocd_app_sync_total{phase!="Succeeded"} == 1 19 | for: 1m 20 | labels: 21 | severity: warning 22 | - alert: ArgoAppMissing 23 | annotations: 24 | description: "ArgoCD has not reported any applications data for the past 15 25 | minutes which means that it must be down or not functioning properly. \n" 26 | summary: No reported applications in ArgoCD. 
27 | expr: absent(argocd_app_info) 28 | for: 15m 29 | labels: 30 | severity: critical 31 | -------------------------------------------------------------------------------- /assets/argocd/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/asterisk/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: AsteriskAlerts 3 | rules: 4 | - alert: AsteriskRestarted 5 | annotations: 6 | description: |- 7 | Asterisk instance restarted in the last minute 8 | VALUE = {{ $value }} 9 | LABELS = {{ $labels }} 10 | summary: Asterisk instance restarted in the last minute. 11 | expr: asterisk_core_uptime_seconds < 60 12 | for: 5s 13 | labels: 14 | severity: critical 15 | - alert: AsteriskReloaded 16 | annotations: 17 | description: |- 18 | Asterisk instance reloaded in the last minute 19 | VALUE = {{ $value }} 20 | LABELS = {{ $labels }} 21 | summary: Asterisk instance reloaded in the last minute. 22 | expr: asterisk_core_last_reload_seconds < 60 23 | for: 5s 24 | labels: 25 | severity: warning 26 | - alert: AsteriskHighScrapeTime 27 | annotations: 28 | description: |- 29 | Asterisk instance core high scrape time (Possible system performance degradation) 30 | VALUE = {{ $value }} 31 | LABELS = {{ $labels }} 32 | summary: Asterisk instance core high scrape time. 33 | expr: asterisk_core_scrape_time_ms > 100 34 | for: 10s 35 | labels: 36 | severity: critical 37 | - alert: AsteriskHighActiveCallsCount 38 | annotations: 39 | description: |- 40 | Asterisk high active call count 41 | VALUE = {{ $value }} 42 | LABELS = {{ $labels }} 43 | summary: Asterisk high active call count. 44 | expr: asterisk_calls_count > 100 45 | for: 10s 46 | labels: 47 | severity: warning 48 | -------------------------------------------------------------------------------- /assets/asterisk/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/awx/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/awx/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/blackbox_exporter/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: blackbox-exporter.rules 3 | rules: 4 | - alert: BlackboxProbeFailed 5 | annotations: 6 | dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{ 7 | $labels.instance }} 8 | description: The probe failed for the instance {{ $labels.instance }}. 9 | summary: Probe has failed for the past 1m interval. 10 | expr: | 11 | probe_success{job="blackbox-exporter"} == 0 12 | for: 1m 13 | labels: 14 | severity: critical 15 | - alert: BlackboxLowUptime30d 16 | annotations: 17 | dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{ 18 | $labels.instance }} 19 | description: The probe has a lower uptime than 99.9% the last 30 days for the 20 | instance {{ $labels.instance }}. 21 | summary: Probe uptime is lower than 99.9% for the last 30 days. 
22 | expr: | 23 | avg_over_time(probe_success{job="blackbox-exporter"}[30d]) * 100 < 99.900000000000006 24 | labels: 25 | severity: info 26 | - alert: BlackboxSslCertificateWillExpireSoon 27 | annotations: 28 | dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{ 29 | $labels.instance }} 30 | description: | 31 | The SSL certificate of the instance {{ $labels.instance }} is expiring within 21 days. 32 | Actual time left: {{ $value | humanizeDuration }}. 33 | summary: SSL certificate will expire soon. 34 | expr: | 35 | probe_ssl_earliest_cert_expiry{job="blackbox-exporter"} - time() < 21 * 24 * 3600 36 | labels: 37 | severity: warning 38 | -------------------------------------------------------------------------------- /assets/blackbox_exporter/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/caddy/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/caddy/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/celery/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: celery 3 | rules: 4 | - alert: CeleryTaskHighFailRate 5 | annotations: 6 | dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{ 7 | $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name 8 | }} 9 | description: More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name 10 | }}/{{ $labels.name }} the past 10m. 11 | summary: Celery high task fail rate. 12 | expr: | 13 | sum( 14 | increase( 15 | celery_task_failed_total{ 16 | job=~".*celery.*", 17 | queue_name!~"None", 18 | name!~"None" 19 | }[10m] 20 | ) 21 | ) by (job, namespace, queue_name, name) 22 | / 23 | ( 24 | sum( 25 | increase( 26 | celery_task_failed_total{ 27 | job=~".*celery.*", 28 | queue_name!~"None", 29 | name!~"None" 30 | }[10m] 31 | ) 32 | ) by (job, namespace, queue_name, name) 33 | + 34 | sum( 35 | increase( 36 | celery_task_succeeded_total{ 37 | job=~".*celery.*", 38 | queue_name!~"None", 39 | name!~"None" 40 | }[10m] 41 | ) 42 | ) by (job, namespace, queue_name, name) 43 | ) 44 | * 100 > 5 45 | for: 1m 46 | labels: 47 | severity: warning 48 | - alert: CeleryHighQueueLength 49 | annotations: 50 | dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ 51 | $labels.job }}&var-queue_name={{ $labels.queue_name }} 52 | description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name 53 | }} the past 20m. 54 | summary: Celery high queue length. 55 | expr: | 56 | sum( 57 | celery_queue_length{ 58 | job=~".*celery.*", 59 | queue_name!~"None" 60 | } 61 | ) by (job, namespace, queue_name) 62 | > 100 63 | for: 20m 64 | labels: 65 | severity: warning 66 | - alert: CeleryWorkerDown 67 | annotations: 68 | dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ 69 | $labels.job }} 70 | description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline. 71 | summary: A Celery worker is offline. 
72 | expr: | 73 | celery_worker_up{job=~".*celery.*"} == 0 74 | for: 15m 75 | labels: 76 | severity: warning 77 | -------------------------------------------------------------------------------- /assets/celery/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ceph/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/cert-manager/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: cert-manager 3 | rules: 4 | - alert: CertManagerAbsent 5 | annotations: 6 | description: New certificates will not be able to be minted, and existing ones 7 | can't be renewed until cert-manager is back. 8 | runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent 9 | summary: Cert Manager has disappeared from Prometheus service discovery. 10 | expr: absent(up{job="cert-manager"}) 11 | for: 10m 12 | labels: 13 | severity: critical 14 | - name: certificates 15 | rules: 16 | - alert: CertManagerCertExpirySoon 17 | annotations: 18 | dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager 19 | description: The domain that this cert covers will be unavailable after {{ $value 20 | | humanizeDuration }}. Clients using endpoints that this cert protects will 21 | start to fail in {{ $value | humanizeDuration }}. 22 | runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon 23 | summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from 24 | expiry, it should have renewed over a week ago. 25 | expr: | 26 | avg by (exported_namespace, namespace, name) ( 27 | certmanager_certificate_expiration_timestamp_seconds - time() 28 | ) < (21 * 24 * 3600) # 21 days in seconds 29 | for: 1h 30 | labels: 31 | severity: warning 32 | - alert: CertManagerCertNotReady 33 | annotations: 34 | dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager 35 | description: This certificate has not been ready to serve traffic for at least 36 | 10m. If the cert is being renewed or there is another valid cert, the ingress 37 | controller _may_ be able to serve that instead. 38 | runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready 39 | summary: The cert `{{ $labels.name }}` is not ready to serve traffic. 40 | expr: | 41 | max by (name, exported_namespace, namespace, condition) ( 42 | certmanager_certificate_ready_status{condition!="True"} == 1 43 | ) 44 | for: 10m 45 | labels: 46 | severity: critical 47 | - alert: CertManagerHittingRateLimits 48 | annotations: 49 | dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager 50 | description: Depending on the rate limit, cert-manager may be unable to generate 51 | certificates for up to a week. 52 | runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits 53 | summary: Cert manager hitting LetsEncrypt rate limits. 
54 | expr: | 55 | sum by (host) ( 56 | rate(certmanager_http_acme_client_request_count{status="429"}[5m]) 57 | ) > 0 58 | for: 5m 59 | labels: 60 | severity: critical 61 | -------------------------------------------------------------------------------- /assets/cert-manager/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: [] 2 | -------------------------------------------------------------------------------- /assets/cilium-enterprise/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/clickhouse/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: ClickHouseAlerts 3 | rules: 4 | - alert: ClickHouseReplicationQueueBackingUp 5 | annotations: 6 | description: | 7 | ClickHouse replication tasks are processing slower than expected on {{ $labels.instance }} causing replication queue size to back up at {{ $value }} exceeding the threshold value of 99. 8 | summary: ClickHouse replica max queue size backing up. 9 | expr: | 10 | ClickHouseAsyncMetrics_ReplicasMaxQueueSize > 99 11 | for: 5m 12 | keep_firing_for: 5m 13 | labels: 14 | severity: warning 15 | - alert: ClickHouseRejectedInserts 16 | annotations: 17 | description: ClickHouse inserts are being rejected on {{ $labels.instance }} 18 | as items are being inserted faster than ClickHouse is able to merge them. 19 | summary: ClickHouse has too many rejected inserts. 20 | expr: ClickHouseProfileEvents_RejectedInserts > 1 21 | for: 5m 22 | keep_firing_for: 5m 23 | labels: 24 | severity: critical 25 | - alert: ClickHouseZookeeperSessions 26 | annotations: 27 | description: | 28 | ClickHouse has more than one connection to a Zookeeper on {{ $labels.instance }} which can lead to bugs due to stale reads in Zookeepers consistency model. 29 | summary: ClickHouse has too many Zookeeper sessions. 30 | expr: ClickHouseMetrics_ZooKeeperSession > 1 31 | for: 5m 32 | keep_firing_for: 5m 33 | labels: 34 | severity: critical 35 | - alert: ClickHouseReplicasInReadOnly 36 | annotations: 37 | description: | 38 | ClickHouse has replicas in a read only state on {{ $labels.instance }} after losing connection to Zookeeper or at startup. 39 | summary: ClickHouse has too many replicas in read only state. 40 | expr: ClickHouseMetrics_ReadonlyReplica > 0 41 | for: 5m 42 | keep_firing_for: 5m 43 | labels: 44 | severity: critical 45 | -------------------------------------------------------------------------------- /assets/clickhouse/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/cloudflare/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: cloudflare-alerts 3 | rules: 4 | - alert: CloudflareHighThreatCount 5 | annotations: 6 | description: The number of detected threats targeting the zone {{$labels.zone}} 7 | is {{ printf "%.0f" $value }} which is greater than the threshold of 3. 8 | summary: There are detected threats targeting the zone. 
9 | expr: | 10 | sum without (instance) (increase(cloudflare_zone_threats_total[5m])) > 3 11 | for: 5m 12 | labels: 13 | severity: critical 14 | - alert: CloudflareHighRequestRate 15 | annotations: 16 | description: The rate of requests to {{$labels.zone}} is {{ printf "%.0f" $value 17 | }}% of the prior 50 minute baseline which is above the threshold of 150%. 18 | summary: A high spike in requests is occurring which may indicate an attack 19 | or unexpected load. 20 | expr: | 21 | sum without (instance) (100 * (rate(cloudflare_zone_requests_total[10m]) / clamp_min(rate(cloudflare_zone_requests_total[50m] offset 10m), 1))) > 150 22 | for: 5m 23 | labels: 24 | severity: warning 25 | - alert: CloudflareHighHTTPErrorCodes 26 | annotations: 27 | description: The number of {{$labels.status}} HTTP status codes occurring in 28 | the zone {{$labels.zone}} is {{ printf "%.0f" $value }} which is greater than 29 | the threshold of 100. 30 | summary: A high number of 4xx or 5xx HTTP status codes are occurring. 31 | expr: | 32 | sum without (instance) (increase(cloudflare_zone_requests_status{status=~"4.*|5.*"}[5m])) > 100 33 | for: 5m 34 | labels: 35 | severity: warning 36 | - alert: CloudflareUnhealthyPools 37 | annotations: 38 | description: The pool {{$labels.pool_name}} in zone {{$labels.zone}} is currently 39 | down and unhealthy. 40 | summary: There are unhealthy pools. 41 | expr: | 42 | sum without (instance, load_balancer_name) (cloudflare_zone_pool_health_status) == 0 43 | for: 5m 44 | labels: 45 | severity: critical 46 | - alert: CloudflareMetricsDown 47 | annotations: 48 | description: Grafana is no longer receiving metrics for the Cloudflare integration 49 | from instance {{$labels.instance}}. 50 | summary: Cloudflare metrics are down. 51 | expr: | 52 | up{job="integrations/cloudflare"} == 0 53 | for: 5m 54 | labels: 55 | severity: critical 56 | -------------------------------------------------------------------------------- /assets/cloudflare/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/confluent-kafka/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/confluent-kafka/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/consul/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: consul 3 | rules: 4 | - alert: ConsulUp 5 | annotations: 6 | description: Consul '{{ $labels.job }}' is not up. 7 | summary: Consul is not up. 8 | expr: | 9 | consul_up != 1 10 | for: 1m 11 | labels: 12 | severity: critical 13 | - alert: ConsulMaster 14 | annotations: 15 | description: Consul '{{ $labels.job }}' has no master. 16 | summary: Consul has no master. 17 | expr: | 18 | consul_raft_leader != 1 19 | for: 1m 20 | labels: 21 | severity: critical 22 | - alert: ConsulPeers 23 | annotations: 24 | description: Consul '{{ $labels.job }}' does not have 3 peers. 25 | summary: Consul does not have peers. 
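Every rule in these groups carries a `severity` label of either warning or critical, which is what downstream notification routing is expected to key on. A sketch of an Alertmanager route under that assumption, with illustrative receiver names; the Consul peers expression continues below:

# alertmanager.yml fragment -- receiver names are illustrative.
route:
  receiver: slack                        # default for everything else
  routes:
    - matchers: ['severity="critical"']
      receiver: pagerduty
    - matchers: ['severity="warning"']
      receiver: slack
receivers:
  - name: slack
  - name: pagerduty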
26 | expr: | 27 | consul_raft_peers != 3 28 | for: 10m 29 | labels: 30 | severity: critical 31 | -------------------------------------------------------------------------------- /assets/consul/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/coredns/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/couchbase/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: couchbase 3 | rules: 4 | - alert: CouchbaseHighCPUUsage 5 | annotations: 6 | description: '{{ printf "%.0f" $value }} percent CPU usage on node {{$labels.instance}} 7 | and on cluster {{$labels.couchbase_cluster}}, which is above the threshold 8 | of 85.' 9 | summary: The node CPU usage has exceeded the critical threshold. 10 | expr: | 11 | (sys_cpu_utilization_rate) > 85 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: CouchbaseHighMemoryUsage 16 | annotations: 17 | description: '{{ printf "%.0f" $value }} percent memory usage on node {{$labels.instance}} 18 | and on cluster {{$labels.couchbase_cluster}}, which is above the threshold 19 | of 85.' 20 | summary: There is a limited amount of memory available for a node. 21 | expr: | 22 | 100 * (sys_mem_actual_used / clamp_min(sys_mem_actual_used + sys_mem_actual_free, 1)) > 85 23 | for: 5m 24 | labels: 25 | severity: critical 26 | - alert: CouchbaseMemoryEvictionRate 27 | annotations: 28 | description: '{{ printf "%.0f" $value }} evictions in bucket {{$labels.bucket}}, 29 | on node {{$labels.instance}}, and on cluster {{$labels.couchbase_cluster}}, 30 | which is above the threshold of 10.' 31 | summary: There is a spike in evictions in a bucket, which indicates high memory 32 | pressure. 33 | expr: | 34 | (kv_ep_num_value_ejects) > 10 35 | for: 5m 36 | labels: 37 | severity: warning 38 | - alert: CouchbaseInvalidRequestVolume 39 | annotations: 40 | description: '{{ printf "%.0f" $value }} invalid requests to {{$labels.couchbase_cluster}}, 41 | which is above the threshold of 1000.' 42 | summary: There is a high volume of incoming invalid requests, which may indicate 43 | a DOS or injection attack. 44 | expr: | 45 | sum without(instance, job) (rate(n1ql_invalid_requests[2m])) > 1000 46 | for: 2m 47 | labels: 48 | severity: warning 49 | -------------------------------------------------------------------------------- /assets/couchbase/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/discourse/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DiscourseAlerts 3 | rules: 4 | - alert: DiscourseRequestsHigh5xxErrors 5 | annotations: 6 | description: '{{ printf "%.2f" $value }}% of all requests are resulting in 500 7 | status codes, which is above the threshold 10%, indicating a potentially larger 8 | issue for {{$labels.instance}}' 9 | summary: More than 10% of all requests result in a 5XX. 
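Rule files like these can be exercised offline with `promtool test rules` before deployment. A sketch of a unit test for the ConsulUp alert above; the test file name, rule-file path, and the job/instance values are illustrative (the Discourse 5xx expression continues below):

# consul_alerts_test.yaml -- hypothetical input for `promtool test rules`.
rule_files:
  - assets/consul/alerts.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'consul_up{job="consul",instance="consul-1:8500"}'
        values: '0x10'                   # consul_up stays at 0 for 11 samples
    alert_rule_test:
      - eval_time: 2m                    # past the 1m `for` window
        alertname: ConsulUp
        exp_alerts:
          - exp_labels:
              severity: critical
              job: consul
              instance: consul-1:8500
            exp_annotations:
              description: Consul 'consul' is not up.
              summary: Consul is not up.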
10 | expr: | 11 | 100 * rate(discourse_http_requests{status="500"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 10 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: DiscourseRequestsHigh4xxErrors 16 | annotations: 17 | description: '{{ printf "%.2f" $value }}% of all requests are resulting in 400 18 | status code, which is above the threshold 30%, indicating a potentially larger 19 | issue for {{$labels.instance}}' 20 | summary: More than 30% of all requests result in a 4XX. 21 | expr: | 22 | 100 * rate(discourse_http_requests{status=~"^4.*"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 30 23 | for: 5m 24 | labels: 25 | severity: warning 26 | -------------------------------------------------------------------------------- /assets/discourse/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/django/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/docker/alerts.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/docker/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/elasticsearch/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/elasticsearch/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/envoy/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/envoy/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/etcd/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/f5-bigip/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: bigip-alerts 3 | rules: 4 | - alert: BigIPLowNodeAvailabilityStatus 5 | annotations: 6 | description: '{{ printf "%.0f" $value }} percent of available nodes, which is 7 | below the threshold of 95.' 8 | summary: Detecting a significant number of unavailable nodes which can causes 9 | potential downtime or degraded performance. 10 | expr: | 11 | 100 * (sum(bigip_node_status_availability_state) / clamp_min(count(bigip_node_status_availability_state), 1)) < 95 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: BigIPServerSideConnectionLimit 16 | annotations: 17 | description: '{{ printf "%.0f" $value }} percent of the max number of connections 18 | in use on node {{$labels.node}}, which is above the threshold of 80 percent.' 
19 | summary: Approaching the connection limit may lead to rejecting new connections, 20 | impacting availability. 21 | expr: | 22 | max without(instance, job) (100 * bigip_node_serverside_cur_conns / clamp_min(bigip_node_serverside_max_conns, 1)) > 80 23 | for: 5m 24 | labels: 25 | severity: warning 26 | - alert: BigIPHighRequestRate 27 | annotations: 28 | description: '{{ printf "%.0f" $value }} percent increase in requests on pool 29 | {{$labels.pool}}, which is above the threshold of 150.' 30 | summary: An unexpected spike in requests might indicate an issue like a DDoS 31 | attack or unexpected high load. 32 | expr: | 33 | max without(instance, job) (100 * rate(bigip_pool_tot_requests[10m]) / clamp_min(rate(bigip_pool_tot_requests[50m] offset 10m), 1)) > 150 34 | for: 10m 35 | labels: 36 | severity: warning 37 | - alert: BigIPHighConnectionQueueDepth 38 | annotations: 39 | description: '{{ printf "%.0f" $value }} percent increase in connection queue 40 | depth on node {{$labels.pool}}, which is above the threshold of 75.' 41 | summary: A sudden spike or sustained high queue depth may indicate a bottleneck 42 | in handling incoming connections. 43 | expr: | 44 | max without(instance, job) (100 * rate(bigip_pool_connq_depth[5m])) / clamp_min(rate(bigip_pool_connq_depth[50m] offset 10m), 1) > 75 45 | for: 5m 46 | labels: 47 | severity: warning 48 | -------------------------------------------------------------------------------- /assets/f5-bigip/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/gitea/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/gitea/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/gitlab/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: GitLabAlerts 3 | rules: 4 | - alert: GitLabHighJobRegistrationFailures 5 | annotations: 6 | description: '{{ printf "%.2f" $value }}% of job registrations have failed on 7 | {{$labels.instance}}, which is above threshold of 10%.' 8 | summary: Large percentage of failed attempts to register a job. 9 | expr: "100 * rate(job_register_attempts_failed_total{}[5m]) / rate(job_register_attempts_total{}[5m]) 10 | \n> 10\n" 11 | for: 5m 12 | labels: 13 | severity: warning 14 | - alert: GitLabHighRunnerAuthFailure 15 | annotations: 16 | description: '{{ printf "%.2f" $value }}% of GitLab runner authentication attempts 17 | are failing on {{$labels.instance}}, which is above the threshold of 10%.' 18 | summary: Large percentage of runner authentication failures. 19 | expr: "100 * sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m])) 20 | \ / \n(sum by (instance) (rate(gitlab_ci_runner_authentication_success_total{}[5m])) 21 | \ + sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m])))\n> 22 | 10\n" 23 | for: 5m 24 | labels: 25 | severity: warning 26 | - alert: GitLabHigh5xxResponses 27 | annotations: 28 | description: '{{ printf "%.2f" $value }}% of all requests returned 5XX HTTP 29 | responses, which is above the threshold 10%, indicating a system issue on 30 | {{$labels.instance}}.' 
31 | summary: Large rate of HTTP 5XX errors. 32 | expr: "100 * sum by (instance) (rate(http_requests_total{status=~\"^5.*\"}[5m])) 33 | / sum by (instance) (rate(http_requests_total{}[5m])) \n> 10\n" 34 | for: 5m 35 | labels: 36 | severity: critical 37 | - alert: GitLabHigh4xxResponses 38 | annotations: 39 | description: '{{ printf "%.2f" $value }}% of all requests returned 4XX HTTP 40 | responses, which is above the threshold 10%, indicating many failed requests 41 | on {{$labels.instance}}.' 42 | summary: Large rate of HTTP 4XX errors. 43 | expr: | 44 | 100 * sum by (instance) (rate(http_requests_total{status=~"^4.*"}[5m])) / sum by (instance) (rate(http_requests_total{}[5m])) 45 | > 10 46 | for: 5m 47 | labels: 48 | severity: warning 49 | -------------------------------------------------------------------------------- /assets/gitlab/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/gluster/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: gluster-volume.rules 3 | rules: 4 | - expr: | 5 | sum(max(gluster_subvol_capacity_used_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume) 6 | record: gluster:volume_capacity_used_bytes_total:sum 7 | - expr: | 8 | sum(max(gluster_subvol_capacity_total_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume) 9 | record: gluster:volume_capacity_total_bytes:sum 10 | -------------------------------------------------------------------------------- /assets/go-runtime/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/go-runtime/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/grafana/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: GrafanaAlerts 3 | rules: 4 | - alert: GrafanaRequestsFailing 5 | annotations: 6 | message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is 7 | experiencing {{ $value | humanize }}% errors' 8 | expr: | 9 | 100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}) 10 | / 11 | sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) 12 | > 50 13 | for: 5m 14 | labels: 15 | severity: warning 16 | -------------------------------------------------------------------------------- /assets/grafana/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: grafana_rules 3 | rules: 4 | - expr: | 5 | sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) 6 | record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m 7 | -------------------------------------------------------------------------------- /assets/haproxy/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 
HAProxyAlerts 3 | rules: 4 | - alert: HAProxyDroppingLogs 5 | annotations: 6 | description: HAProxy {{$labels.job}} on {{$labels.instance}} is dropping logs. 7 | summary: HAProxy is dropping logs. 8 | expr: rate(haproxy_process_dropped_logs_total[5m]) != 0 9 | for: 10m 10 | labels: 11 | severity: critical 12 | - alert: HAProxyBackendCheckFlapping 13 | annotations: 14 | description: HAProxy {{$labels.job}} backend {{$labels.proxy}} on {{$labels.instance}} 15 | has flapping checks. 16 | summary: HAProxy backend checks are flapping. 17 | expr: rate(haproxy_backend_check_up_down_total[5m]) != 0 18 | for: 10m 19 | labels: 20 | severity: critical 21 | - alert: HAProxyServerCheckFlapping 22 | annotations: 23 | description: HAProxy {{$labels.job}} server {{$labels.server}} on {{$labels.instance}} 24 | has flapping checks. 25 | summary: HAProxy server checks are flapping. 26 | expr: rate(haproxy_server_check_up_down_total[5m]) != 0 27 | for: 10m 28 | labels: 29 | severity: critical 30 | -------------------------------------------------------------------------------- /assets/haproxy/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: [] 2 | -------------------------------------------------------------------------------- /assets/harbor/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: Harbor 3 | rules: 4 | - alert: HarborComponentStatus 5 | annotations: 6 | description: Harbor {{ $labels.component }} has been down for more than 5 minutes 7 | summary: Harbor Component is Down. 8 | expr: | 9 | harbor_up == 0 10 | for: 5m 11 | labels: 12 | severity: critical 13 | - alert: HarborProjectQuataExceeded 14 | annotations: 15 | description: Harbor project {{ $labels.project_name }} has exceeded the configured 16 | disk usage quota for the past 15 minutes 17 | summary: Harbor project exceeds disk usage quota. 18 | expr: | 19 | harbor_project_quota_usage_byte > harbor_project_quota_byte and on(harbor_project_quota_usage_byte) harbor_project_quota_byte != -1 20 | for: 15m 21 | labels: 22 | severity: warning 23 | - alert: HarborHighErrorRate 24 | annotations: 25 | description: HTTP Requests of {{ $labels.instance }} are having a high Error 26 | rate 27 | summary: Harbor high error rate. 28 | expr: sum(rate(harbor_core_http_request_total{code=~"4..|5.."}[5m]))/sum(rate(harbor_core_http_request_total[5m])) 29 | > 0.15 30 | for: 5m 31 | labels: 32 | severity: warning 33 | -------------------------------------------------------------------------------- /assets/harbor/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/hass/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/hass/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ibm-mq/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: ibm-mq-alerts 3 | rules: 4 | - alert: IBMMQExpiredMessages 5 | annotations: 6 | description: The number of expired messages in the {{$labels.qmgr}} is {{$labels.value}} 7 | which is above the threshold of 2. 
8 | summary: There are expired messages, which imply that application resilience 9 | is failing. 10 | expr: | 11 | sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_expired_message_count) > 2 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: IBMMQStaleMessages 16 | annotations: 17 | description: A stale message with an age of {{$labels.value}} has been sitting 18 | in the {{$labels.queue}} which is above the threshold of 300s. 19 | summary: Stale messages have been detected. 20 | expr: | 21 | sum without (description,instance,job,platform) (ibmmq_queue_oldest_message_age) >= 300 22 | for: 5m 23 | labels: 24 | severity: warning 25 | - alert: IBMMQLowDiskSpace 26 | annotations: 27 | description: The amount of disk space available for {{$labels.qmgr}} is at {{$labels.value}}% 28 | which is below the threshold of 5%. 29 | summary: There is limited disk available for a queue manager. 30 | expr: | 31 | sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_queue_manager_file_system_free_space_percentage) <= 5 32 | for: 5m 33 | labels: 34 | severity: critical 35 | - alert: IBMMQHighQueueManagerCpuUsage 36 | annotations: 37 | description: The amount of CPU usage for the queue manager {{$labels.qmgr}} 38 | is at {{$labels.value}}% which is above the threshold of 85%. 39 | summary: There is a high CPU usage estimate for a queue manager. 40 | expr: | 41 | sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_user_cpu_time_estimate_for_queue_manager_percentage) >= 85 42 | for: 5m 43 | labels: 44 | severity: critical 45 | -------------------------------------------------------------------------------- /assets/ibm-mq/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/influxdb/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ingress-nginx-mixin/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: nginx.rules 3 | rules: 4 | - alert: NginxConfigReloadFailed 5 | annotations: 6 | dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{ 7 | $labels.job }}&var-controller_class={{ $labels.controller_class }} 8 | description: Nginx config reload failed for the controller with the class {{ 9 | $labels.controller_class }}. 10 | summary: Nginx config reload failed. 11 | expr: | 12 | sum( 13 | nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"} 14 | ) by (cluster, job, controller_class) 15 | == 0 16 | for: 5m 17 | labels: 18 | severity: warning 19 | - alert: NginxHighHttp4xxErrorRate 20 | annotations: 21 | dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{ 22 | $labels.exported_namespace }}&var-ingress={{ $labels.ingress }} 23 | description: More than 5% HTTP requests with status 4xx for {{ $labels.exported_namespace 24 | }}/{{ $labels.ingress }} the past 5m. 25 | summary: Nginx high HTTP 4xx error rate. 
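The ingress-nginx error-rate alerts above are templated per exported_namespace and ingress, so each misbehaving ingress object produces its own alert (the 4xx rate expression, which aggregates on those labels, continues below). A sketch of an Alertmanager route that groups notifications along the same labels, with an illustrative receiver name:

# alertmanager.yml fragment -- groups per-ingress alerts into one notification.
route:
  receiver: default                      # illustrative receiver
  group_by: ['alertname', 'cluster', 'exported_namespace', 'ingress']
  group_wait: 30s
  group_interval: 5m
receivers:
  - name: default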
26 | expr: | 27 | ( 28 | sum( 29 | rate( 30 | nginx_ingress_controller_requests{ 31 | job=~"ingress-nginx-controller-metrics", 32 | status=~"^4.*", 33 | ingress!~"" 34 | }[5m] 35 | ) 36 | ) by (cluster, exported_namespace, ingress) 37 | / 38 | sum( 39 | rate( 40 | nginx_ingress_controller_requests{ 41 | job=~"ingress-nginx-controller-metrics", 42 | ingress!~"" 43 | }[5m] 44 | ) 45 | ) by (cluster, exported_namespace, ingress) 46 | * 100 47 | ) > 5 48 | for: 1m 49 | labels: 50 | severity: info 51 | - alert: NginxHighHttp5xxErrorRate 52 | annotations: 53 | dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{ 54 | $labels.exported_namespace }}&var-ingress={{ $labels.ingress }} 55 | description: More than 5% HTTP requests with status 5xx for {{ $labels.exported_namespace 56 | }}/{{ $labels.ingress }} the past 5m. 57 | summary: Nginx high HTTP 5xx error rate. 58 | expr: | 59 | ( 60 | sum( 61 | rate( 62 | nginx_ingress_controller_requests{ 63 | job=~"ingress-nginx-controller-metrics", 64 | status=~"^5.*", 65 | ingress!~"" 66 | }[5m] 67 | ) 68 | ) by (cluster, exported_namespace, ingress) 69 | / 70 | sum( 71 | rate( 72 | nginx_ingress_controller_requests{ 73 | job=~"ingress-nginx-controller-metrics", 74 | ingress!~"" 75 | }[5m] 76 | ) 77 | ) by (cluster, exported_namespace, ingress) 78 | * 100 79 | ) > 5 80 | for: 1m 81 | labels: 82 | severity: warning 83 | -------------------------------------------------------------------------------- /assets/ingress-nginx-mixin/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: [] 2 | -------------------------------------------------------------------------------- /assets/istio/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/jaeger/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/jenkins/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/jenkins/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/jira/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: alert.rules 3 | rules: 4 | - alert: LicenseExpired 5 | annotations: 6 | description: The JIRA license has expired. 7 | summary: JIRA license expired. 8 | expr: jira_license_expiry_days_gauge <= 0 9 | for: 1m 10 | labels: 11 | severity: critical 12 | - alert: LicenseWarning 13 | annotations: 14 | description: The JIRA license will expire in less than one week. 15 | summary: License expiring soon. 16 | expr: jira_license_expiry_days_gauge <= 7 and jira_license_expiry_days_gauge > 17 | 0 18 | for: 1m 19 | labels: 20 | severity: warning 21 | - alert: NoUserCapacity 22 | annotations: 23 | description: There is no more capacity for additional users to be added to the 24 | system. 25 | summary: All available accounts are taken. 
26 | expr: jira_all_users_gauge/jira_allowed_users_gauge == 1 27 | for: 1m 28 | labels: 29 | severity: critical 30 | - alert: EmailErrorsHigh 31 | annotations: 32 | description: More than 1% of emails have resulted in an error in the past minute. 33 | summary: Email errors are high. 34 | expr: jira_mail_queue_error_gauge /jira_mail_queue_gauge > 0.01 35 | for: 1m 36 | labels: 37 | severity: critical 38 | -------------------------------------------------------------------------------- /assets/jira/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/jvm/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: jvm-jvm-alerts 3 | rules: 4 | - alert: JvmMemoryFillingUp 5 | annotations: 6 | description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the 7 | last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%. 8 | summary: JVM heap memory filling up. 9 | expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without 10 | (id) (jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80 11 | for: 5m 12 | keep_firing_for: 5m 13 | labels: 14 | severity: warning 15 | - alert: JvmThreadsDeadlocked 16 | annotations: 17 | description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}} 18 | are in a cyclic dependency with each other. The restart is required to resolve 19 | the deadlock.' 20 | summary: JVM deadlock detected. 21 | expr: (jvm_threads_deadlocked{}) > 0 22 | for: 2m 23 | keep_firing_for: 5m 24 | labels: 25 | severity: critical 26 | -------------------------------------------------------------------------------- /assets/jvm/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/kafka/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/kube-cockroachdb/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: cockroachdb 3 | rules: 4 | - alert: CockroachInstanceFlapping 5 | annotations: 6 | description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted 7 | {{ $value }} time(s) in 10m.' 8 | summary: CockroachDB instances have restarted in the last 10 minutes. 9 | expr: | 10 | resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5 11 | for: 1m 12 | labels: 13 | severity: warning 14 | - alert: CockroachLivenessMismatch 15 | annotations: 16 | description: Liveness mismatch for {{ $labels.instance }} 17 | summary: CockroachDB has liveness mismatches. 18 | expr: | 19 | (cockroachdb_liveness_livenodes{job="cockroachdb-public"}) 20 | != 21 | ignoring(instance) group_left() (count by(cluster, job) (up{job="cockroachdb-public"} == 1)) 22 | for: 5m 23 | labels: 24 | severity: warning 25 | - alert: CockroachVersionMismatch 26 | annotations: 27 | description: Cluster {{ $labels.cluster }} running {{ $value }} different versions 28 | summary: CockroachDB cluster is running different versions. 
29 | expr: | 30 | count by(cluster) (count_values by(tag, cluster) ("version", cockroachdb_build_timestamp{job="cockroachdb-public"})) > 1 31 | for: 1h 32 | labels: 33 | severity: warning 34 | - alert: CockroachStoreDiskLow 35 | annotations: 36 | description: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ 37 | $value }} available disk fraction 38 | summary: CockroachDB is at low disk capacity. 39 | expr: | 40 | :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15 41 | for: 30m 42 | labels: 43 | severity: critical 44 | - alert: CockroachClusterDiskLow 45 | annotations: 46 | description: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction 47 | summary: CockroachDB cluster is at critically low disk capacity. 48 | expr: | 49 | cluster:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.2 50 | for: 30m 51 | labels: 52 | severity: critical 53 | - alert: CockroachUnavailableRanges 54 | annotations: 55 | description: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges 56 | summary: CockroachDB has unavailable ranges. 57 | expr: | 58 | (sum by(instance, cluster) (cockroachdb_ranges_unavailable{job="cockroachdb-public"})) > 0 59 | for: 10m 60 | labels: 61 | severity: critical 62 | - alert: CockroachNoLeaseRanges 63 | annotations: 64 | description: Instance {{ $labels.instance }} has {{ $value }} ranges without 65 | leases 66 | summary: CockroachDB has ranges without leases. 67 | expr: | 68 | (sum by(instance, cluster) (cockroachdb_replicas_leaders_not_leaseholders{job="cockroachdb-public"})) > 0 69 | for: 10m 70 | labels: 71 | severity: warning 72 | - alert: CockroachHighOpenFDCount 73 | annotations: 74 | description: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value 75 | }} fraction used' 76 | summary: CockroachDB has too many open file descriptors. 
77 | expr: | 78 | cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8 79 | for: 10m 80 | labels: 81 | severity: warning 82 | -------------------------------------------------------------------------------- /assets/kube-cockroachdb/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: cockroachdb.rules 3 | rules: 4 | - expr: | 5 | sum without(store) (cockroachdb_capacity{job="cockroachdb-public"}) 6 | record: node:cockroachdb_capacity:sum 7 | - expr: | 8 | sum without(instance) (node:cockroachdb_capacity:sum{job="cockroachdb-public"}) 9 | record: cluster:cockroachdb_capacity:sum 10 | - expr: | 11 | sum without(store) (cockroachdb_capacity_available{job="cockroachdb-public"}) 12 | record: node:cockroachdb_capacity_available:sum 13 | - expr: | 14 | sum without(instance) (node:cockroachdb_capacity_available:sum{job="cockroachdb-public"}) 15 | record: cluster:cockroachdb_capacity_available:sum 16 | - expr: | 17 | cockroachdb_capacity_available{job="cockroachdb-public"} / cockroachdb_capacity{job="cockroachdb-public"} 18 | record: :cockroachdb_capacity_available:ratio 19 | - expr: | 20 | node:cockroachdb_capacity_available:sum{job="cockroachdb-public"} / node:cockroachdb_capacity:sum{job="cockroachdb-public"} 21 | record: node:cockroachdb_capacity_available:ratio 22 | - expr: | 23 | cluster:cockroachdb_capacity_available:sum{job="cockroachdb-public"} / cluster:cockroachdb_capacity:sum{job="cockroachdb-public"} 24 | record: cluster:cockroachdb_capacity_available:ratio 25 | -------------------------------------------------------------------------------- /assets/kube-state-metrics/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: kube-state-metrics 3 | rules: 4 | - alert: KubeStateMetricsListErrors 5 | annotations: 6 | description: kube-state-metrics is experiencing errors at an elevated rate in 7 | list operations. This is likely causing it to not be able to expose metrics 8 | about Kubernetes objects correctly or at all. 9 | summary: kube-state-metrics is experiencing errors in list operations. 10 | expr: | 11 | (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) 12 | / 13 | sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) 14 | > 0.01 15 | for: 15m 16 | labels: 17 | severity: critical 18 | - alert: KubeStateMetricsWatchErrors 19 | annotations: 20 | description: kube-state-metrics is experiencing errors at an elevated rate in 21 | watch operations. This is likely causing it to not be able to expose metrics 22 | about Kubernetes objects correctly or at all. 23 | summary: kube-state-metrics is experiencing errors in watch operations. 24 | expr: | 25 | (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) 26 | / 27 | sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) 28 | > 0.01 29 | for: 15m 30 | labels: 31 | severity: critical 32 | - alert: KubeStateMetricsShardingMismatch 33 | annotations: 34 | description: kube-state-metrics pods are running with different --total-shards 35 | configuration, some Kubernetes objects may be exposed multiple times or not 36 | exposed at all. 37 | summary: kube-state-metrics sharding is misconfigured. 
38 | expr: | 39 | stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 40 | for: 15m 41 | labels: 42 | severity: critical 43 | - alert: KubeStateMetricsShardsMissing 44 | annotations: 45 | description: kube-state-metrics shards are missing, some Kubernetes objects 46 | are not being exposed. 47 | summary: kube-state-metrics shards are missing. 48 | expr: | 49 | 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 50 | - 51 | sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) 52 | != 0 53 | for: 15m 54 | labels: 55 | severity: critical 56 | -------------------------------------------------------------------------------- /assets/kube-state-metrics/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/kubernetes-autoscaling/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/memcached/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: memcached 3 | rules: 4 | - alert: MemcachedDown 5 | annotations: 6 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} is 7 | down for more than 15 minutes. 8 | summary: Memcached instance is down. 9 | expr: | 10 | memcached_up == 0 11 | for: 15m 12 | labels: 13 | severity: critical 14 | - alert: MemcachedConnectionLimitApproaching 15 | annotations: 16 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection 17 | usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes. 18 | summary: Memcached max connection limit is approaching. 19 | expr: | 20 | (memcached_current_connections / memcached_max_connections * 100) > 80 21 | for: 15m 22 | labels: 23 | severity: warning 24 | - alert: MemcachedConnectionLimitApproaching 25 | annotations: 26 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection 27 | usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes. 28 | summary: Memcached connections at critical level. 29 | expr: | 30 | (memcached_current_connections / memcached_max_connections * 100) > 95 31 | for: 15m 32 | labels: 33 | severity: critical 34 | - alert: MemcachedOutOfMemoryErrors 35 | annotations: 36 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} has 37 | OutOfMemory errors for at least 15 minutes, current rate is {{ printf "%0.0f" 38 | $value }} 39 | summary: Memcached has OutOfMemory errors. 
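The two MemcachedConnectionLimitApproaching rules above deliberately share one alert name with different thresholds and severities (80% warning, 95% critical); Prometheus permits duplicate alert names, so both copies can fire at the same time. A sketch of an Alertmanager inhibit rule that mutes the warning copy while the critical one is firing for the same target (the out-of-memory expression continues below):

# alertmanager.yml fragment -- silence the warning copy when critical fires.
inhibit_rules:
  - source_matchers:
      - alertname="MemcachedConnectionLimitApproaching"
      - severity="critical"
    target_matchers:
      - alertname="MemcachedConnectionLimitApproaching"
      - severity="warning"
    equal: ['job', 'instance']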
40 | expr: | 41 | sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0 42 | for: 15m 43 | labels: 44 | severity: warning 45 | -------------------------------------------------------------------------------- /assets/memcached/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/microsoft-iis/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: microsoft-iis 3 | rules: 4 | - alert: MicrosoftIISHighNumberOfRejectedAsyncIORequests 5 | annotations: 6 | description: | 7 | The number of rejected async IO requests is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.site }} which is above the threshold of 20. 8 | summary: There are a high number of rejected async I/O requests for a site. 9 | expr: | 10 | increase(windows_iis_rejected_async_io_requests_total[5m]) > 20 11 | for: 5m 12 | labels: 13 | severity: warning 14 | - alert: MicrosoftIISHighNumberOf5xxRequestErrors 15 | annotations: 16 | description: | 17 | The number of 5xx request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 5. 18 | summary: There are a high number of 5xx request errors for an application. 19 | expr: | 20 | sum without (pid, status_code)(increase(windows_iis_worker_request_errors_total{status_code=~"5.*"}[5m])) > 5 21 | for: 5m 22 | labels: 23 | severity: critical 24 | - alert: MicrosoftIISLowSuccessRateForWebsocketConnections 25 | annotations: 26 | description: | 27 | The success rate for websocket connections is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 80. 28 | summary: There is a low success rate for websocket connections for an application. 29 | expr: | 30 | sum without (pid) (increase(windows_iis_worker_websocket_connection_accepted_total[5m]) / clamp_min(increase(windows_iis_worker_websocket_connection_attempts_total[5m]),1)) * 100 > 80 31 | for: 5m 32 | labels: 33 | severity: critical 34 | - alert: MicrosoftIISThreadpoolUtilizationNearingMax 35 | annotations: 36 | description: | 37 | The threadpool utilization is at {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 90. 38 | summary: The thread pool utilization is nearing max capacity. 39 | expr: | 40 | sum without (pid, state)(windows_iis_worker_threads / windows_iis_worker_max_threads) * 100 > 90 41 | for: 5m 42 | labels: 43 | severity: critical 44 | - alert: MicrosoftIISHighNumberOfWorkerProcessFailures 45 | annotations: 46 | description: | 47 | The number of worker process failures is at {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 10. 48 | summary: There are a high number of worker process failures for an application. 
49 | expr: | 50 | increase(windows_iis_total_worker_process_failures[5m]) > 10 51 | for: 5m 52 | labels: 53 | severity: warning 54 | -------------------------------------------------------------------------------- /assets/microsoft-iis/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/mongodb-atlas/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/mongodb/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/mysql/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/mysql/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/nginx/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/nginx/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/node-exporter/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: node-exporter.rules 3 | rules: 4 | - expr: | 5 | count without (cpu, mode) ( 6 | node_cpu_seconds_total{job="node",mode="idle"} 7 | ) 8 | record: instance:node_num_cpu:sum 9 | - expr: | 10 | 1 - avg without (cpu) ( 11 | sum without (mode) (rate(node_cpu_seconds_total{job="node", mode=~"idle|iowait|steal"}[5m])) 12 | ) 13 | record: instance:node_cpu_utilisation:rate5m 14 | - expr: | 15 | ( 16 | node_load1{job="node"} 17 | / 18 | instance:node_num_cpu:sum{job="node"} 19 | ) 20 | record: instance:node_load1_per_cpu:ratio 21 | - expr: | 22 | 1 - ( 23 | ( 24 | node_memory_MemAvailable_bytes{job="node"} 25 | or 26 | ( 27 | node_memory_Buffers_bytes{job="node"} 28 | + 29 | node_memory_Cached_bytes{job="node"} 30 | + 31 | node_memory_MemFree_bytes{job="node"} 32 | + 33 | node_memory_Slab_bytes{job="node"} 34 | ) 35 | ) 36 | / 37 | node_memory_MemTotal_bytes{job="node"} 38 | ) 39 | record: instance:node_memory_utilisation:ratio 40 | - expr: | 41 | rate(node_vmstat_pgmajfault{job="node"}[5m]) 42 | record: instance:node_vmstat_pgmajfault:rate5m 43 | - expr: | 44 | rate(node_disk_io_time_seconds_total{job="node", device!=""}[5m]) 45 | record: instance_device:node_disk_io_time_seconds:rate5m 46 | - expr: | 47 | rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m]) 48 | record: instance_device:node_disk_io_time_weighted_seconds:rate5m 49 | - expr: | 50 | sum without (device) ( 51 | rate(node_network_receive_bytes_total{job="node", device!="lo"}[5m]) 52 | ) 53 | record: instance:node_network_receive_bytes_excluding_lo:rate5m 54 | - expr: | 55 | sum without (device) ( 56 | rate(node_network_transmit_bytes_total{job="node", device!="lo"}[5m]) 57 | ) 58 | record: instance:node_network_transmit_bytes_excluding_lo:rate5m 59 | - expr: | 60 | 
sum without (device) ( 61 | rate(node_network_receive_drop_total{job="node", device!="lo"}[5m]) 62 | ) 63 | record: instance:node_network_receive_drop_excluding_lo:rate5m 64 | - expr: | 65 | sum without (device) ( 66 | rate(node_network_transmit_drop_total{job="node", device!="lo"}[5m]) 67 | ) 68 | record: instance:node_network_transmit_drop_excluding_lo:rate5m 69 | -------------------------------------------------------------------------------- /assets/nodejs/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: NodejsAlerts 3 | rules: 4 | - alert: NodejsDown 5 | annotations: 6 | description: Node.js {{$labels.job}} on {{$labels.instance}} is not up. 7 | summary: Node.js not up. 8 | expr: absent(nodejs_version_info) or (sum by (version) (nodejs_version_info) < 9 | 1) 10 | for: 0m 11 | labels: 12 | severity: critical 13 | -------------------------------------------------------------------------------- /assets/nodejs/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/nomad/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/nomad/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/nsq/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: nsq 3 | rules: 4 | - alert: NsqTopicDepthIncreasing 5 | annotations: 6 | description: | 7 | Topic {{ $labels.topic }} depth is higher than 100. The current queue is {{ $value }}. 8 | summary: Topic depth is increasing. 9 | expr: | 10 | sum by (topic) (nsq_topic_depth) > 100 11 | for: 5m 12 | labels: 13 | severity: critical 14 | - alert: NsqChannelDepthIncreasing 15 | annotations: 16 | description: | 17 | Channel {{ $labels.channel }} depth in topic {{ $labels.topic }} is higher than 100. The current queue is {{ $value }}. 18 | summary: Topic channel depth is increasing. 19 | expr: | 20 | sum by (topic) (nsq_topic_channel_backend_depth) > 100 21 | for: 5m 22 | labels: 23 | severity: critical 24 | -------------------------------------------------------------------------------- /assets/nsq/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/openldap/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: openldap-alerts 3 | rules: 4 | - alert: OpenLDAPConnectionSpike 5 | annotations: 6 | description: There are {{ printf "%.0f" $value }} OpenLDAP connections on instance 7 | {{$labels.instance}}, which is above the threshold of 100. 8 | summary: A sudden spike in OpenLDAP connections indicates potential high usage 9 | or security issues. 
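The node-exporter.rules group above precomputes `instance:*` convenience series for CPU, memory, disk and network rates. A sketch of a hypothetical alert that consumes one of those recorded series instead of re-deriving it from the raw counters; the group name, alert name and 90% threshold are made up (the OpenLDAP connection-spike expression continues below):

# Hypothetical consumer of the recording rules above.
groups:
  - name: node-exporter-consumers
    rules:
      - alert: NodeHighCpuUtilisation
        expr: instance:node_cpu_utilisation:rate5m{job="node"} > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: CPU utilisation has been above 90% for 15 minutes.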
10 | expr: | 11 | increase(openldap_monitor_counter_object{dn="cn=Current,cn=Connections,cn=Monitor"}[5m]) > 100 12 | for: 5m 13 | labels: 14 | severity: warning 15 | - alert: OpenLDAPHighSearchOperationRateSpike 16 | annotations: 17 | description: The rate of search operations in OpenLDAP on instance {{$labels.instance}} 18 | has increased by {{ printf "%.0f" $value }} percent in the last 5 minutes, 19 | compared to the average over the last 15 minutes, which is above the threshold 20 | of 200 percent. 21 | summary: A significant spike in OpenLDAP search operations indicates inefficient 22 | queries, potential abuse, or unintended heavy load. 23 | expr: "100 * (\n rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[5m]) 24 | \n / \n clamp_min(rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[15m] 25 | offset 5m), 0.0001)\n) > 200\n" 26 | for: 5m 27 | labels: 28 | severity: warning 29 | - alert: OpenLDAPDialFailures 30 | annotations: 31 | description: LDAP dial failures on instance {{$labels.instance}} have increased 32 | by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold 33 | of 10. 34 | summary: Significant increase in LDAP dial failures indicates network issues, 35 | problems with the LDAP service, or configuration errors that may lead to service 36 | unavailability. 37 | expr: | 38 | increase(openldap_dial{result!="ok"}[10m]) > 10 39 | for: 10m 40 | labels: 41 | severity: warning 42 | - alert: OpenLDAPBindFailureRateIncrease 43 | annotations: 44 | description: LDAP bind failures on instance {{$labels.instance}} have increased 45 | by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold 46 | of 10. 47 | summary: Significant increase in LDAP bind failures indicates authentication 48 | issues, potential security threats or problems with user directories. 49 | expr: | 50 | increase(openldap_bind{result!="ok"}[10m]) > 10 51 | for: 10m 52 | labels: 53 | severity: warning 54 | -------------------------------------------------------------------------------- /assets/openldap/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/opensearch/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/openstack/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/oracledb/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: OracleDBAlerts 3 | rules: 4 | - alert: OracledbReachingSessionLimit 5 | annotations: 6 | description: '{{ printf "%.2f" $value }}% of sessions are being utilized which 7 | is above the threshold 85%. This could mean that {{$labels.instance}} is being 8 | overutilized.' 9 | summary: The number of sessions being utilized exceeded 85%. 10 | expr: | 11 | oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: OracledbReachingProcessLimit 16 | annotations: 17 | description: '{{ printf "%.2f" $value }} of processes are being utilized which 18 | is above thethreshold 85%. 
This could potentially mean that {{$labels.instance}} 19 | runs out of processes it can spin up.' 20 | summary: The number of processess being utilized exceeded the threshold of 85%. 21 | expr: | 22 | oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 23 | for: 5m 24 | labels: 25 | severity: critical 26 | - alert: OracledbTablespaceReachingCapacity 27 | annotations: 28 | description: '{{ printf "%.2f" $value }}% of bytes are being utilized by the 29 | tablespace {{$labels.tablespace}} on the instance {{$labels.instance}}, which 30 | is above the threshold 85%.' 31 | summary: A tablespace is exceeding more than 85% of its maximum allotted space. 32 | expr: | 33 | oracledb_tablespace_bytes / oracledb_tablespace_max_bytes * 100 > 85 34 | for: 5m 35 | labels: 36 | severity: critical 37 | -------------------------------------------------------------------------------- /assets/oracledb/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/pgbouncer/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: pgbouncer 3 | rules: 4 | - alert: PGBouncerHighNumberClientWaitingConnections 5 | annotations: 6 | description: | 7 | The number of clients waiting for connections on {{ $labels.instance }} is now above 20. The current value is {{ $value | printf "%.2f" }}. 8 | summary: May indicate a bottleneck in connection pooling where too many clients 9 | are waiting for available server connections. 10 | expr: | 11 | pgbouncer_pools_client_waiting_connections{job="integrations/pgbouncer"} > 20 12 | for: 5m 13 | labels: 14 | severity: warning 15 | - alert: PGBouncerHighClientWaitTime 16 | annotations: 17 | description: | 18 | The wait time for user connections on {{ $labels.instance }}, is above 15. The current value is {{ $value | printf "%.2f" }}. 19 | summary: Clients are experiencing significant delays, which could indicate issues 20 | with connection pool saturation or server performance. 21 | expr: | 22 | pgbouncer_pools_client_maxwait_seconds{job="integrations/pgbouncer"} > 15 23 | for: 5m 24 | labels: 25 | severity: warning 26 | - alert: PGBouncerHighServerConnectionSaturationWarning 27 | annotations: 28 | description: | 29 | User connection capacity on {{ $labels.instance }}, is above 80%. The current value is {{ $value | printf "%.2f" }}. 30 | summary: PGBouncer is nearing user connection capacity. 31 | expr: | 32 | 100 * (sum without (database, user) (pgbouncer_pools_server_active_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_idle_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_used_connections{job="integrations/pgbouncer"}) / clamp_min(pgbouncer_config_max_user_connections{job="integrations/pgbouncer"},1)) > 80 33 | for: 5m 34 | labels: 35 | severity: warning 36 | - alert: PGBouncerHighServerConnectionSaturationCritical 37 | annotations: 38 | description: | 39 | User connection capacity on {{ $labels.instance }}, is above 90%. The current value is {{ $value | printf "%.2f" }}. 40 | summary: PGBouncer is nearing critical levels of user connection capacity. 
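Unlike most groups in these assets, the pgbouncer rules above hard-code job="integrations/pgbouncer" in every selector, so they only match series scraped under exactly that job name. A sketch of a matching scrape job, assuming the community pgbouncer exporter listening on its default port 9127; the target address is illustrative (the critical-saturation expression continues below):

# prometheus.yml fragment -- job_name must match the hard-coded selector.
scrape_configs:
  - job_name: integrations/pgbouncer
    static_configs:
      - targets: ['pgbouncer-exporter:9127']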
41 | expr: | 42 | 100 * (sum without (database, user) (pgbouncer_pools_server_active_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_idle_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_used_connections{job="integrations/pgbouncer"}) / clamp_min(pgbouncer_config_max_user_connections{job="integrations/pgbouncer"},1)) > 90 43 | for: 5m 44 | labels: 45 | severity: critical 46 | -------------------------------------------------------------------------------- /assets/pgbouncer/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/postgres-exporter/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/presto/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/prometheus-operator/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/prometheus/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/promscale/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/promtail/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: promtail_alerts 3 | rules: 4 | - alert: PromtailRequestsErrors 5 | annotations: 6 | description: | 7 | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. 8 | summary: Promtail request error rate is high. 9 | expr: | 10 | 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) 11 | / 12 | sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) 13 | > 10 14 | for: 15m 15 | labels: 16 | severity: critical 17 | - alert: PromtailRequestLatency 18 | annotations: 19 | description: | 20 | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 21 | summary: Promtail request latency P99 is high. 22 | expr: | 23 | job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1 24 | for: 15m 25 | labels: 26 | severity: critical 27 | - alert: PromtailFileMissing 28 | annotations: 29 | description: | 30 | {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed. 31 | summary: Promtail cannot find a file it should be tailing. 
32 | expr: | 33 | promtail_file_bytes_total unless promtail_read_bytes_total 34 | for: 15m 35 | labels: 36 | severity: warning 37 | -------------------------------------------------------------------------------- /assets/promtail/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: promtail_rules 3 | rules: 4 | - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) 5 | by (le, job)) 6 | record: job:promtail_request_duration_seconds:99quantile 7 | - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) 8 | by (le, job)) 9 | record: job:promtail_request_duration_seconds:50quantile 10 | - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m])) 11 | by (job) 12 | record: job:promtail_request_duration_seconds:avg 13 | - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job) 14 | record: job:promtail_request_duration_seconds_bucket:sum_rate 15 | - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) 16 | record: job:promtail_request_duration_seconds_sum:sum_rate 17 | - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job) 18 | record: job:promtail_request_duration_seconds_count:sum_rate 19 | - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) 20 | by (le, job, namespace)) 21 | record: job_namespace:promtail_request_duration_seconds:99quantile 22 | - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) 23 | by (le, job, namespace)) 24 | record: job_namespace:promtail_request_duration_seconds:50quantile 25 | - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace) 26 | / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace) 27 | record: job_namespace:promtail_request_duration_seconds:avg 28 | - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace) 29 | record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate 30 | - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace) 31 | record: job_namespace:promtail_request_duration_seconds_sum:sum_rate 32 | - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace) 33 | record: job_namespace:promtail_request_duration_seconds_count:sum_rate 34 | - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) 35 | by (le, job, status_code, namespace)) 36 | record: job_status_code_namespace:promtail_request_duration_seconds:99quantile 37 | - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) 38 | by (le, job, status_code, namespace)) 39 | record: job_status_code_namespace:promtail_request_duration_seconds:50quantile 40 | - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, 41 | namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, 42 | status_code, namespace) 43 | record: job_status_code_namespace:promtail_request_duration_seconds:avg 44 | - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, 45 | namespace) 46 | record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate 47 | - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, 48 | namespace) 49 | record: 
job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate 50 | - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, 51 | namespace) 52 | record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate 53 | -------------------------------------------------------------------------------- /assets/python-runtime/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/python-runtime/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/rabbitmq/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: RabbitMQClusterAlerts 3 | rules: 4 | - alert: RabbitMQMemoryHigh 5 | annotations: 6 | description: A node {{ $labels.instance }} is using more than 90% of allocated 7 | RAM. 8 | summary: RabbitMQ memory usage is high. 9 | expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes 10 | * 100 > 90 11 | for: 5m 12 | labels: 13 | severity: warning 14 | - alert: RabbitMQFileDescriptorsUsage 15 | annotations: 16 | description: A node {{ $labels.instance }} is using more than 90% of file descriptors. 17 | summary: RabbitMQ file descriptors usage is high. 18 | expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 19 | for: 5m 20 | labels: 21 | severity: warning 22 | - alert: RabbitMQUnroutableMessages 23 | annotations: 24 | description: A queue has unroutable messages on {{ $labels.instance }}. 25 | summary: A RabbitMQ queue has unroutable messages. 26 | expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or 27 | increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0 28 | for: 5m 29 | labels: 30 | severity: warning 31 | - alert: RabbitMQNodeNotDistributed 32 | annotations: 33 | description: "Distribution link state is not 'up' on {{ $labels.instance }}, 34 | current value is {{ $value }}. \nNote: The state is represented as a numerical 35 | value where pending=1, up_pending=2 and up=3." 36 | summary: RabbitMQ node not distributed, link state is down. 
37 | expr: erlang_vm_dist_node_state{rabbitmq_cluster!=""} < 3 38 | for: 5m 39 | labels: 40 | severity: critical 41 | -------------------------------------------------------------------------------- /assets/rabbitmq/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/rclone/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/rclone/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/redis/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/redis/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ruby/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ruby/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/sap-hana/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/sealed-secrets/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: sealed-secrets 3 | rules: 4 | - alert: SealedSecretsUnsealErrorHigh 5 | annotations: 6 | description: High number of errors during unsealing Sealed Secrets in {{ $labels.namespace 7 | }} namespace. 
8 | runbook_url: https://github.com/bitnami-labs/sealed-secrets 9 | summary: Sealed Secrets Unseal Error High 10 | expr: | 11 | sum by (reason, namespace) (rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0 12 | labels: 13 | severity: warning 14 | -------------------------------------------------------------------------------- /assets/sealed-secrets/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: [] 2 | -------------------------------------------------------------------------------- /assets/snmp/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/spark/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/spark/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/spinnaker/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: igor 3 | rules: 4 | - alert: PollingMonitorItemsOverThreshold 5 | annotations: 6 | description: '{{ $labels.monitor }} polling monitor for {{ $labels.partition 7 | }} threshold exceeded, preventing pipeline triggers.' 8 | runbook_url: https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds 9 | summary: Polling monitor item threshold exceeded. 10 | expr: sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0 11 | for: 5m 12 | labels: 13 | severity: critical 14 | -------------------------------------------------------------------------------- /assets/spinnaker/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: [] 2 | -------------------------------------------------------------------------------- /assets/spring-boot/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: jvm-micrometer-jvm-alerts 3 | rules: 4 | - alert: JvmMemoryFillingUp 5 | annotations: 6 | description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the 7 | last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%. 8 | summary: JVM heap memory filling up. 9 | expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without 10 | (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80 11 | for: 5m 12 | keep_firing_for: 5m 13 | labels: 14 | severity: warning 15 | -------------------------------------------------------------------------------- /assets/spring-boot/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/squid/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: squid 3 | rules: 4 | - alert: SquidHighPercentageOfHTTPServerRequestErrors 5 | annotations: 6 | description: | 7 | The percentage of HTTP server request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5. 8 | summary: There are a high number of HTTP server errors. 
9 | expr: | 10 | rate(squid_server_http_errors_total[5m]) / clamp_min(rate(squid_server_http_requests_total[5m]),1) * 100 > 5 11 | for: 5m 12 | labels: 13 | severity: critical 14 | - alert: SquidHighPercentageOfFTPServerRequestErrors 15 | annotations: 16 | description: | 17 | The percentage of FTP server request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5. 18 | summary: There are a high number of FTP server request errors. 19 | expr: | 20 | rate(squid_server_ftp_errors_total[5m]) / clamp_min(rate(squid_server_ftp_requests_total[5m]),1) * 100 > 5 21 | for: 5m 22 | labels: 23 | severity: critical 24 | - alert: SquidHighPercentageOfOtherServerRequestErrors 25 | annotations: 26 | description: | 27 | The percentage of other server request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5. 28 | summary: There are a high number of other server request errors. 29 | expr: | 30 | rate(squid_server_other_errors_total[5m]) / clamp_min(rate(squid_server_other_requests_total[5m]),1) * 100 > 5 31 | for: 5m 32 | labels: 33 | severity: critical 34 | - alert: SquidHighPercentageOfClientRequestErrors 35 | annotations: 36 | description: | 37 | The percentage of HTTP client request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5. 38 | summary: There are a high number of HTTP client request errors. 39 | expr: | 40 | rate(squid_client_http_errors_total[5m]) / clamp_min(rate(squid_client_http_requests_total[5m]),1) * 100 > 5 41 | for: 5m 42 | labels: 43 | severity: critical 44 | - alert: SquidLowCacheHitRatio 45 | annotations: 46 | description: | 47 | The cache hit ratio is {{ printf "%.0f" $value }} over the last 10m on {{ $labels.instance }} which is below the threshold of 85. 48 | summary: The cache hit ratio has fallen below the configured threshold (%). 49 | expr: | 50 | rate(squid_client_http_hits_total[10m]) / clamp_min(rate(squid_client_http_requests_total[10m]),1) * 100 < 85 51 | for: 10m 52 | labels: 53 | severity: warning 54 | -------------------------------------------------------------------------------- /assets/squid/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/supabase/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/supabase/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/tensorflow/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: TensorFlowServingAlerts 3 | rules: 4 | - alert: TensorFlowModelRequestHighErrorRate 5 | annotations: 6 | description: '{{ printf "%.2f" $value }}% of all model requests are not successful, 7 | which is above the threshold 30%, indicating a potentially larger issue for 8 | {{$labels.instance}}' 9 | summary: More than 30% of all model requests are not successful. 
10 | expr: | 11 | 100 * sum(rate(:tensorflow:serving:request_count{status!="OK"}[5m])) by (instance) / sum(rate(:tensorflow:serving:request_count[5m])) by (instance) > 30 12 | for: 5m 13 | labels: 14 | severity: critical 15 | - alert: TensorFlowServingHighBatchQueuingLatency 16 | annotations: 17 | description: Batch queuing latency greater than {{ printf "%.2f" $value }}µs, 18 | which is above the threshold 5000000µs, indicating a potentially larger issue 19 | for {{$labels.instance}} 20 | summary: Batch queuing latency more than 5000000µs. 21 | expr: | 22 | increase(:tensorflow:serving:batching_session:queuing_latency_sum[2m]) / increase(:tensorflow:serving:batching_session:queuing_latency_count[2m]) > 5000000 23 | for: 5m 24 | labels: 25 | severity: warning 26 | -------------------------------------------------------------------------------- /assets/tensorflow/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/traefik/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/traefik/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ubnt-edgerouter/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/ubnt-edgerouter/rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: ubnt.rules 3 | rules: 4 | - expr: label_join(ifAdminStatus,"nicename", ":", "ifName", "ifAlias") 5 | record: ifNiceName 6 | -------------------------------------------------------------------------------- /assets/varnish/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: varnish-cache 3 | rules: 4 | - alert: VarnishCacheLowCacheHitRate 5 | annotations: 6 | description: The Cache hit rate is {{ printf "%.0f" $value }} percent over the 7 | last 5 minutes on {{$labels.instance}}, which is below the threshold of 80 8 | percent. 9 | summary: Cache is not answering a sufficient percentage of read requests. 10 | expr: | 11 | increase(varnish_main_cache_hit[10m]) / (clamp_min((increase(varnish_main_cache_hit[10m]) + increase(varnish_main_cache_miss[10m])), 1)) * 100 < 80 and (increase(varnish_main_cache_hit[10m]) + increase(varnish_main_cache_miss[10m]) > 0) 12 | for: 10m 13 | labels: 14 | severity: warning 15 | - alert: VarnishCacheHighMemoryUsage 16 | annotations: 17 | description: Current Memory Usage is {{ printf "%.0f" $value }} percent on {{$labels.instance}}, 18 | which is above the threshold of 90 percent. 19 | summary: Varnish Cache is running low on available memory. 20 | expr: | 21 | (varnish_sma_g_bytes{type="s0"} / (varnish_sma_g_bytes{type="s0"} + varnish_sma_g_space{type="s0"})) * 100 > 90 22 | for: 5m 23 | labels: 24 | severity: warning 25 | - alert: VarnishCacheHighCacheEvictionRate 26 | annotations: 27 | description: The Cache has evicted {{ printf "%.0f" $value }} objects over the 28 | last 5 minutes on {{$labels.instance}}, which is above the threshold of 0. 29 | summary: The cache is evicting too many objects. 
30 | expr: | 31 | increase(varnish_main_n_lru_nuked[5m]) > 0 32 | for: 5m 33 | labels: 34 | severity: critical 35 | - alert: VarnishCacheHighSaturation 36 | annotations: 37 | description: The thread queue length is {{ printf "%.0f" $value }} over the 38 | last 5 minutes on {{$labels.instance}}, which is above the threshold of 0. 39 | summary: There are too many threads in queue, Varnish is saturated and responses 40 | are slowed. 41 | expr: | 42 | varnish_main_thread_queue_len > 0 43 | for: 5m 44 | labels: 45 | severity: warning 46 | - alert: VarnishCacheSessionsDropping 47 | annotations: 48 | description: The amount of sessions dropped is {{ printf "%.0f" $value }} over 49 | the last 5 minutes on {{$labels.instance}}, which is above the threshold of 50 | 0. 51 | summary: Incoming requests are being dropped due to a lack of free worker threads. 52 | expr: | 53 | increase(varnish_main_sessions{type="dropped"}[5m]) > 0 54 | for: 5m 55 | labels: 56 | severity: critical 57 | - alert: VarnishCacheBackendUnhealthy 58 | annotations: 59 | description: The amount of unhealthy backend statuses detected is {{ printf 60 | "%.0f" $value }} over the last 5 minutes on {{$labels.instance}}, which is 61 | above the threshold of 0. 62 | summary: Backend has been marked as unhealthy due to slow 200 responses. 63 | expr: | 64 | increase(varnish_main_backend_unhealthy[5m]) > 0 65 | for: 5m 66 | labels: 67 | severity: critical 68 | -------------------------------------------------------------------------------- /assets/varnish/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/vault/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/vault/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/velero/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: velero 3 | rules: 4 | - alert: VeleroBackupFailure 5 | annotations: 6 | description: | 7 | Backup failures detected on {{ $labels.instance }}. This could lead to data loss or inability to recover in case of a disaster. 8 | summary: Velero backup failures detected. 9 | expr: | 10 | increase(velero_backup_failure_total{job="integrations/velero"}[5m]) > 0 11 | for: 5m 12 | labels: 13 | severity: critical 14 | - alert: VeleroHighBackupDuration 15 | annotations: 16 | description: | 17 | Backup duration on {{ $labels.instance }} is higher than the average duration over the past 48 hours. This could indicate performance issues or network congestion. The current value is {{ $value | printf "%.2f" }} seconds. 18 | summary: Velero backups taking longer than usual. 19 | expr: | 20 | histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[5m])) by (le, schedule)) > 1.2 * 1.2 * avg_over_time(histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[48h])) by (le, schedule))[5m:]) 21 | for: 5m 22 | labels: 23 | severity: warning 24 | - alert: VeleroHighRestoreFailureRate 25 | annotations: 26 | description: | 27 | Restore failures detected on {{ $labels.instance }}. 
This could prevent timely data recovery and disrupt business continuity. 28 | summary: Velero restore failures detected. 29 | expr: | 30 | increase(velero_restore_failed_total{job="integrations/velero"}[5m]) > 0 31 | for: 5m 32 | labels: 33 | severity: critical 34 | - alert: VeleroUpStatus 35 | annotations: 36 | description: "Cannot find any metrics related to Velero on {{ $labels.instance 37 | }}. This may indicate further issues with Velero or the scraping agent. \n" 38 | summary: Velero is down. 39 | expr: | 40 | up{job="integrations/velero"} == 0 41 | for: 5m 42 | labels: 43 | severity: critical 44 | -------------------------------------------------------------------------------- /assets/velero/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/wildfly/alerts.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: wildfly 3 | rules: 4 | - alert: HighPercentageOfErrorResponses 5 | annotations: 6 | description: | 7 | The percentage of error responses is {{ printf "%.2f" $value }} on {{ $labels.instance }} - {{ $labels.server }}, which is higher than the threshold of 30. 8 | summary: Large percentage of requests are resulting in 5XX responses. 9 | expr: | 10 | sum by (job, instance, server) (increase(wildfly_undertow_error_count_total{}[5m]) / increase(wildfly_undertow_request_count_total{}[5m])) * 100 > 30 11 | for: 5m 12 | labels: 13 | severity: critical 14 | - alert: HighNumberOfRejectedSessionsForDeployment 15 | annotations: 16 | description: | 17 | Deployment {{ $labels.deployment }} on {{ $labels.instance }} is exceeding the threshold for rejected sessions: {{ printf "%.0f" $value }} is higher than 20. 18 | summary: Large number of sessions are being rejected for a deployment. 
19 | expr: | 20 | sum by (deployment, instance, job) (increase(wildfly_undertow_rejected_sessions_total{}[5m])) > 20 21 | for: 5m 22 | labels: 23 | severity: critical 24 | -------------------------------------------------------------------------------- /assets/wildfly/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/windows-active-directory/rules.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /assets/windows/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/wso2-enterprise-integrator/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/wso2-enterprise-integrator/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/wso2-streaming-integrator/alerts.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /assets/wso2-streaming-integrator/rules.yaml: -------------------------------------------------------------------------------- 1 | null 2 | -------------------------------------------------------------------------------- /hack/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/monitoring-mixins/website/hack 2 | 3 | go 1.23 4 | 5 | require ( 6 | github.com/brancz/gojsontoyaml v0.0.0-20191212081931-bf2969bbd742 7 | github.com/google/go-jsonnet v0.20.0 8 | github.com/jsonnet-bundler/jsonnet-bundler v0.6.0 9 | ) 10 | 11 | require ( 12 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect 13 | github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect 14 | github.com/fatih/color v1.13.0 // indirect 15 | github.com/ghodss/yaml v1.0.0 // indirect 16 | github.com/mattn/go-colorable v0.1.12 // indirect 17 | github.com/mattn/go-isatty v0.0.14 // indirect 18 | github.com/pkg/errors v0.9.1 // indirect 19 | golang.org/x/sys v0.1.0 // indirect 20 | gopkg.in/alecthomas/kingpin.v2 v2.2.6 // indirect 21 | gopkg.in/yaml.v2 v2.2.7 // indirect 22 | sigs.k8s.io/yaml v1.1.0 // indirect 23 | ) 24 | -------------------------------------------------------------------------------- /hack/tools.go: -------------------------------------------------------------------------------- 1 | //+build tools 2 | 3 | // Package tools tracks dependencies for tools that are used in the build process. 
4 | // See https://github.com/golang/go/wiki/Modules 5 | package hack 6 | 7 | import ( 8 | _ "github.com/brancz/gojsontoyaml" 9 | _ "github.com/google/go-jsonnet/cmd/jsonnet" 10 | _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb" 11 | ) 12 | -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | base = "site/" 3 | publish = "public" 4 | command = "hugo --gc --minify" 5 | 6 | [context.production.environment] 7 | HUGO_VERSION = "0.70.0" 8 | HUGO_ENV = "production" 9 | HUGO_ENABLEGITINFO = "true" 10 | 11 | [context.split1] 12 | command = "hugo --gc --minify --enableGitInfo" 13 | 14 | [context.split1.environment] 15 | HUGO_VERSION = "0.70.0" 16 | HUGO_ENV = "production" 17 | 18 | [context.deploy-preview] 19 | command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL" 20 | 21 | [context.deploy-preview.environment] 22 | HUGO_VERSION = "0.70.0" 23 | 24 | [context.branch-deploy] 25 | command = "hugo --gc --minify -b $DEPLOY_PRIME_URL" 26 | 27 | [context.branch-deploy.environment] 28 | HUGO_VERSION = "0.70.0" 29 | 30 | [context.next.environment] 31 | HUGO_ENABLEGITINFO = "true" 32 | -------------------------------------------------------------------------------- /site/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | baseURL: "https://monitoring.mixins.dev/" 3 | # baseURL: "" 4 | languageCode: "en-us" 5 | title: "Monitoring Mixins" 6 | 7 | theme: 'ace-documentation' 8 | 9 | # Google analytics 10 | # googleAnalytics: UA-123456789-1 11 | 12 | permalinks: 13 | post: /:filename/ 14 | 15 | params: 16 | project_name: Monitoring Mixins 17 | 18 | project_tagline: Combination of alerts, recording rules, and dashboards for Prometheus exporters 19 | 20 | disableSearch: true 21 | disableReadmoreNav: true 22 | 23 | markup: 24 | highlight: 25 | style: monokailight 26 | 27 | menu: 28 | shortcuts: 29 | - name: Homepage 30 | url: / 31 | weight: 1 32 | - name: About mixins 33 | url: "https://github.com/monitoring-mixins/docs" 34 | weight: 2 35 | - name: "GitHub" 36 | url: "https://github.com/monitoring-mixins/website" 37 | weight: 3 38 | 39 | -------------------------------------------------------------------------------- /site/content/MSSQL/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MSSQL 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/mssql-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/MSSQL/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### MSSQLAlerts 20 | 21 | ##### MSSQLHighNumberOfDeadlocks 22 | 23 | {{< code lang="yaml" >}} 24 | alert: MSSQLHighNumberOfDeadlocks 25 | annotations: 26 | description: '{{ printf "%.2f" $value }} deadlocks have occurred over the last 5 27 | minutes on {{$labels.instance}}, which is above threshold of 10 deadlocks.' 28 | summary: There are deadlocks occurring in the database. 
29 | expr: | 30 | increase(mssql_deadlocks_total{}[5m]) > 10 31 | for: 5m 32 | labels: 33 | severity: warning 34 | {{< /code >}} 35 | 36 | ##### MSSQLModerateReadStallTime 37 | 38 | {{< code lang="yaml" >}} 39 | alert: MSSQLModerateReadStallTime 40 | annotations: 41 | description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on {{$labels.instance}}, 42 | which is above threshold of 200ms.' 43 | summary: There is a moderate amount of IO stall for database reads. 44 | expr: | 45 | 1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 200 46 | for: 5m 47 | labels: 48 | severity: warning 49 | {{< /code >}} 50 | 51 | ##### MSSQLHighReadStallTime 52 | 53 | {{< code lang="yaml" >}} 54 | alert: MSSQLHighReadStallTime 55 | annotations: 56 | description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on {{$labels.instance}}, 57 | which is above threshold of 400ms.' 58 | summary: There is a high amount of IO stall for database reads. 59 | expr: | 60 | 1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 400 61 | for: 5m 62 | labels: 63 | severity: critical 64 | {{< /code >}} 65 | 66 | ##### MSSQLModerateWriteStallTime 67 | 68 | {{< code lang="yaml" >}} 69 | alert: MSSQLModerateWriteStallTime 70 | annotations: 71 | description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on {{$labels.instance}}, 72 | which is above threshold of 200ms.' 73 | summary: There is a moderate amount of IO stall for database writes. 74 | expr: | 75 | 1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 200 76 | for: 5m 77 | labels: 78 | severity: warning 79 | {{< /code >}} 80 | 81 | ##### MSSQLHighWriteStallTime 82 | 83 | {{< code lang="yaml" >}} 84 | alert: MSSQLHighWriteStallTime 85 | annotations: 86 | description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on {{$labels.instance}}, 87 | which is above threshold of 400ms.' 88 | summary: There is a high amount of IO stall for database writes. 89 | expr: | 90 | 1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 400 91 | for: 5m 92 | labels: 93 | severity: critical 94 | {{< /code >}} 95 | 96 | ## Dashboards 97 | Following dashboards are generated from mixins and hosted on github: 98 | 99 | 100 | - [mssql-overview](https://github.com/monitoring-mixins/website/blob/master/assets/MSSQL/dashboards/mssql-overview.json) 101 | - [mssql-pages](https://github.com/monitoring-mixins/website/blob/master/assets/MSSQL/dashboards/mssql-pages.json) 102 | -------------------------------------------------------------------------------- /site/content/apache-airflow/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: apache-airflow 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-airflow-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-airflow/alerts.yaml). 
17 | {{< /panel >}} 18 | 19 | ### apache-airflow 20 | 21 | ##### ApacheAirflowStarvingPoolTasks 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ApacheAirflowStarvingPoolTasks 25 | annotations: 26 | description: | 27 | The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0. 28 | summary: There are starved tasks detected in the Apache Airflow pool. 29 | expr: | 30 | airflow_pool_starving_tasks > 0 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### ApacheAirflowDAGScheduleDelayWarningLevel 37 | 38 | {{< code lang="yaml" >}} 39 | alert: ApacheAirflowDAGScheduleDelayWarningLevel 40 | annotations: 41 | description: | 42 | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10. 43 | summary: The delay in DAG schedule time to DAG run time has reached the warning 44 | threshold. 45 | expr: | 46 | increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10 47 | for: 1m 48 | labels: 49 | severity: warning 50 | {{< /code >}} 51 | 52 | ##### ApacheAirflowDAGScheduleDelayCriticalLevel 53 | 54 | {{< code lang="yaml" >}} 55 | alert: ApacheAirflowDAGScheduleDelayCriticalLevel 56 | annotations: 57 | description: | 58 | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60. 59 | summary: The delay in DAG schedule time to DAG run time has reached the critical 60 | threshold. 61 | expr: | 62 | increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60 63 | for: 1m 64 | labels: 65 | severity: critical 66 | {{< /code >}} 67 | 68 | ##### ApacheAirflowDAGFailures 69 | 70 | {{< code lang="yaml" >}} 71 | alert: ApacheAirflowDAGFailures 72 | annotations: 73 | description: | 74 | The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0. 75 | summary: There have been DAG failures detected. 
76 | expr: | 77 | increase(airflow_dagrun_duration_failed_count[5m]) > 0 78 | for: 1m 79 | labels: 80 | severity: critical 81 | {{< /code >}} 82 | 83 | ## Dashboards 84 | Following dashboards are generated from mixins and hosted on github: 85 | 86 | 87 | - [apache-airflow-overview](https://github.com/monitoring-mixins/website/blob/master/assets/apache-airflow/dashboards/apache-airflow-overview.json) 88 | -------------------------------------------------------------------------------- /site/content/apache-camel/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: apache-camel 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-camel-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [apache-camel-micrometer](https://github.com/monitoring-mixins/website/blob/master/assets/apache-camel/dashboards/apache-camel-micrometer.json) 18 | -------------------------------------------------------------------------------- /site/content/apache-http/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: apache-http 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-http-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-http/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### apache-http 20 | 21 | ##### ApacheDown 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ApacheDown 25 | annotations: 26 | description: Apache is down on {{ $labels.instance }}. 27 | summary: Apache is down. 28 | expr: apache_up == 0 29 | for: 5m 30 | labels: 31 | severity: warning 32 | {{< /code >}} 33 | 34 | ##### ApacheRestart 35 | 36 | {{< code lang="yaml" >}} 37 | alert: ApacheRestart 38 | annotations: 39 | description: Apache has just been restarted on {{ $labels.instance }}. 40 | summary: Apache restart. 41 | expr: apache_uptime_seconds_total / 60 < 1 42 | for: "0" 43 | labels: 44 | severity: info 45 | {{< /code >}} 46 | 47 | ##### ApacheWorkersLoad 48 | 49 | {{< code lang="yaml" >}} 50 | alert: ApacheWorkersLoad 51 | annotations: 52 | description: | 53 | Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}. 54 | The current value is {{ $value }}%. 55 | summary: Apache workers load is too high. 56 | expr: | 57 | (sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 58 | for: 15m 59 | labels: 60 | severity: warning 61 | {{< /code >}} 62 | 63 | ##### ApacheResponseTimeTooHigh 64 | 65 | {{< code lang="yaml" >}} 66 | alert: ApacheResponseTimeTooHigh 67 | annotations: 68 | description: | 69 | Apache average response time is above the threshold of 5000 ms on {{ $labels.instance }}. 70 | The current value is {{ $value }} ms. 71 | summary: Apache response time is too high. 
72 | expr: | 73 | increase(apache_duration_ms_total[5m])/increase(apache_accesses_total[5m]) > 5000 74 | for: 15m 75 | labels: 76 | severity: warning 77 | {{< /code >}} 78 | 79 | ## Dashboards 80 | Following dashboards are generated from mixins and hosted on github: 81 | 82 | 83 | - [apache-http](https://github.com/monitoring-mixins/website/blob/master/assets/apache-http/dashboards/apache-http.json) 84 | -------------------------------------------------------------------------------- /site/content/apache-mesos/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: apache-mesos 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-mesos-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-mesos/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### apache-mesos 20 | 21 | ##### ApacheMesosHighMemoryUsage 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ApacheMesosHighMemoryUsage 25 | annotations: 26 | description: '{{ printf "%.0f" $value }} percent memory usage on {{$labels.mesos_cluster}}, 27 | which is above the threshold of 90.' 28 | summary: There is a high memory usage for the cluster. 29 | expr: | 30 | min without(instance, job, type) (mesos_master_mem{type="percent"}) > 90 31 | for: 5m 32 | labels: 33 | severity: warning 34 | {{< /code >}} 35 | 36 | ##### ApacheMesosHighDiskUsage 37 | 38 | {{< code lang="yaml" >}} 39 | alert: ApacheMesosHighDiskUsage 40 | annotations: 41 | description: '{{ printf "%.0f" $value }} percent disk usage on {{$labels.mesos_cluster}}, 42 | which is above the threshold of 90.' 43 | summary: There is a high disk usage for the cluster. 44 | expr: | 45 | min without(instance, job, type) (mesos_master_disk{type="percent"}) > 90 46 | for: 5m 47 | labels: 48 | severity: critical 49 | {{< /code >}} 50 | 51 | ##### ApacheMesosUnreachableTasks 52 | 53 | {{< code lang="yaml" >}} 54 | alert: ApacheMesosUnreachableTasks 55 | annotations: 56 | description: '{{ printf "%.0f" $value }} unreachable tasks on {{$labels.mesos_cluster}}, 57 | which is above the threshold of 3.' 58 | summary: There are an unusually high number of unreachable tasks. 59 | expr: | 60 | max without(instance, job, state) (mesos_master_task_states_current{state="unreachable"}) > 3 61 | for: 5m 62 | labels: 63 | severity: warning 64 | {{< /code >}} 65 | 66 | ##### ApacheMesosNoLeaderElected 67 | 68 | {{< code lang="yaml" >}} 69 | alert: ApacheMesosNoLeaderElected 70 | annotations: 71 | description: There is no cluster coordinator on {{$labels.mesos_cluster}}. 72 | summary: There is currently no cluster coordinator. 73 | expr: | 74 | max without(instance, job) (mesos_master_elected) == 0 75 | for: 1m 76 | labels: 77 | severity: critical 78 | {{< /code >}} 79 | 80 | ##### ApacheMesosInactiveAgents 81 | 82 | {{< code lang="yaml" >}} 83 | alert: ApacheMesosInactiveAgents 84 | annotations: 85 | description: '{{ printf "%.0f" $value }} inactive agent clients over the last 5m 86 | which is above the threshold of 1.' 87 | summary: There are currently inactive agent clients. 
88 | expr: | 89 | max without(instance, job, state) (mesos_master_slaves_state{state=~"connected_inactive|disconnected_inactive"}) > 1 90 | for: 5m 91 | labels: 92 | severity: warning 93 | {{< /code >}} 94 | 95 | ## Dashboards 96 | Following dashboards are generated from mixins and hosted on github: 97 | 98 | 99 | - [apache-mesos-overview](https://github.com/monitoring-mixins/website/blob/master/assets/apache-mesos/dashboards/apache-mesos-overview.json) 100 | -------------------------------------------------------------------------------- /site/content/apache-tomcat/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: apache-tomcat 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-tomcat-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-tomcat/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### ApacheTomcatAlerts 20 | 21 | ##### ApacheTomcatAlertsHighCpuUsage 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ApacheTomcatAlertsHighCpuUsage 25 | annotations: 26 | description: The CPU usage has been at {{ printf "%.0f" $value }} percent over the 27 | last 5 minutes on {{$labels.instance}}, which is above the threshold of 80 percent. 28 | summary: The instance has a CPU usage higher than the configured threshold. 29 | expr: | 30 | sum by (job,instance) (jvm_process_cpu_load{job="integrations/tomcat"}) > 80 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### ApacheTomcatAlertsHighMemoryUsage 37 | 38 | {{< code lang="yaml" >}} 39 | alert: ApacheTomcatAlertsHighMemoryUsage 40 | annotations: 41 | description: The memory usage has been at {{ printf "%.0f" $value }} percent over 42 | the last 5 minutes on {{$labels.instance}}, which is above the threshold of 80 43 | percent. 44 | summary: The instance has a higher memory usage than the configured threshold. 45 | expr: | 46 | sum(jvm_memory_usage_used_bytes{job="integrations/tomcat"}) by (job,instance) / sum(jvm_physical_memory_bytes{job="integrations/tomcat"}) by (job,instance) * 100 > 80 47 | for: 5m 48 | labels: 49 | severity: critical 50 | {{< /code >}} 51 | 52 | ##### ApacheTomcatAlertsHighRequestErrorPercent 53 | 54 | {{< code lang="yaml" >}} 55 | alert: ApacheTomcatAlertsHighRequestErrorPercent 56 | annotations: 57 | description: The percentage of request errors has been at {{ printf "%.0f" $value 58 | }} percent over the last 5 minutes on {{$labels.instance}}, which is above the 59 | threshold of 5 percent. 60 | summary: There are a high number of request errors. 61 | expr: | 62 | sum by (job,instance) (increase(tomcat_errorcount_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m]) * 100) > 5 63 | for: 5m 64 | labels: 65 | severity: critical 66 | {{< /code >}} 67 | 68 | ##### ApacheTomcatAlertsModeratelyHighProcessingTime 69 | 70 | {{< code lang="yaml" >}} 71 | alert: ApacheTomcatAlertsModeratelyHighProcessingTime 72 | annotations: 73 | description: The processing time has been at {{ printf "%.0f" $value }}ms over the 74 | last 5 minutes on {{$labels.instance}}, which is above the threshold of 300ms. 75 | summary: The processing time has been moderately high. 
76 | expr: | 77 | sum by (job,instance) (increase(tomcat_processingtime_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m])) > 300 78 | for: 5m 79 | labels: 80 | severity: warning 81 | {{< /code >}} 82 | 83 | ## Dashboards 84 | Following dashboards are generated from mixins and hosted on github: 85 | 86 | 87 | - [apache-tomcat-hosts](https://github.com/monitoring-mixins/website/blob/master/assets/apache-tomcat/dashboards/apache-tomcat-hosts.json) 88 | - [apache-tomcat-overview](https://github.com/monitoring-mixins/website/blob/master/assets/apache-tomcat/dashboards/apache-tomcat-overview.json) 89 | -------------------------------------------------------------------------------- /site/content/argocd/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: argocd 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/argocd-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/argocd/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### ArgoCD 20 | 21 | ##### ArgoAppOutOfSync 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ArgoAppOutOfSync 25 | annotations: 26 | description: Application {{ $labels.name }} has sync status as {{ $labels.sync_status 27 | }}. 28 | summary: Application is OutOfSync. 29 | expr: argocd_app_info{sync_status="OutOfSync"} == 1 30 | for: 1m 31 | labels: 32 | severity: warning 33 | {{< /code >}} 34 | 35 | ##### ArgoAppSyncFailed 36 | 37 | {{< code lang="yaml" >}} 38 | alert: ArgoAppSyncFailed 39 | annotations: 40 | description: Application {{ $labels.name }} has sync phase as {{ $labels.phase }}. 41 | summary: Application Sync Failed. 42 | expr: argocd_app_sync_total{phase!="Succeeded"} == 1 43 | for: 1m 44 | labels: 45 | severity: warning 46 | {{< /code >}} 47 | 48 | ##### ArgoAppMissing 49 | 50 | {{< code lang="yaml" >}} 51 | alert: ArgoAppMissing 52 | annotations: 53 | description: "ArgoCD has not reported any applications data for the past 15 minutes 54 | which means that it must be down or not functioning properly. 55 | " 56 | summary: No reported applications in ArgoCD. 57 | expr: absent(argocd_app_info) 58 | for: 15m 59 | labels: 60 | severity: critical 61 | {{< /code >}} 62 | 63 | ## Dashboards 64 | Following dashboards are generated from mixins and hosted on github: 65 | 66 | 67 | - [argocd-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argocd/dashboards/argocd-overview.json) 68 | -------------------------------------------------------------------------------- /site/content/asterisk/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: asterisk 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/asterisk-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/asterisk/alerts.yaml). 
17 | {{< /panel >}} 18 | 19 | ### AsteriskAlerts 20 | 21 | ##### AsteriskRestarted 22 | 23 | {{< code lang="yaml" >}} 24 | alert: AsteriskRestarted 25 | annotations: 26 | description: |- 27 | Asterisk instance restarted in the last minute 28 | VALUE = {{ $value }} 29 | LABELS = {{ $labels }} 30 | summary: Asterisk instance restarted in the last minute. 31 | expr: asterisk_core_uptime_seconds < 60 32 | for: 5s 33 | labels: 34 | severity: critical 35 | {{< /code >}} 36 | 37 | ##### AsteriskReloaded 38 | 39 | {{< code lang="yaml" >}} 40 | alert: AsteriskReloaded 41 | annotations: 42 | description: |- 43 | Asterisk instance reloaded in the last minute 44 | VALUE = {{ $value }} 45 | LABELS = {{ $labels }} 46 | summary: Asterisk instance reloaded in the last minute. 47 | expr: asterisk_core_last_reload_seconds < 60 48 | for: 5s 49 | labels: 50 | severity: warning 51 | {{< /code >}} 52 | 53 | ##### AsteriskHighScrapeTime 54 | 55 | {{< code lang="yaml" >}} 56 | alert: AsteriskHighScrapeTime 57 | annotations: 58 | description: |- 59 | Asterisk instance core high scrape time (Possible system performance degradation) 60 | VALUE = {{ $value }} 61 | LABELS = {{ $labels }} 62 | summary: Asterisk instance core high scrape time. 63 | expr: asterisk_core_scrape_time_ms > 100 64 | for: 10s 65 | labels: 66 | severity: critical 67 | {{< /code >}} 68 | 69 | ##### AsteriskHighActiveCallsCount 70 | 71 | {{< code lang="yaml" >}} 72 | alert: AsteriskHighActiveCallsCount 73 | annotations: 74 | description: |- 75 | Asterisk high active call count 76 | VALUE = {{ $value }} 77 | LABELS = {{ $labels }} 78 | summary: Asterisk high active call count. 79 | expr: asterisk_calls_count > 100 80 | for: 10s 81 | labels: 82 | severity: warning 83 | {{< /code >}} 84 | 85 | ## Dashboards 86 | Following dashboards are generated from mixins and hosted on github: 87 | 88 | 89 | - [asterisk-logs](https://github.com/monitoring-mixins/website/blob/master/assets/asterisk/dashboards/asterisk-logs.json) 90 | - [asterisk-overview](https://github.com/monitoring-mixins/website/blob/master/assets/asterisk/dashboards/asterisk-overview.json) 91 | -------------------------------------------------------------------------------- /site/content/awx/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: awx 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/awx-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [awx](https://github.com/monitoring-mixins/website/blob/master/assets/awx/dashboards/awx.json) 18 | -------------------------------------------------------------------------------- /site/content/blackbox_exporter/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: blackbox_exporter 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/adinhodovic/blackbox-exporter-mixin/](https://github.com/adinhodovic/blackbox-exporter-mixin/) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/blackbox_exporter/alerts.yaml). 
17 | {{< /panel >}} 18 | 19 | ### blackbox-exporter.rules 20 | 21 | ##### BlackboxProbeFailed 22 | 23 | {{< code lang="yaml" >}} 24 | alert: BlackboxProbeFailed 25 | annotations: 26 | dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{ 27 | $labels.instance }} 28 | description: The probe failed for the instance {{ $labels.instance }}. 29 | summary: Probe has failed for the past 1m interval. 30 | expr: | 31 | probe_success{job="blackbox-exporter"} == 0 32 | for: 1m 33 | labels: 34 | severity: critical 35 | {{< /code >}} 36 | 37 | ##### BlackboxLowUptime30d 38 | 39 | {{< code lang="yaml" >}} 40 | alert: BlackboxLowUptime30d 41 | annotations: 42 | dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{ 43 | $labels.instance }} 44 | description: The probe has a lower uptime than 99.9% the last 30 days for the instance 45 | {{ $labels.instance }}. 46 | summary: Probe uptime is lower than 99.9% for the last 30 days. 47 | expr: | 48 | avg_over_time(probe_success{job="blackbox-exporter"}[30d]) * 100 < 99.900000000000006 49 | labels: 50 | severity: info 51 | {{< /code >}} 52 | 53 | ##### BlackboxSslCertificateWillExpireSoon 54 | 55 | {{< code lang="yaml" >}} 56 | alert: BlackboxSslCertificateWillExpireSoon 57 | annotations: 58 | dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{ 59 | $labels.instance }} 60 | description: | 61 | The SSL certificate of the instance {{ $labels.instance }} is expiring within 21 days. 62 | Actual time left: {{ $value | humanizeDuration }}. 63 | summary: SSL certificate will expire soon. 64 | expr: | 65 | probe_ssl_earliest_cert_expiry{job="blackbox-exporter"} - time() < 21 * 24 * 3600 66 | labels: 67 | severity: warning 68 | {{< /code >}} 69 | 70 | ## Dashboards 71 | Following dashboards are generated from mixins and hosted on github: 72 | 73 | 74 | - [blackbox-exporter](https://github.com/monitoring-mixins/website/blob/master/assets/blackbox_exporter/dashboards/blackbox-exporter.json) 75 | -------------------------------------------------------------------------------- /site/content/caddy/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: caddy 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/caddy-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [caddy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/caddy/dashboards/caddy-overview.json) 18 | -------------------------------------------------------------------------------- /site/content/clickhouse/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: clickhouse 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/clickhouse-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/alerts.yaml). 
17 | {{< /panel >}} 18 | 19 | ### ClickHouseAlerts 20 | 21 | ##### ClickHouseReplicationQueueBackingUp 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ClickHouseReplicationQueueBackingUp 25 | annotations: 26 | description: | 27 | ClickHouse replication tasks are processing slower than expected on {{ $labels.instance }}, causing the replication queue size to back up at {{ $value }}, exceeding the threshold value of 99. 28 | summary: ClickHouse replica max queue size backing up. 29 | expr: | 30 | ClickHouseAsyncMetrics_ReplicasMaxQueueSize > 99 31 | for: 5m 32 | keep_firing_for: 5m 33 | labels: 34 | severity: warning 35 | {{< /code >}} 36 | 37 | ##### ClickHouseRejectedInserts 38 | 39 | {{< code lang="yaml" >}} 40 | alert: ClickHouseRejectedInserts 41 | annotations: 42 | description: ClickHouse inserts are being rejected on {{ $labels.instance }} as 43 | items are being inserted faster than ClickHouse is able to merge them. 44 | summary: ClickHouse has too many rejected inserts. 45 | expr: ClickHouseProfileEvents_RejectedInserts > 1 46 | for: 5m 47 | keep_firing_for: 5m 48 | labels: 49 | severity: critical 50 | {{< /code >}} 51 | 52 | ##### ClickHouseZookeeperSessions 53 | 54 | {{< code lang="yaml" >}} 55 | alert: ClickHouseZookeeperSessions 56 | annotations: 57 | description: | 58 | ClickHouse has more than one connection to a Zookeeper on {{ $labels.instance }}, which can lead to bugs due to stale reads in Zookeeper's consistency model. 59 | summary: ClickHouse has too many Zookeeper sessions. 60 | expr: ClickHouseMetrics_ZooKeeperSession > 1 61 | for: 5m 62 | keep_firing_for: 5m 63 | labels: 64 | severity: critical 65 | {{< /code >}} 66 | 67 | ##### ClickHouseReplicasInReadOnly 68 | 69 | {{< code lang="yaml" >}} 70 | alert: ClickHouseReplicasInReadOnly 71 | annotations: 72 | description: | 73 | ClickHouse has replicas in a read only state on {{ $labels.instance }} after losing connection to Zookeeper or at startup. 74 | summary: ClickHouse has too many replicas in read only state. 
75 | expr: ClickHouseMetrics_ReadonlyReplica > 0 76 | for: 5m 77 | keep_firing_for: 5m 78 | labels: 79 | severity: critical 80 | {{< /code >}} 81 | 82 | ## Dashboards 83 | Following dashboards are generated from mixins and hosted on github: 84 | 85 | 86 | - [clickhouse-latency](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-latency.json) 87 | - [clickhouse-logs](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-logs.json) 88 | - [clickhouse-overview](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-overview.json) 89 | - [clickhouse-replica](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-replica.json) 90 | -------------------------------------------------------------------------------- /site/content/confluent-kafka/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: confluent-kafka 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/confluent-kafka-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [confluent-kafka-overview](https://github.com/monitoring-mixins/website/blob/master/assets/confluent-kafka/dashboards/confluent-kafka-overview.json) 18 | -------------------------------------------------------------------------------- /site/content/consul/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: consul 3 | --- 4 | 5 | ## Overview 6 | 7 | Grafana dashboards and Prometheus alerts for operating Consul, in the form of a monitoring mixin. 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/consul-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/consul/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### consul 20 | 21 | ##### ConsulUp 22 | 23 | {{< code lang="yaml" >}} 24 | alert: ConsulUp 25 | annotations: 26 | description: Consul '{{ $labels.job }}' is not up. 27 | summary: Consul is not up. 28 | expr: | 29 | consul_up != 1 30 | for: 1m 31 | labels: 32 | severity: critical 33 | {{< /code >}} 34 | 35 | ##### ConsulMaster 36 | 37 | {{< code lang="yaml" >}} 38 | alert: ConsulMaster 39 | annotations: 40 | description: Consul '{{ $labels.job }}' has no master. 41 | summary: Consul has no master. 42 | expr: | 43 | consul_raft_leader != 1 44 | for: 1m 45 | labels: 46 | severity: critical 47 | {{< /code >}} 48 | 49 | ##### ConsulPeers 50 | 51 | {{< code lang="yaml" >}} 52 | alert: ConsulPeers 53 | annotations: 54 | description: Consul '{{ $labels.job }}' does not have 3 peers. 55 | summary: Consul does not have peers. 
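# Assumes an expected cluster size of 3 raft peers; adjust the comparison if your Consul server cluster is a different size.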
56 | expr: | 57 | consul_raft_peers != 3 58 | for: 10m 59 | labels: 60 | severity: critical 61 | {{< /code >}} 62 | 63 | ## Dashboards 64 | Following dashboards are generated from mixins and hosted on github: 65 | 66 | 67 | - [consul-overview](https://github.com/monitoring-mixins/website/blob/master/assets/consul/dashboards/consul-overview.json) 68 | -------------------------------------------------------------------------------- /site/content/couchbase/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: couchbase 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/couchbase-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### couchbase 20 | 21 | ##### CouchbaseHighCPUUsage 22 | 23 | {{< code lang="yaml" >}} 24 | alert: CouchbaseHighCPUUsage 25 | annotations: 26 | description: '{{ printf "%.0f" $value }} percent CPU usage on node {{$labels.instance}} 27 | and on cluster {{$labels.couchbase_cluster}}, which is above the threshold of 28 | 85.' 29 | summary: The node CPU usage has exceeded the critical threshold. 30 | expr: | 31 | (sys_cpu_utilization_rate) > 85 32 | for: 5m 33 | labels: 34 | severity: critical 35 | {{< /code >}} 36 | 37 | ##### CouchbaseHighMemoryUsage 38 | 39 | {{< code lang="yaml" >}} 40 | alert: CouchbaseHighMemoryUsage 41 | annotations: 42 | description: '{{ printf "%.0f" $value }} percent memory usage on node {{$labels.instance}} 43 | and on cluster {{$labels.couchbase_cluster}}, which is above the threshold of 44 | 85.' 45 | summary: There is a limited amount of memory available for a node. 46 | expr: | 47 | 100 * (sys_mem_actual_used / clamp_min(sys_mem_actual_used + sys_mem_actual_free, 1)) > 85 48 | for: 5m 49 | labels: 50 | severity: critical 51 | {{< /code >}} 52 | 53 | ##### CouchbaseMemoryEvictionRate 54 | 55 | {{< code lang="yaml" >}} 56 | alert: CouchbaseMemoryEvictionRate 57 | annotations: 58 | description: '{{ printf "%.0f" $value }} evictions in bucket {{$labels.bucket}}, 59 | on node {{$labels.instance}}, and on cluster {{$labels.couchbase_cluster}}, which 60 | is above the threshold of 10.' 61 | summary: There is a spike in evictions in a bucket, which indicates high memory 62 | pressure. 63 | expr: | 64 | (kv_ep_num_value_ejects) > 10 65 | for: 5m 66 | labels: 67 | severity: warning 68 | {{< /code >}} 69 | 70 | ##### CouchbaseInvalidRequestVolume 71 | 72 | {{< code lang="yaml" >}} 73 | alert: CouchbaseInvalidRequestVolume 74 | annotations: 75 | description: '{{ printf "%.0f" $value }} invalid requests to {{$labels.couchbase_cluster}}, 76 | which is above the threshold of 1000.' 77 | summary: There is a high volume of incoming invalid requests, which may indicate 78 | a DOS or injection attack. 
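# Rate of invalid N1QL requests aggregated across instances over a 2-minute window.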
79 | expr: | 80 | sum without(instance, job) (rate(n1ql_invalid_requests[2m])) > 1000 81 | for: 2m 82 | labels: 83 | severity: warning 84 | {{< /code >}} 85 | 86 | ## Dashboards 87 | Following dashboards are generated from mixins and hosted on github: 88 | 89 | 90 | - [couchbase-bucket-overview](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/dashboards/couchbase-bucket-overview.json) 91 | - [couchbase-cluster-overview](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/dashboards/couchbase-cluster-overview.json) 92 | - [couchbase-node-overview](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/dashboards/couchbase-node-overview.json) 93 | -------------------------------------------------------------------------------- /site/content/discourse/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: discourse 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/discourse-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/discourse/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### DiscourseAlerts 20 | 21 | ##### DiscourseRequestsHigh5xxErrors 22 | 23 | {{< code lang="yaml" >}} 24 | alert: DiscourseRequestsHigh5xxErrors 25 | annotations: 26 | description: '{{ printf "%.2f" $value }}% of all requests are resulting in 500 status 27 | codes, which is above the threshold 10%, indicating a potentially larger issue 28 | for {{$labels.instance}}' 29 | summary: More than 10% of all requests result in a 5XX. 30 | expr: | 31 | 100 * rate(discourse_http_requests{status="500"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 10 32 | for: 5m 33 | labels: 34 | severity: critical 35 | {{< /code >}} 36 | 37 | ##### DiscourseRequestsHigh4xxErrors 38 | 39 | {{< code lang="yaml" >}} 40 | alert: DiscourseRequestsHigh4xxErrors 41 | annotations: 42 | description: '{{ printf "%.2f" $value }}% of all requests are resulting in 400 status 43 | code, which is above the threshold 30%, indicating a potentially larger issue 44 | for {{$labels.instance}}' 45 | summary: More than 30% of all requests result in a 4XX. 46 | expr: | 47 | 100 * rate(discourse_http_requests{status=~"^4.*"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 30 48 | for: 5m 49 | labels: 50 | severity: warning 51 | {{< /code >}} 52 | 53 | ## Dashboards 54 | Following dashboards are generated from mixins and hosted on github: 55 | 56 | 57 | - [discourse-jobs](https://github.com/monitoring-mixins/website/blob/master/assets/discourse/dashboards/discourse-jobs.json) 58 | - [discourse-overview](https://github.com/monitoring-mixins/website/blob/master/assets/discourse/dashboards/discourse-overview.json) 59 | -------------------------------------------------------------------------------- /site/content/docker/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: docker 3 | --- 4 | 5 | ## Overview 6 | 7 | A set of Grafana dashboards for Docker (based on cadvisor). 
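These dashboards visualize container metrics exposed by cAdvisor and scraped by Prometheus. A minimal scrape-configuration sketch is shown below; the job name and target address are illustrative assumptions, not values mandated by the mixin.

{{< code lang="yaml" >}}
scrape_configs:
  - job_name: cadvisor  # hypothetical job name; align it with your own label conventions
    static_configs:
      - targets: ["cadvisor.example.internal:8080"]  # assumed cAdvisor endpoint (cAdvisor listens on 8080 by default)
{{< /code >}}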
8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/docker-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [docker](https://github.com/monitoring-mixins/website/blob/master/assets/docker/dashboards/docker.json) 18 | -------------------------------------------------------------------------------- /site/content/elasticsearch/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: elasticsearch 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/elasticsearch-mixin) 11 | {{< /panel >}} 12 | 13 | -------------------------------------------------------------------------------- /site/content/envoy/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: envoy 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/envoy-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [envoy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/envoy/dashboards/envoy-overview.json) 18 | -------------------------------------------------------------------------------- /site/content/gitea/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: gitea 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/go-gitea/gitea](https://github.com/go-gitea/gitea/tree/master/contrib/gitea-monitoring-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [gitea-overview](https://github.com/monitoring-mixins/website/blob/master/assets/gitea/dashboards/gitea-overview.json) 18 | -------------------------------------------------------------------------------- /site/content/gitlab/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: gitlab 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/gitlab-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/gitlab/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### GitLabAlerts 20 | 21 | ##### GitLabHighJobRegistrationFailures 22 | 23 | {{< code lang="yaml" >}} 24 | alert: GitLabHighJobRegistrationFailures 25 | annotations: 26 | description: '{{ printf "%.2f" $value }}% of job registrations have failed on {{$labels.instance}}, 27 | which is above the threshold of 10%.' 28 | summary: Large percentage of failed attempts to register a job.
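# Failed job registrations as a percentage of all registration attempts over 5 minutes.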
29 | expr: "100 * rate(job_register_attempts_failed_total{}[5m]) / rate(job_register_attempts_total{}[5m]) 30 | 31 | > 10 32 | " 33 | for: 5m 34 | labels: 35 | severity: warning 36 | {{< /code >}} 37 | 38 | ##### GitLabHighRunnerAuthFailure 39 | 40 | {{< code lang="yaml" >}} 41 | alert: GitLabHighRunnerAuthFailure 42 | annotations: 43 | description: '{{ printf "%.2f" $value }}% of GitLab runner authentication attempts 44 | are failing on {{$labels.instance}}, which is above the threshold of 10%.' 45 | summary: Large percentage of runner authentication failures. 46 | expr: "100 * sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m])) 47 | \ / 48 | (sum by (instance) (rate(gitlab_ci_runner_authentication_success_total{}[5m])) 49 | \ + sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m]))) 50 | > 51 | 10 52 | " 53 | for: 5m 54 | labels: 55 | severity: warning 56 | {{< /code >}} 57 | 58 | ##### GitLabHigh5xxResponses 59 | 60 | {{< code lang="yaml" >}} 61 | alert: GitLabHigh5xxResponses 62 | annotations: 63 | description: '{{ printf "%.2f" $value }}% of all requests returned 5XX HTTP responses, 64 | which is above the threshold 10%, indicating a system issue on {{$labels.instance}}.' 65 | summary: Large rate of HTTP 5XX errors. 66 | expr: "100 * sum by (instance) (rate(http_requests_total{status=~\"^5.*\"}[5m])) / 67 | sum by (instance) (rate(http_requests_total{}[5m])) 68 | > 10 69 | " 70 | for: 5m 71 | labels: 72 | severity: critical 73 | {{< /code >}} 74 | 75 | ##### GitLabHigh4xxResponses 76 | 77 | {{< code lang="yaml" >}} 78 | alert: GitLabHigh4xxResponses 79 | annotations: 80 | description: '{{ printf "%.2f" $value }}% of all requests returned 4XX HTTP responses, 81 | which is above the threshold 10%, indicating many failed requests on {{$labels.instance}}.' 82 | summary: Large rate of HTTP 4XX errors. 
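# Per-instance percentage of requests returning 4xx status codes over 5 minutes.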
83 | expr: | 84 | 100 * sum by (instance) (rate(http_requests_total{status=~"^4.*"}[5m])) / sum by (instance) (rate(http_requests_total{}[5m])) 85 | > 10 86 | for: 5m 87 | labels: 88 | severity: warning 89 | {{< /code >}} 90 | 91 | ## Dashboards 92 | Following dashboards are generated from mixins and hosted on github: 93 | 94 | 95 | - [gitlab-overview](https://github.com/monitoring-mixins/website/blob/master/assets/gitlab/dashboards/gitlab-overview.json) 96 | -------------------------------------------------------------------------------- /site/content/go-runtime/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: go-runtime 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/go-runtime-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [go-runtime](https://github.com/monitoring-mixins/website/blob/master/assets/go-runtime/dashboards/go-runtime.json) 18 | -------------------------------------------------------------------------------- /site/content/grafana/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: grafana 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/grafana](https://github.com/grafana/grafana/tree/master/grafana-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### GrafanaAlerts 20 | 21 | ##### GrafanaRequestsFailing 22 | 23 | {{< code lang="yaml" >}} 24 | alert: GrafanaRequestsFailing 25 | annotations: 26 | message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing 27 | {{ $value | humanize }}% errors' 28 | expr: | 29 | 100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}) 30 | / 31 | sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) 32 | > 50 33 | for: 5m 34 | labels: 35 | severity: warning 36 | {{< /code >}} 37 | 38 | ## Recording rules 39 | 40 | {{< panel style="warning" >}} 41 | Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/rules.yaml). 
42 | {{< /panel >}} 43 | 44 | ### grafana_rules 45 | 46 | ##### namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m 47 | 48 | {{< code lang="yaml" >}} 49 | expr: | 50 | sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) 51 | record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m 52 | {{< /code >}} 53 | 54 | ## Dashboards 55 | Following dashboards are generated from mixins and hosted on github: 56 | 57 | 58 | - [grafana-overview](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/dashboards/grafana-overview.json) 59 | -------------------------------------------------------------------------------- /site/content/haproxy/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: haproxy 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/haproxy-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### HAProxyAlerts 20 | 21 | ##### HAProxyDroppingLogs 22 | 23 | {{< code lang="yaml" >}} 24 | alert: HAProxyDroppingLogs 25 | annotations: 26 | description: HAProxy {{$labels.job}} on {{$labels.instance}} is dropping logs. 27 | summary: HAProxy is dropping logs. 28 | expr: rate(haproxy_process_dropped_logs_total[5m]) != 0 29 | for: 10m 30 | labels: 31 | severity: critical 32 | {{< /code >}} 33 | 34 | ##### HAProxyBackendCheckFlapping 35 | 36 | {{< code lang="yaml" >}} 37 | alert: HAProxyBackendCheckFlapping 38 | annotations: 39 | description: HAProxy {{$labels.job}} backend {{$labels.proxy}} on {{$labels.instance}} 40 | has flapping checks. 41 | summary: HAProxy backend checks are flapping. 42 | expr: rate(haproxy_backend_check_up_down_total[5m]) != 0 43 | for: 10m 44 | labels: 45 | severity: critical 46 | {{< /code >}} 47 | 48 | ##### HAProxyServerCheckFlapping 49 | 50 | {{< code lang="yaml" >}} 51 | alert: HAProxyServerCheckFlapping 52 | annotations: 53 | description: HAProxy {{$labels.job}} server {{$labels.server}} on {{$labels.instance}} 54 | has flapping checks. 55 | summary: HAProxy server checks are flapping. 
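# Any up/down health-check transitions for a server within 5 minutes count as flapping.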
56 | expr: rate(haproxy_server_check_up_down_total[5m]) != 0 57 | for: 10m 58 | labels: 59 | severity: critical 60 | {{< /code >}} 61 | 62 | ## Dashboards 63 | Following dashboards are generated from mixins and hosted on github: 64 | 65 | 66 | - [haproxy-backend](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-backend.json) 67 | - [haproxy-frontend](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-frontend.json) 68 | - [haproxy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-overview.json) 69 | - [haproxy-server](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-server.json) 70 | -------------------------------------------------------------------------------- /site/content/harbor/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: harbor 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/harbor-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/harbor/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### Harbor 20 | 21 | ##### HarborComponentStatus 22 | 23 | {{< code lang="yaml" >}} 24 | alert: HarborComponentStatus 25 | annotations: 26 | description: Harbor {{ $labels.component }} has been down for more than 5 minutes 27 | summary: Harbor Component is Down. 28 | expr: | 29 | harbor_up == 0 30 | for: 5m 31 | labels: 32 | severity: critical 33 | {{< /code >}} 34 | 35 | ##### HarborProjectQuataExceeded 36 | 37 | {{< code lang="yaml" >}} 38 | alert: HarborProjectQuataExceeded 39 | annotations: 40 | description: Harbor project {{ $labels.project_name }} has exceeded the configured 41 | disk usage quota for the past 15 minutes 42 | summary: Harbor project exceeds disk usage quota. 43 | expr: | 44 | harbor_project_quota_usage_byte > harbor_project_quota_byte and on(harbor_project_quota_usage_byte) harbor_project_quota_byte != -1 45 | for: 15m 46 | labels: 47 | severity: warning 48 | {{< /code >}} 49 | 50 | ##### HarborHighErrorRate 51 | 52 | {{< code lang="yaml" >}} 53 | alert: HarborHighErrorRate 54 | annotations: 55 | description: HTTP Requests of {{ $labels.instance }} are having a high Error rate 56 | summary: Harbor high error rate. 
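# Share of Harbor core HTTP requests returning 4xx or 5xx codes; fires above 15%.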
57 | expr: sum(rate(harbor_core_http_request_total{code=~"4..|5.."}[5m]))/sum(rate(harbor_core_http_request_total[5m])) 58 | > 0.15 59 | for: 5m 60 | labels: 61 | severity: warning 62 | {{< /code >}} 63 | 64 | ## Dashboards 65 | Following dashboards are generated from mixins and hosted on github: 66 | 67 | 68 | - [harbor-overview](https://github.com/monitoring-mixins/website/blob/master/assets/harbor/dashboards/harbor-overview.json) 69 | -------------------------------------------------------------------------------- /site/content/hass/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: hass 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/hass-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [hass](https://github.com/monitoring-mixins/website/blob/master/assets/hass/dashboards/hass.json) 18 | -------------------------------------------------------------------------------- /site/content/ibm-mq/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ibm-mq 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/ibm-mq-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### ibm-mq-alerts 20 | 21 | ##### IBMMQExpiredMessages 22 | 23 | {{< code lang="yaml" >}} 24 | alert: IBMMQExpiredMessages 25 | annotations: 26 | description: The number of expired messages in the {{$labels.qmgr}} is {{$labels.value}} 27 | which is above the threshold of 2. 28 | summary: There are expired messages, which imply that application resilience is 29 | failing. 30 | expr: | 31 | sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_expired_message_count) > 2 32 | for: 5m 33 | labels: 34 | severity: critical 35 | {{< /code >}} 36 | 37 | ##### IBMMQStaleMessages 38 | 39 | {{< code lang="yaml" >}} 40 | alert: IBMMQStaleMessages 41 | annotations: 42 | description: A stale message with an age of {{$labels.value}} has been sitting in 43 | the {{$labels.queue}} which is above the threshold of 300s. 44 | summary: Stale messages have been detected. 45 | expr: | 46 | sum without (description,instance,job,platform) (ibmmq_queue_oldest_message_age) >= 300 47 | for: 5m 48 | labels: 49 | severity: warning 50 | {{< /code >}} 51 | 52 | ##### IBMMQLowDiskSpace 53 | 54 | {{< code lang="yaml" >}} 55 | alert: IBMMQLowDiskSpace 56 | annotations: 57 | description: The amount of disk space available for {{$labels.qmgr}} is at {{$labels.value}}% 58 | which is below the threshold of 5%. 59 | summary: There is limited disk available for a queue manager. 
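# Free file-system space available to the queue manager, expressed as a percentage.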
60 | expr: | 61 | sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_queue_manager_file_system_free_space_percentage) <= 5 62 | for: 5m 63 | labels: 64 | severity: critical 65 | {{< /code >}} 66 | 67 | ##### IBMMQHighQueueManagerCpuUsage 68 | 69 | {{< code lang="yaml" >}} 70 | alert: IBMMQHighQueueManagerCpuUsage 71 | annotations: 72 | description: The amount of CPU usage for the queue manager {{$labels.qmgr}} is at 73 | {{$labels.value}}% which is above the threshold of 85%. 74 | summary: There is a high CPU usage estimate for a queue manager. 75 | expr: | 76 | sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_user_cpu_time_estimate_for_queue_manager_percentage) >= 85 77 | for: 5m 78 | labels: 79 | severity: critical 80 | {{< /code >}} 81 | 82 | ## Dashboards 83 | Following dashboards are generated from mixins and hosted on github: 84 | 85 | 86 | - [ibm-mq-cluster-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-cluster-overview.json) 87 | - [ibm-mq-queue-manager-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-queue-manager-overview.json) 88 | - [ibm-mq-queue-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-queue-overview.json) 89 | - [ibm-mq-topics-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-topics-overview.json) 90 | -------------------------------------------------------------------------------- /site/content/jenkins/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: jenkins 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jenkins-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [jenkins](https://github.com/monitoring-mixins/website/blob/master/assets/jenkins/dashboards/jenkins.json) 18 | -------------------------------------------------------------------------------- /site/content/jira/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: jira 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jira-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/jira/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### alert.rules 20 | 21 | ##### LicenseExpired 22 | 23 | {{< code lang="yaml" >}} 24 | alert: LicenseExpired 25 | annotations: 26 | description: The JIRA license has expired. 27 | summary: JIRA license expired. 28 | expr: jira_license_expiry_days_gauge <= 0 29 | for: 1m 30 | labels: 31 | severity: critical 32 | {{< /code >}} 33 | 34 | ##### LicenseWarning 35 | 36 | {{< code lang="yaml" >}} 37 | alert: LicenseWarning 38 | annotations: 39 | description: The JIRA license will expire in less than one week. 40 | summary: License expiring soon. 
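# Fires while between 1 and 7 days remain before the license expires.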
41 | expr: jira_license_expiry_days_gauge <= 7 and jira_license_expiry_days_gauge > 0 42 | for: 1m 43 | labels: 44 | severity: warning 45 | {{< /code >}} 46 | 47 | ##### NoUserCapacity 48 | 49 | {{< code lang="yaml" >}} 50 | alert: NoUserCapacity 51 | annotations: 52 | description: There is no more capacity for additional users to be added to the system. 53 | summary: All available accounts are taken. 54 | expr: jira_all_users_gauge/jira_allowed_users_gauge == 1 55 | for: 1m 56 | labels: 57 | severity: critical 58 | {{< /code >}} 59 | 60 | ##### EmailErrorsHigh 61 | 62 | {{< code lang="yaml" >}} 63 | alert: EmailErrorsHigh 64 | annotations: 65 | description: More than 1% of emails have resulted in an error in the past minute. 66 | summary: Email errors are high. 67 | expr: jira_mail_queue_error_gauge /jira_mail_queue_gauge > 0.01 68 | for: 1m 69 | labels: 70 | severity: critical 71 | {{< /code >}} 72 | 73 | ## Dashboards 74 | Following dashboards are generated from mixins and hosted on github: 75 | 76 | 77 | - [jira-overview](https://github.com/monitoring-mixins/website/blob/master/assets/jira/dashboards/jira-overview.json) 78 | -------------------------------------------------------------------------------- /site/content/jvm/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: jvm 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jvm-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/jvm/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### jvm-jvm-alerts 20 | 21 | ##### JvmMemoryFillingUp 22 | 23 | {{< code lang="yaml" >}} 24 | alert: JvmMemoryFillingUp 25 | annotations: 26 | description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last 27 | 5 minutes on {{$labels.instance}}, which is above the threshold of 80%. 28 | summary: JVM heap memory filling up. 29 | expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without (id) 30 | (jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80 31 | for: 5m 32 | keep_firing_for: 5m 33 | labels: 34 | severity: warning 35 | {{< /code >}} 36 | 37 | ##### JvmThreadsDeadlocked 38 | 39 | {{< code lang="yaml" >}} 40 | alert: JvmThreadsDeadlocked 41 | annotations: 42 | description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}} 43 | are in a cyclic dependency with each other. The restart is required to resolve 44 | the deadlock.' 45 | summary: JVM deadlock detected. 
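# Any non-zero count of deadlocked JVM threads triggers the alert.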
46 | expr: (jvm_threads_deadlocked{}) > 0 47 | for: 2m 48 | keep_firing_for: 5m 49 | labels: 50 | severity: critical 51 | {{< /code >}} 52 | 53 | ## Dashboards 54 | Following dashboards are generated from mixins and hosted on github: 55 | 56 | 57 | - [jvm-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/jvm/dashboards/jvm-dashboard.json) 58 | -------------------------------------------------------------------------------- /site/content/kube-state-metrics/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: kube-state-metrics 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/kubernetes/kube-state-metrics](https://github.com/kubernetes/kube-state-metrics/tree/master/jsonnet/kube-state-metrics-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kube-state-metrics/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### kube-state-metrics 20 | 21 | ##### KubeStateMetricsListErrors 22 | 23 | {{< code lang="yaml" >}} 24 | alert: KubeStateMetricsListErrors 25 | annotations: 26 | description: kube-state-metrics is experiencing errors at an elevated rate in list 27 | operations. This is likely causing it to not be able to expose metrics about Kubernetes 28 | objects correctly or at all. 29 | summary: kube-state-metrics is experiencing errors in list operations. 30 | expr: | 31 | (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) 32 | / 33 | sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) 34 | > 0.01 35 | for: 15m 36 | labels: 37 | severity: critical 38 | {{< /code >}} 39 | 40 | ##### KubeStateMetricsWatchErrors 41 | 42 | {{< code lang="yaml" >}} 43 | alert: KubeStateMetricsWatchErrors 44 | annotations: 45 | description: kube-state-metrics is experiencing errors at an elevated rate in watch 46 | operations. This is likely causing it to not be able to expose metrics about Kubernetes 47 | objects correctly or at all. 48 | summary: kube-state-metrics is experiencing errors in watch operations. 49 | expr: | 50 | (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) 51 | / 52 | sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) 53 | > 0.01 54 | for: 15m 55 | labels: 56 | severity: critical 57 | {{< /code >}} 58 | 59 | ##### KubeStateMetricsShardingMismatch 60 | 61 | {{< code lang="yaml" >}} 62 | alert: KubeStateMetricsShardingMismatch 63 | annotations: 64 | description: kube-state-metrics pods are running with different --total-shards configuration, 65 | some Kubernetes objects may be exposed multiple times or not exposed at all. 66 | summary: kube-state-metrics sharding is misconfigured. 67 | expr: | 68 | stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 69 | for: 15m 70 | labels: 71 | severity: critical 72 | {{< /code >}} 73 | 74 | ##### KubeStateMetricsShardsMissing 75 | 76 | {{< code lang="yaml" >}} 77 | alert: KubeStateMetricsShardsMissing 78 | annotations: 79 | description: kube-state-metrics shards are missing, some Kubernetes objects are 80 | not being exposed. 81 | summary: kube-state-metrics shards are missing. 
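# Treats shard ordinals as a bitmask and compares it with the expected total shard count; a non-zero difference means shards are missing.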
82 | expr: | 83 | 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 84 | - 85 | sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) 86 | != 0 87 | for: 15m 88 | labels: 89 | severity: critical 90 | {{< /code >}} 91 | 92 | -------------------------------------------------------------------------------- /site/content/memcached/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: memcached 3 | --- 4 | 5 | ## Overview 6 | 7 | Grafana dashboard for operating Memcached, in the form of a monitoring mixin. 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/memcached-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/memcached/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### memcached 20 | 21 | ##### MemcachedDown 22 | 23 | {{< code lang="yaml" >}} 24 | alert: MemcachedDown 25 | annotations: 26 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} is down 27 | for more than 15 minutes. 28 | summary: Memcached instance is down. 29 | expr: | 30 | memcached_up == 0 31 | for: 15m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### MemcachedConnectionLimitApproaching 37 | 38 | {{< code lang="yaml" >}} 39 | alert: MemcachedConnectionLimitApproaching 40 | annotations: 41 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection 42 | usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes. 43 | summary: Memcached max connection limit is approaching. 44 | expr: | 45 | (memcached_current_connections / memcached_max_connections * 100) > 80 46 | for: 15m 47 | labels: 48 | severity: warning 49 | {{< /code >}} 50 | 51 | ##### MemcachedConnectionLimitApproaching 52 | 53 | {{< code lang="yaml" >}} 54 | alert: MemcachedConnectionLimitApproaching 55 | annotations: 56 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection 57 | usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes. 58 | summary: Memcached connections at critical level. 59 | expr: | 60 | (memcached_current_connections / memcached_max_connections * 100) > 95 61 | for: 15m 62 | labels: 63 | severity: critical 64 | {{< /code >}} 65 | 66 | ##### MemcachedOutOfMemoryErrors 67 | 68 | {{< code lang="yaml" >}} 69 | alert: MemcachedOutOfMemoryErrors 70 | annotations: 71 | description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} has OutOfMemory 72 | errors for at least 15 minutes, current rate is {{ printf "%0.0f" $value }} 73 | summary: Memcached has OutOfMemory errors. 
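# Out-of-memory error rate summed across all slabs.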
74 | expr: | 75 | sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0 76 | for: 15m 77 | labels: 78 | severity: warning 79 | {{< /code >}} 80 | 81 | ## Dashboards 82 | Following dashboards are generated from mixins and hosted on github: 83 | 84 | 85 | - [memcached-overview](https://github.com/monitoring-mixins/website/blob/master/assets/memcached/dashboards/memcached-overview.json) 86 | -------------------------------------------------------------------------------- /site/content/mysql/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: mysql 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/prometheus/mysqld_exporter](https://github.com/prometheus/mysqld_exporter/tree/master/mysqld-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [mysql-overview](https://github.com/monitoring-mixins/website/blob/master/assets/mysql/dashboards/mysql-overview.json) 18 | -------------------------------------------------------------------------------- /site/content/nginx/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: nginx 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nginx-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [nginx-logs](https://github.com/monitoring-mixins/website/blob/master/assets/nginx/dashboards/nginx-logs.json) 18 | - [nginx-metrics](https://github.com/monitoring-mixins/website/blob/master/assets/nginx/dashboards/nginx-metrics.json) 19 | -------------------------------------------------------------------------------- /site/content/nodejs/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: nodejs 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nodejs-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/nodejs/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### NodejsAlerts 20 | 21 | ##### NodejsDown 22 | 23 | {{< code lang="yaml" >}} 24 | alert: NodejsDown 25 | annotations: 26 | description: Node.js {{$labels.job}} on {{$labels.instance}} is not up. 27 | summary: Node.js not up. 
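# Fires when the nodejs_version_info metric is absent or reports no running instances.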
28 | expr: absent(nodejs_version_info) or (sum by (version) (nodejs_version_info) < 1) 29 | for: 0m 30 | labels: 31 | severity: critical 32 | {{< /code >}} 33 | 34 | ## Dashboards 35 | Following dashboards are generated from mixins and hosted on github: 36 | 37 | 38 | - [nodejs-overview](https://github.com/monitoring-mixins/website/blob/master/assets/nodejs/dashboards/nodejs-overview.json) 39 | -------------------------------------------------------------------------------- /site/content/nomad/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: nomad 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nomad-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [nomad-cluster](https://github.com/monitoring-mixins/website/blob/master/assets/nomad/dashboards/nomad-cluster.json) 18 | - [nomad-jobs](https://github.com/monitoring-mixins/website/blob/master/assets/nomad/dashboards/nomad-jobs.json) 19 | -------------------------------------------------------------------------------- /site/content/nsq/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: nsq 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nsq-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/nsq/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### nsq 20 | 21 | ##### NsqTopicDepthIncreasing 22 | 23 | {{< code lang="yaml" >}} 24 | alert: NsqTopicDepthIncreasing 25 | annotations: 26 | description: | 27 | Topic {{ $labels.topic }} depth is higher than 100. The current queue is {{ $value }}. 28 | summary: Topic depth is increasing. 29 | expr: | 30 | sum by (topic) (nsq_topic_depth) > 100 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### NsqChannelDepthIncreasing 37 | 38 | {{< code lang="yaml" >}} 39 | alert: NsqChannelDepthIncreasing 40 | annotations: 41 | description: | 42 | Channel {{ $labels.channel }} depth in topic {{ $labels.topic }} is higher than 100. The current queue is {{ $value }}. 43 | summary: Topic channel depth is increasing. 
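# Channel backend depth summed per topic; fires when the total exceeds 100 queued messages.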
44 | expr: | 45 | sum by (topic) (nsq_topic_channel_backend_depth) > 100 46 | for: 5m 47 | labels: 48 | severity: critical 49 | {{< /code >}} 50 | 51 | ## Dashboards 52 | Following dashboards are generated from mixins and hosted on github: 53 | 54 | 55 | - [nsq-instances](https://github.com/monitoring-mixins/website/blob/master/assets/nsq/dashboards/nsq-instances.json) 56 | - [nsq-topics](https://github.com/monitoring-mixins/website/blob/master/assets/nsq/dashboards/nsq-topics.json) 57 | -------------------------------------------------------------------------------- /site/content/openldap/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: openldap 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/openldap-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/openldap/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### openldap-alerts 20 | 21 | ##### OpenLDAPConnectionSpike 22 | 23 | {{< code lang="yaml" >}} 24 | alert: OpenLDAPConnectionSpike 25 | annotations: 26 | description: There are {{ printf "%.0f" $value }} OpenLDAP connections on instance 27 | {{$labels.instance}}, which is above the threshold of 100. 28 | summary: A sudden spike in OpenLDAP connections indicates potential high usage or 29 | security issues. 30 | expr: | 31 | increase(openldap_monitor_counter_object{dn="cn=Current,cn=Connections,cn=Monitor"}[5m]) > 100 32 | for: 5m 33 | labels: 34 | severity: warning 35 | {{< /code >}} 36 | 37 | ##### OpenLDAPHighSearchOperationRateSpike 38 | 39 | {{< code lang="yaml" >}} 40 | alert: OpenLDAPHighSearchOperationRateSpike 41 | annotations: 42 | description: The rate of search operations in OpenLDAP on instance {{$labels.instance}} 43 | has increased by {{ printf "%.0f" $value }} percent in the last 5 minutes, compared 44 | to the average over the last 15 minutes, which is above the threshold of 200 percent. 45 | summary: A significant spike in OpenLDAP search operations indicates inefficient 46 | queries, potential abuse, or unintended heavy load. 47 | expr: "100 * ( 48 | rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[5m]) 49 | 50 | / 51 | clamp_min(rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[15m] 52 | offset 5m), 0.0001) 53 | ) > 200 54 | " 55 | for: 5m 56 | labels: 57 | severity: warning 58 | {{< /code >}} 59 | 60 | ##### OpenLDAPDialFailures 61 | 62 | {{< code lang="yaml" >}} 63 | alert: OpenLDAPDialFailures 64 | annotations: 65 | description: LDAP dial failures on instance {{$labels.instance}} have increased 66 | by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold 67 | of 10. 68 | summary: Significant increase in LDAP dial failures indicates network issues, problems 69 | with the LDAP service, or configuration errors that may lead to service unavailability. 
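# Counts dial attempts with a non-ok result over the last 10 minutes.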
70 | expr: | 71 | increase(openldap_dial{result!="ok"}[10m]) > 10 72 | for: 10m 73 | labels: 74 | severity: warning 75 | {{< /code >}} 76 | 77 | ##### OpenLDAPBindFailureRateIncrease 78 | 79 | {{< code lang="yaml" >}} 80 | alert: OpenLDAPBindFailureRateIncrease 81 | annotations: 82 | description: LDAP bind failures on instance {{$labels.instance}} have increased 83 | by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold 84 | of 10. 85 | summary: Significant increase in LDAP bind failures indicates authentication issues, 86 | potential security threats or problems with user directories. 87 | expr: | 88 | increase(openldap_bind{result!="ok"}[10m]) > 10 89 | for: 10m 90 | labels: 91 | severity: warning 92 | {{< /code >}} 93 | 94 | ## Dashboards 95 | Following dashboards are generated from mixins and hosted on github: 96 | 97 | 98 | - [*](https://github.com/monitoring-mixins/website/blob/master/assets/openldap/dashboards/*.json) 99 | -------------------------------------------------------------------------------- /site/content/oracledb/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: oracledb 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/oracledb-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/oracledb/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### OracleDBAlerts 20 | 21 | ##### OracledbReachingSessionLimit 22 | 23 | {{< code lang="yaml" >}} 24 | alert: OracledbReachingSessionLimit 25 | annotations: 26 | description: '{{ printf "%.2f" $value }}% of sessions are being utilized which is 27 | above the threshold of 85%. This could mean that {{$labels.instance}} is being overutilized.' 28 | summary: The number of sessions being utilized exceeded 85%. 29 | expr: | 30 | oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### OracledbReachingProcessLimit 37 | 38 | {{< code lang="yaml" >}} 39 | alert: OracledbReachingProcessLimit 40 | annotations: 41 | description: '{{ printf "%.2f" $value }}% of processes are being utilized which is 42 | above the threshold of 85%. This could potentially mean that {{$labels.instance}} 43 | runs out of processes it can spin up.' 44 | summary: The number of processes being utilized exceeded the threshold of 85%. 45 | expr: | 46 | oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 47 | for: 5m 48 | labels: 49 | severity: critical 50 | {{< /code >}} 51 | 52 | ##### OracledbTablespaceReachingCapacity 53 | 54 | {{< code lang="yaml" >}} 55 | alert: OracledbTablespaceReachingCapacity 56 | annotations: 57 | description: '{{ printf "%.2f" $value }}% of bytes are being utilized by the tablespace 58 | {{$labels.tablespace}} on the instance {{$labels.instance}}, which is above the 59 | threshold of 85%.' 60 | summary: A tablespace is using more than 85% of its maximum allotted space.
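# Used tablespace bytes as a percentage of the configured maximum.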
61 | expr: | 62 | oracledb_tablespace_bytes / oracledb_tablespace_max_bytes * 100 > 85 63 | for: 5m 64 | labels: 65 | severity: critical 66 | {{< /code >}} 67 | 68 | ## Dashboards 69 | Following dashboards are generated from mixins and hosted on github: 70 | 71 | 72 | - [oracledb-overview](https://github.com/monitoring-mixins/website/blob/master/assets/oracledb/dashboards/oracledb-overview.json) 73 | -------------------------------------------------------------------------------- /site/content/python-runtime/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: python-runtime 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/python-runtime-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [python-runtime](https://github.com/monitoring-mixins/website/blob/master/assets/python-runtime/dashboards/python-runtime.json) 18 | -------------------------------------------------------------------------------- /site/content/rabbitmq/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: rabbitmq 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/rabbitmq-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/rabbitmq/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### RabbitMQClusterAlerts 20 | 21 | ##### RabbitMQMemoryHigh 22 | 23 | {{< code lang="yaml" >}} 24 | alert: RabbitMQMemoryHigh 25 | annotations: 26 | description: A node {{ $labels.instance }} is using more than 90% of allocated RAM. 27 | summary: RabbitMQ memory usage is high. 28 | expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes 29 | * 100 > 90 30 | for: 5m 31 | labels: 32 | severity: warning 33 | {{< /code >}} 34 | 35 | ##### RabbitMQFileDescriptorsUsage 36 | 37 | {{< code lang="yaml" >}} 38 | alert: RabbitMQFileDescriptorsUsage 39 | annotations: 40 | description: A node {{ $labels.instance }} is using more than 90% of file descriptors. 41 | summary: RabbitMQ file descriptors usage is high. 42 | expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 43 | for: 5m 44 | labels: 45 | severity: warning 46 | {{< /code >}} 47 | 48 | ##### RabbitMQUnroutableMessages 49 | 50 | {{< code lang="yaml" >}} 51 | alert: RabbitMQUnroutableMessages 52 | annotations: 53 | description: A queue has unroutable messages on {{ $labels.instance }}. 54 | summary: A RabbitMQ queue has unroutable messages. 55 | expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) 56 | > 0 57 | for: 5m 58 | labels: 59 | severity: warning 60 | {{< /code >}} 61 | 62 | ##### RabbitMQNodeNotDistributed 63 | 64 | {{< code lang="yaml" >}} 65 | alert: RabbitMQNodeNotDistributed 66 | annotations: 67 | description: "Distribution link state is not 'up' on {{ $labels.instance }}, current 68 | value is {{ $value }}. 
69 | Note: The state is represented as a numerical value where 70 | pending=1, up_pending=2 and up=3." 71 | summary: RabbitMQ node not distributed, link state is down. 72 | expr: erlang_vm_dist_node_state{rabbitmq_cluster!=""} < 3 73 | for: 5m 74 | labels: 75 | severity: critical 76 | {{< /code >}} 77 | 78 | ## Dashboards 79 | Following dashboards are generated from mixins and hosted on github: 80 | 81 | 82 | - [erlang-memory-allocators](https://github.com/monitoring-mixins/website/blob/master/assets/rabbitmq/dashboards/erlang-memory-allocators.json) 83 | - [rabbitmq-overview](https://github.com/monitoring-mixins/website/blob/master/assets/rabbitmq/dashboards/rabbitmq-overview.json) 84 | -------------------------------------------------------------------------------- /site/content/rclone/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: rclone 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/rclone-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [rclone](https://github.com/monitoring-mixins/website/blob/master/assets/rclone/dashboards/rclone.json) 18 | -------------------------------------------------------------------------------- /site/content/redis/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: redis 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/redis-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [*](https://github.com/monitoring-mixins/website/blob/master/assets/redis/dashboards/*.json) 18 | -------------------------------------------------------------------------------- /site/content/ruby/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ruby 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/ruby-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [ruby-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ruby/dashboards/ruby-overview.json) 18 | -------------------------------------------------------------------------------- /site/content/sealed-secrets/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: sealed-secrets 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/bitnami-labs/sealed-secrets](https://github.com/bitnami-labs/sealed-secrets/tree/master/contrib/prometheus-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/sealed-secrets/alerts.yaml). 
17 | {{< /panel >}} 18 | 19 | ### sealed-secrets 20 | 21 | ##### SealedSecretsUnsealErrorHigh 22 | 23 | {{< code lang="yaml" >}} 24 | alert: SealedSecretsUnsealErrorHigh 25 | annotations: 26 | description: High number of errors during unsealing Sealed Secrets in {{ $labels.namespace 27 | }} namespace. 28 | runbook_url: https://github.com/bitnami-labs/sealed-secrets 29 | summary: Sealed Secrets Unseal Error High 30 | expr: | 31 | sum by (reason, namespace) (rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0 32 | labels: 33 | severity: warning 34 | {{< /code >}} 35 | 36 | ## Dashboards 37 | Following dashboards are generated from mixins and hosted on github: 38 | 39 | 40 | - [sealed-secrets-controller](https://github.com/monitoring-mixins/website/blob/master/assets/sealed-secrets/dashboards/sealed-secrets-controller.json) 41 | -------------------------------------------------------------------------------- /site/content/spark/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/spark-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [spark-metrics](https://github.com/monitoring-mixins/website/blob/master/assets/spark/dashboards/spark-metrics.json) 18 | -------------------------------------------------------------------------------- /site/content/spinnaker/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spinnaker 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/uneeq-oss/spinnaker-mixin.git](https://github.com/uneeq-oss/spinnaker-mixin.git) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### igor 20 | 21 | ##### PollingMonitorItemsOverThreshold 22 | 23 | {{< code lang="yaml" >}} 24 | alert: PollingMonitorItemsOverThreshold 25 | annotations: 26 | description: '{{ $labels.monitor }} polling monitor for {{ $labels.partition }} 27 | threshold exceeded, preventing pipeline triggers.' 28 | runbook_url: https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds 29 | summary: Polling monitor item threshold exceeded. 
30 | expr: sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ## Dashboards 37 | Following dashboards are generated from mixins and hosted on github: 38 | 39 | 40 | - [clouddriver](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/clouddriver.json) 41 | - [deck](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/deck.json) 42 | - [echo](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/echo.json) 43 | - [fiat](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/fiat.json) 44 | - [front50](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/front50.json) 45 | - [gate](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/gate.json) 46 | - [igor](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/igor.json) 47 | - [orca](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/orca.json) 48 | - [rosco](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/rosco.json) 49 | - [spinnaker-application-details](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-application-details.json) 50 | - [spinnaker-aws-platform](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-aws-platform.json) 51 | - [spinnaker-google-platform](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-google-platform.json) 52 | - [spinnaker-key-metrics](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-key-metrics.json) 53 | - [spinnaker-kubernetes-platform](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-kubernetes-platform.json) 54 | - [spinnaker-minimalist](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-minimalist.json) 55 | -------------------------------------------------------------------------------- /site/content/spring-boot/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spring-boot 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/spring-boot-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/spring-boot/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### jvm-micrometer-jvm-alerts 20 | 21 | ##### JvmMemoryFillingUp 22 | 23 | {{< code lang="yaml" >}} 24 | alert: JvmMemoryFillingUp 25 | annotations: 26 | description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last 27 | 5 minutes on {{$labels.instance}}, which is above the threshold of 80%. 28 | summary: JVM heap memory filling up. 
29 | expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without 30 | (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80 31 | for: 5m 32 | keep_firing_for: 5m 33 | labels: 34 | severity: warning 35 | {{< /code >}} 36 | 37 | ## Dashboards 38 | Following dashboards are generated from mixins and hosted on github: 39 | 40 | 41 | - [jvm-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/spring-boot/dashboards/jvm-dashboard.json) 42 | -------------------------------------------------------------------------------- /site/content/supabase/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: supabase 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/supabase-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [supabase](https://github.com/monitoring-mixins/website/blob/master/assets/supabase/dashboards/supabase.json) 18 | -------------------------------------------------------------------------------- /site/content/tensorflow/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: tensorflow 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/tensorflow-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/tensorflow/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### TensorFlowServingAlerts 20 | 21 | ##### TensorFlowModelRequestHighErrorRate 22 | 23 | {{< code lang="yaml" >}} 24 | alert: TensorFlowModelRequestHighErrorRate 25 | annotations: 26 | description: '{{ printf "%.2f" $value }}% of all model requests are not successful, 27 | which is above the threshold 30%, indicating a potentially larger issue for {{$labels.instance}}' 28 | summary: More than 30% of all model requests are not successful. 29 | expr: | 30 | 100 * sum(rate(:tensorflow:serving:request_count{status!="OK"}[5m])) by (instance) / sum(rate(:tensorflow:serving:request_count[5m])) by (instance) > 30 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### TensorFlowServingHighBatchQueuingLatency 37 | 38 | {{< code lang="yaml" >}} 39 | alert: TensorFlowServingHighBatchQueuingLatency 40 | annotations: 41 | description: Batch queuing latency greater than {{ printf "%.2f" $value }}µs, which 42 | is above the threshold 5000000µs, indicating a potentially larger issue for {{$labels.instance}} 43 | summary: Batch queuing latency more than 5000000µs. 
44 | expr: | 45 | increase(:tensorflow:serving:batching_session:queuing_latency_sum[2m]) / increase(:tensorflow:serving:batching_session:queuing_latency_count[2m]) > 5000000 46 | for: 5m 47 | labels: 48 | severity: warning 49 | {{< /code >}} 50 | 51 | ## Dashboards 52 | Following dashboards are generated from mixins and hosted on github: 53 | 54 | 55 | - [tensorflow-overview](https://github.com/monitoring-mixins/website/blob/master/assets/tensorflow/dashboards/tensorflow-overview.json) 56 | -------------------------------------------------------------------------------- /site/content/traefik/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: traefik 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/traefik-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [traefikdash](https://github.com/monitoring-mixins/website/blob/master/assets/traefik/dashboards/traefikdash.json) 18 | -------------------------------------------------------------------------------- /site/content/ubnt-edgerouter/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ubnt-edgerouter 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/ubnt-edgerouter-mixin) 11 | {{< /panel >}} 12 | 13 | ## Recording rules 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ubnt-edgerouter/rules.yaml). 
17 | {{< /panel >}} 18 | 19 | ### ubnt.rules 20 | 21 | ##### ifNiceName 22 | 23 | {{< code lang="yaml" >}} 24 | expr: label_join(ifAdminStatus,"nicename", ":", "ifName", "ifAlias") 25 | record: ifNiceName 26 | {{< /code >}} 27 | 28 | ## Dashboards 29 | Following dashboards are generated from mixins and hosted on github: 30 | 31 | 32 | - [ubnt-edgrouterx-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ubnt-edgerouter/dashboards/ubnt-edgrouterx-overview.json) 33 | -------------------------------------------------------------------------------- /site/content/vault/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: vault 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/vault-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [vault](https://github.com/monitoring-mixins/website/blob/master/assets/vault/dashboards/vault.json) 18 | -------------------------------------------------------------------------------- /site/content/velero/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: velero 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/velero-2-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/velero/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### velero 20 | 21 | ##### VeleroBackupFailure 22 | 23 | {{< code lang="yaml" >}} 24 | alert: VeleroBackupFailure 25 | annotations: 26 | description: | 27 | Backup failures detected on {{ $labels.instance }}. This could lead to data loss or inability to recover in case of a disaster. 28 | summary: Velero backup failures detected. 29 | expr: | 30 | increase(velero_backup_failure_total{job="integrations/velero"}[5m]) > 0 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### VeleroHighBackupDuration 37 | 38 | {{< code lang="yaml" >}} 39 | alert: VeleroHighBackupDuration 40 | annotations: 41 | description: | 42 | Backup duration on {{ $labels.instance }} is higher than the average duration over the past 48 hours. This could indicate performance issues or network congestion. The current value is {{ $value | printf "%.2f" }} seconds. 43 | summary: Velero backups taking longer than usual. 44 | expr: | 45 | histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[5m])) by (le, schedule)) > 1.2 * 1.2 * avg_over_time(histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[48h])) by (le, schedule))[5m:]) 46 | for: 5m 47 | labels: 48 | severity: warning 49 | {{< /code >}} 50 | 51 | ##### VeleroHighRestoreFailureRate 52 | 53 | {{< code lang="yaml" >}} 54 | alert: VeleroHighRestoreFailureRate 55 | annotations: 56 | description: | 57 | Restore failures detected on {{ $labels.instance }}. This could prevent timely data recovery and business continuity. 58 | summary: Velero restore failures detected. 
59 | expr: | 60 | increase(velero_restore_failed_total{job="integrations/velero"}[5m]) > 0 61 | for: 5m 62 | labels: 63 | severity: critical 64 | {{< /code >}} 65 | 66 | ##### VeleroUpStatus 67 | 68 | {{< code lang="yaml" >}} 69 | alert: VeleroUpStatus 70 | annotations: 71 | description: "Cannot find any metrics related to Velero on {{ $labels.instance }}. 72 | This may indicate further issues with Velero or the scraping agent. 73 | " 74 | summary: Velero is down. 75 | expr: | 76 | up{job="integrations/velero"} != 0 77 | for: 5m 78 | labels: 79 | severity: critical 80 | {{< /code >}} 81 | 82 | ## Dashboards 83 | Following dashboards are generated from mixins and hosted on github: 84 | 85 | 86 | - [*](https://github.com/monitoring-mixins/website/blob/master/assets/velero/dashboards/*.json) 87 | -------------------------------------------------------------------------------- /site/content/wildfly/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: wildfly 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/wildfly-mixin) 11 | {{< /panel >}} 12 | 13 | ## Alerts 14 | 15 | {{< panel style="warning" >}} 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/wildfly/alerts.yaml). 17 | {{< /panel >}} 18 | 19 | ### wildfly 20 | 21 | ##### HighPercentageOfErrorResponses 22 | 23 | {{< code lang="yaml" >}} 24 | alert: HighPercentageOfErrorResponses 25 | annotations: 26 | description: | 27 | The percentage of error responses is {{ printf "%.2f" $value }} on {{ $labels.instance }} - {{ $labels.server }} which is higher than {{30 }}. 28 | summary: Large percentage of requests are resulting in 5XX responses. 29 | expr: | 30 | sum by (job, instance, server) (increase(wildfly_undertow_error_count_total{}[5m]) / increase(wildfly_undertow_request_count_total{}[5m])) * 100 > 30 31 | for: 5m 32 | labels: 33 | severity: critical 34 | {{< /code >}} 35 | 36 | ##### HighNumberOfRejectedSessionsForDeployment 37 | 38 | {{< code lang="yaml" >}} 39 | alert: HighNumberOfRejectedSessionsForDeployment 40 | annotations: 41 | description: | 42 | Deployment {{ $labels.deployment }} on {{ $labels.instance }} is exceeding the threshold for rejected sessions: {{ printf "%.0f" $value }} is higher than 20. 43 | summary: Large number of sessions are being rejected for a deployment.
44 | expr: | 45 | sum by (deployment, instance, job) (increase(wildfly_undertow_rejected_sessions_total{}[5m])) > 20 46 | for: 5m 47 | labels: 48 | severity: critical 49 | {{< /code >}} 50 | 51 | ## Dashboards 52 | Following dashboards are generated from mixins and hosted on github: 53 | 54 | 55 | - [wildfly-datasource](https://github.com/monitoring-mixins/website/blob/master/assets/wildfly/dashboards/wildfly-datasource.json) 56 | - [wildfly-overview](https://github.com/monitoring-mixins/website/blob/master/assets/wildfly/dashboards/wildfly-overview.json) 57 | -------------------------------------------------------------------------------- /site/content/wso2-enterprise-integrator/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: wso2-enterprise-integrator 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/wso2-enterprise-integrator-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [API_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/API_Metrics.json) 18 | - [Cluster_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Cluster_Metrics.json) 19 | - [Inbound_Endpoint_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Inbound_Endpoint_Metrics.json) 20 | - [Node_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Node_Metrics.json) 21 | - [Proxy_Service_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Proxy_Service_Metrics.json) 22 | -------------------------------------------------------------------------------- /site/content/wso2-streaming-integrator/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: wso2-streaming-integrator 3 | --- 4 | 5 | ## Overview 6 | 7 | 8 | 9 | {{< panel style="danger" >}} 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/wso2-streaming-integrator-mixin) 11 | {{< /panel >}} 12 | 13 | ## Dashboards 14 | Following dashboards are generated from mixins and hosted on github: 15 | 16 | 17 | - [Siddhi_aggregation](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_aggregation.json) 18 | - [Siddhi_ondemandquery](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_ondemandquery.json) 19 | - [Siddhi_overall](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_overall.json) 20 | - [Siddhi_query](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_query.json) 21 | - [Siddhi_server](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_server.json) 22 | - [Siddhi_sink](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_sink.json) 23 | - 
[Siddhi_source](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_source.json) 24 | - [Siddhi_stream](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_stream.json) 25 | - [Siddhi_table](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_table.json) 26 | - [StreamingIntegrator_apps](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/StreamingIntegrator_apps.json) 27 | - [StreamingIntegrator_overall](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/StreamingIntegrator_overall.json) 28 | -------------------------------------------------------------------------------- /site/layouts/_default/baseof.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{- partial "head.html" . -}} 4 | 5 | 6 | {{- partial "header.html" . -}} 7 | 8 |
9 |
10 | 11 |
12 | {{- partial "menu.html" . -}} 13 |
14 | 15 | {{- if and (ne .Site.Params.toc false) (ne .Params.toc false) }} 16 |
17 | {{- partial "tableofcontents.html" . -}} 18 |
19 |
20 | {{else}} 21 |
22 | {{end}} 23 | 24 | {{- block "main" . }}{{- end }} 25 | 26 |
27 | {{- if and (ne .Site.Params.disableReadmoreNav true) (ne .Params.disableReadmoreNav true) -}} 28 |
29 | {{ partial "next-prev-page.html" . }} 30 |
31 | {{- end -}} 32 |
33 | 34 |
35 | 36 |
37 | 38 |
39 | 40 | {{- partial "footer.html" . -}} 41 | 42 | 43 | 44 | --------------------------------------------------------------------------------