├── .github
    └── workflows
    │   ├── cron.yaml
    │   └── tests.yaml
├── .gitignore
├── .gitmodules
├── LICENSE
├── Makefile
├── README.md
├── assets
    ├── MSSQL
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── mssql-overview.json
    │   │   └── mssql-pages.json
    │   └── rules.yaml
    ├── aerospike
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── aerospike-instance-overview.json
    │   │   ├── aerospike-logs.json
    │   │   ├── aerospike-namespace-overview.json
    │   │   └── aerospike-overview.json
    │   └── rules.yaml
    ├── alertmanager
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── alertmanager-overview.json
    │   └── rules.yaml
    ├── apache-activemq
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apache-activemq-cluster-overview.json
    │   │   ├── apache-activemq-instance-overview.json
    │   │   ├── apache-activemq-logs.json
    │   │   ├── apache-activemq-queue-overview.json
    │   │   └── apache-activemq-topic-overview.json
    │   └── rules.yaml
    ├── apache-airflow
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── apache-airflow-overview.json
    │   └── rules.yaml
    ├── apache-camel
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── apache-camel-micrometer.json
    │   └── rules.yaml
    ├── apache-cassandra
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── cassandra-keyspaces.json
    │   │   ├── cassandra-nodes.json
    │   │   └── cassandra-overview.json
    │   └── rules.yaml
    ├── apache-couchdb
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── couchdb-nodes.json
    │   │   └── couchdb-overview.json
    │   └── rules.yaml
    ├── apache-hadoop
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apache-hadoop-datanode-overview.json
    │   │   ├── apache-hadoop-namenode-overview.json
    │   │   ├── apache-hadoop-nodemanager-overview.json
    │   │   └── apache-hadoop-resourcemanager-overview.json
    │   └── rules.yaml
    ├── apache-hbase
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apache-hbase-cluster-overview.json
    │   │   ├── apache-hbase-logs.json
    │   │   └── apache-hbase-regionserver-overview.json
    │   └── rules.yaml
    ├── apache-http
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── apache-http.json
    │   └── rules.yaml
    ├── apache-mesos
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── apache-mesos-overview.json
    │   └── rules.yaml
    ├── apache-solr
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apache-solr-cluster-overview.json
    │   │   ├── apache-solr-logs-overview.json
    │   │   ├── apache-solr-query-performance.json
    │   │   └── apache-solr-resource-monitoring.json
    │   └── rules.yaml
    ├── apache-tomcat
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apache-tomcat-hosts.json
    │   │   └── apache-tomcat-overview.json
    │   └── rules.yaml
    ├── argo-cd-2
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── argo-cd-application-overview.json
    │   │   ├── argo-cd-notifications-overview.json
    │   │   └── argo-cd-operational-overview.json
    │   └── rules.yaml
    ├── argocd
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── argocd-overview.json
    │   └── rules.yaml
    ├── asterisk
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── asterisk-logs.json
    │   │   └── asterisk-overview.json
    │   └── rules.yaml
    ├── awx
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── awx.json
    │   └── rules.yaml
    ├── blackbox_exporter
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── blackbox-exporter.json
    │   └── rules.yaml
    ├── caddy
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── caddy-overview.json
    │   └── rules.yaml
    ├── celery
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── celery-tasks-by-task.json
    │   │   └── celery-tasks-overview.json
    │   └── rules.yaml
    ├── ceph
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── ceph-cluster-advanced.json
    │   │   ├── ceph-nvmeof-performance.json
    │   │   ├── ceph-nvmeof.json
    │   │   ├── cephfs-overview.json
    │   │   ├── cephfsdashboard.json
    │   │   ├── host-details.json
    │   │   ├── hosts-overview.json
    │   │   ├── multi-cluster-overview.json
    │   │   ├── osd-device-details.json
    │   │   ├── osds-overview.json
    │   │   ├── pool-detail.json
    │   │   ├── pool-overview.json
    │   │   ├── radosgw-detail.json
    │   │   ├── radosgw-overview.json
    │   │   ├── radosgw-sync-overview.json
    │   │   ├── rbd-details.json
    │   │   ├── rbd-overview.json
    │   │   ├── rgw-s3-analytics.json
    │   │   └── smb-overview.json
    │   └── rules.yaml
    ├── cert-manager
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── overview.json
    │   └── rules.yaml
    ├── cilium-enterprise
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── cilium-L3-policy.json
    │   │   ├── cilium-L7-proxy.json
    │   │   ├── cilium-agent-overview.json
    │   │   ├── cilium-agent.json
    │   │   ├── cilium-api.json
    │   │   ├── cilium-bpf.json
    │   │   ├── cilium-conntrack.json
    │   │   ├── cilium-datapath.json
    │   │   ├── cilium-external-fqdn-proxy.json
    │   │   ├── cilium-fqdn-proxy.json
    │   │   ├── cilium-identities.json
    │   │   ├── cilium-kubernetes.json
    │   │   ├── cilium-network.json
    │   │   ├── cilium-nodes.json
    │   │   ├── cilium-operator.json
    │   │   ├── cilium-overview.json
    │   │   ├── cilium-policy.json
    │   │   ├── cilium-resource-utilization.json
    │   │   ├── hubble-overview.json
    │   │   └── hubble-timescape.json
    │   └── rules.yaml
    ├── clickhouse
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── clickhouse-latency.json
    │   │   ├── clickhouse-logs.json
    │   │   ├── clickhouse-overview.json
    │   │   └── clickhouse-replica.json
    │   └── rules.yaml
    ├── cloudflare
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── cloudflare-geomap-overview.json
    │   │   ├── cloudflare-worker-overview.json
    │   │   └── cloudflare-zone-overview.json
    │   └── rules.yaml
    ├── confluent-kafka
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── confluent-kafka-overview.json
    │   └── rules.yaml
    ├── consul
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── consul-overview.json
    │   └── rules.yaml
    ├── coredns
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── coredns.json
    │   └── rules.yaml
    ├── cortex
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── alertmanager.json
    │   │   ├── cortex-compactor-resources.json
    │   │   ├── cortex-compactor.json
    │   │   ├── cortex-config.json
    │   │   ├── cortex-object-store.json
    │   │   ├── cortex-queries.json
    │   │   ├── cortex-reads.json
    │   │   ├── cortex-rollout-progress.json
    │   │   ├── cortex-scaling.json
    │   │   ├── cortex-slow-queries.json
    │   │   ├── cortex-writes.json
    │   │   └── ruler.json
    │   └── rules.yaml
    ├── couchbase
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── couchbase-bucket-overview.json
    │   │   ├── couchbase-cluster-overview.json
    │   │   └── couchbase-node-overview.json
    │   └── rules.yaml
    ├── discourse
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── discourse-jobs.json
    │   │   └── discourse-overview.json
    │   └── rules.yaml
    ├── django
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── django-overview.json
    │   │   ├── django-requests-by-view.json
    │   │   └── django-requests-overview.json
    │   └── rules.yaml
    ├── docker
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── docker.json
    │   └── rules.yaml
    ├── elasticsearch
    │   ├── alerts.yaml
    │   └── rules.yaml
    ├── envoy
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── envoy-overview.json
    │   └── rules.yaml
    ├── etcd
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── etcd.json
    │   └── rules.yaml
    ├── f5-bigip
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── bigip-cluster-overview.json
    │   │   ├── bigip-node-overview.json
    │   │   ├── bigip-pool-overview.json
    │   │   └── bigip-virtual-server-overview.json
    │   └── rules.yaml
    ├── gitea
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── gitea-overview.json
    │   └── rules.yaml
    ├── gitlab
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── gitlab-overview.json
    │   └── rules.yaml
    ├── gluster
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── k8s-storage-resources-glusterfs-pv.json
    │   └── rules.yaml
    ├── go-runtime
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── go-runtime.json
    │   └── rules.yaml
    ├── grafana
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── grafana-overview.json
    │   └── rules.yaml
    ├── haproxy
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── haproxy-backend.json
    │   │   ├── haproxy-frontend.json
    │   │   ├── haproxy-overview.json
    │   │   └── haproxy-server.json
    │   └── rules.yaml
    ├── harbor
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── harbor-overview.json
    │   └── rules.yaml
    ├── hass
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── hass.json
    │   └── rules.yaml
    ├── ibm-mq
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── ibm-mq-cluster-overview.json
    │   │   ├── ibm-mq-queue-manager-overview.json
    │   │   ├── ibm-mq-queue-overview.json
    │   │   └── ibm-mq-topics-overview.json
    │   └── rules.yaml
    ├── influxdb
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── influxdb-cluster-overview.json
    │   │   ├── influxdb-instance-overview.json
    │   │   └── influxdb-logs.json
    │   └── rules.yaml
    ├── ingress-nginx-mixin
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── ingress-nginx-overview.json
    │   │   └── ingress-nginx-request-handling-performance.json
    │   └── rules.yaml
    ├── istio
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── logs
    │   │   ├── overview
    │   │   ├── servicesOverview
    │   │   └── workloadsOverview
    │   └── rules.yaml
    ├── jaeger
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── jaeger-read.json
    │   │   └── jaeger-write.json
    │   └── rules.yaml
    ├── jenkins
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── jenkins.json
    │   └── rules.yaml
    ├── jira
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── jira-overview.json
    │   └── rules.yaml
    ├── jvm
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── jvm-dashboard.json
    │   └── rules.yaml
    ├── kafka
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── connect-overview.json
    │   │   ├── kafka-ksqldb-overview.json
    │   │   ├── kafka-overview-dashboard.json
    │   │   ├── kafka-topic-dashboard.json
    │   │   ├── schema-registry-overview.json
    │   │   └── zookeeper-overview.json
    │   └── rules.yaml
    ├── kube-cockroachdb
    │   ├── alerts.yaml
    │   └── rules.yaml
    ├── kube-state-metrics
    │   ├── alerts.yaml
    │   └── rules.yaml
    ├── kubernetes-autoscaling
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── kubernetes-autoscaling-mixin-ca.json
    │   │   ├── kubernetes-autoscaling-mixin-hpa.json
    │   │   ├── kubernetes-autoscaling-mixin-karpenter-act.json
    │   │   ├── kubernetes-autoscaling-mixin-karpenter-over.json
    │   │   ├── kubernetes-autoscaling-mixin-karpenter-perf.json
    │   │   ├── kubernetes-autoscaling-mixin-pdb.json
    │   │   └── kubernetes-autoscaling-mixin-vpa.json
    │   └── rules.yaml
    ├── kubernetes
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apiserver.json
    │   │   ├── cluster-total.json
    │   │   ├── controller-manager.json
    │   │   ├── k8s-resources-cluster.json
    │   │   ├── k8s-resources-namespace.json
    │   │   ├── k8s-resources-node.json
    │   │   ├── k8s-resources-pod.json
    │   │   ├── k8s-resources-windows-cluster.json
    │   │   ├── k8s-resources-windows-namespace.json
    │   │   ├── k8s-resources-windows-pod.json
    │   │   ├── k8s-resources-workload.json
    │   │   ├── k8s-resources-workloads-namespace.json
    │   │   ├── k8s-windows-cluster-rsrc-use.json
    │   │   ├── k8s-windows-node-rsrc-use.json
    │   │   ├── kubelet.json
    │   │   ├── namespace-by-pod.json
    │   │   ├── namespace-by-workload.json
    │   │   ├── persistentvolumesusage.json
    │   │   ├── pod-total.json
    │   │   ├── proxy.json
    │   │   ├── scheduler.json
    │   │   └── workload-total.json
    │   └── rules.yaml
    ├── loki
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── loki-bloom-build.json
    │   │   ├── loki-bloom-gateway.json
    │   │   ├── loki-chunks.json
    │   │   ├── loki-deletion.json
    │   │   ├── loki-logs.json
    │   │   ├── loki-mixin-recording-rules.json
    │   │   ├── loki-operational.json
    │   │   ├── loki-reads-resources.json
    │   │   ├── loki-reads.json
    │   │   ├── loki-retention.json
    │   │   ├── loki-writes-resources.json
    │   │   ├── loki-writes.json
    │   │   └── loki_thanos_object_storage.json
    │   └── rules.yaml
    ├── memcached
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── memcached-overview.json
    │   └── rules.yaml
    ├── microsoft-iis
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── microsoft-iis-applications.json
    │   │   └── microsoft-iis-overview.json
    │   └── rules.yaml
    ├── mongodb-atlas
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── mongodb-atlas-cluster-overview.json
    │   │   ├── mongodb-atlas-elections-overview.json
    │   │   ├── mongodb-atlas-operations-overview.json
    │   │   └── mongodb-atlas-performance-overview.json
    │   └── rules.yaml
    ├── mongodb
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── MongoDB_Cluster.json
    │   │   ├── MongoDB_Instance.json
    │   │   └── MongoDB_ReplicaSet.json
    │   └── rules.yaml
    ├── mysql
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── mysql-overview.json
    │   └── rules.yaml
    ├── nginx
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── nginx-logs.json
    │   │   └── nginx-metrics.json
    │   └── rules.yaml
    ├── node-exporter
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── node-cluster-rsrc-use.json
    │   │   ├── node-rsrc-use.json
    │   │   ├── nodes-aix.json
    │   │   ├── nodes-darwin.json
    │   │   └── nodes.json
    │   └── rules.yaml
    ├── nodejs
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── nodejs-overview.json
    │   └── rules.yaml
    ├── nomad
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── nomad-cluster.json
    │   │   └── nomad-jobs.json
    │   └── rules.yaml
    ├── nsq
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── nsq-instances.json
    │   │   └── nsq-topics.json
    │   └── rules.yaml
    ├── openldap
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── logs
    │   │   └── overview
    │   └── rules.yaml
    ├── opensearch
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── node-overview.json
    │   │   ├── opensearch-cluster-overview.json
    │   │   └── search-and-index-overview.json
    │   └── rules.yaml
    ├── openstack
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── cinder
    │   │   ├── logs
    │   │   ├── neutron
    │   │   ├── nova
    │   │   └── overview
    │   └── rules.yaml
    ├── oracledb
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── oracledb-overview.json
    │   └── rules.yaml
    ├── pgbouncer
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── clusterOverview
    │   │   ├── logs
    │   │   └── overview
    │   └── rules.yaml
    ├── postgres-exporter
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── postgres-overview.json
    │   └── rules.yaml
    ├── presto
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── presto-coordinator.json
    │   │   ├── presto-logs.json
    │   │   ├── presto-overview.json
    │   │   └── presto-worker.json
    │   └── rules.yaml
    ├── prometheus-operator
    │   ├── alerts.yaml
    │   └── rules.yaml
    ├── prometheus
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── prometheus-remote-write.json
    │   │   └── prometheus.json
    │   └── rules.yaml
    ├── promscale
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── apm-dependencies.json
    │   │   ├── apm-home.json
    │   │   ├── apm-service-dependencies-downstream.json
    │   │   ├── apm-service-dependencies-upstream.json
    │   │   ├── apm-service-overview.json
    │   │   └── promscale.json
    │   └── rules.yaml
    ├── promtail
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── promtail.json
    │   └── rules.yaml
    ├── python-runtime
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── python-runtime.json
    │   └── rules.yaml
    ├── rabbitmq
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── erlang-memory-allocators.json
    │   │   └── rabbitmq-overview.json
    │   └── rules.yaml
    ├── rclone
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── rclone.json
    │   └── rules.yaml
    ├── redis
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── redis_overview
    │   └── rules.yaml
    ├── ruby
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── ruby-overview.json
    │   └── rules.yaml
    ├── sap-hana
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── sap-hana-instance-overview.json
    │   │   └── sap-hana-system-overview.json
    │   └── rules.yaml
    ├── sealed-secrets
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── sealed-secrets-controller.json
    │   └── rules.yaml
    ├── snmp
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── snmp-fleet.json
    │   │   ├── snmp-logs.json
    │   │   └── snmp-overview.json
    │   └── rules.yaml
    ├── spark
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── spark-metrics.json
    │   └── rules.yaml
    ├── spinnaker
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── clouddriver.json
    │   │   ├── deck.json
    │   │   ├── echo.json
    │   │   ├── fiat.json
    │   │   ├── front50.json
    │   │   ├── gate.json
    │   │   ├── igor.json
    │   │   ├── orca.json
    │   │   ├── rosco.json
    │   │   ├── spinnaker-application-details.json
    │   │   ├── spinnaker-aws-platform.json
    │   │   ├── spinnaker-google-platform.json
    │   │   ├── spinnaker-key-metrics.json
    │   │   ├── spinnaker-kubernetes-platform.json
    │   │   └── spinnaker-minimalist.json
    │   └── rules.yaml
    ├── spring-boot
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── jvm-dashboard.json
    │   └── rules.yaml
    ├── squid
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── squid-overview.json
    │   └── rules.yaml
    ├── supabase
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── supabase.json
    │   └── rules.yaml
    ├── tensorflow
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── tensorflow-overview.json
    │   └── rules.yaml
    ├── thanos
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── bucket-replicate.json
    │   │   ├── compact.json
    │   │   ├── overview.json
    │   │   ├── query-frontend.json
    │   │   ├── query.json
    │   │   ├── receive.json
    │   │   ├── rule.json
    │   │   ├── sidecar.json
    │   │   └── store.json
    │   └── rules.yaml
    ├── traefik
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── traefikdash.json
    │   └── rules.yaml
    ├── ubnt-edgerouter
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── ubnt-edgrouterx-overview.json
    │   └── rules.yaml
    ├── varnish
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── varnish-overview.json
    │   └── rules.yaml
    ├── vault
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   └── vault.json
    │   └── rules.yaml
    ├── velero
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── clusterOverview
    │   │   ├── logs
    │   │   └── overview
    │   └── rules.yaml
    ├── wildfly
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── wildfly-datasource.json
    │   │   └── wildfly-overview.json
    │   └── rules.yaml
    ├── windows-active-directory
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── activedirectory
    │   │   └── logs
    │   └── rules.yaml
    ├── windows
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── disks
    │   │   ├── fleet
    │   │   ├── logs
    │   │   ├── overview
    │   │   └── system
    │   └── rules.yaml
    ├── wso2-enterprise-integrator
    │   ├── alerts.yaml
    │   ├── dashboards
    │   │   ├── API_Metrics.json
    │   │   ├── Cluster_Metrics.json
    │   │   ├── Inbound_Endpoint_Metrics.json
    │   │   ├── Node_Metrics.json
    │   │   └── Proxy_Service_Metrics.json
    │   └── rules.yaml
    └── wso2-streaming-integrator
    │   ├── alerts.yaml
    │   ├── dashboards
    │       ├── Siddhi_aggregation.json
    │       ├── Siddhi_ondemandquery.json
    │       ├── Siddhi_overall.json
    │       ├── Siddhi_query.json
    │       ├── Siddhi_server.json
    │       ├── Siddhi_sink.json
    │       ├── Siddhi_source.json
    │       ├── Siddhi_stream.json
    │       ├── Siddhi_table.json
    │       ├── StreamingIntegrator_apps.json
    │       └── StreamingIntegrator_overall.json
    │   └── rules.yaml
├── hack
    ├── generate.sh
    ├── go.mod
    ├── go.sum
    └── tools.go
├── mixins.json
├── netlify.toml
└── site
    ├── config.yaml
    ├── content
        ├── MSSQL
        │   └── _index.md
        ├── _index.md
        ├── aerospike
        │   └── _index.md
        ├── alertmanager
        │   └── _index.md
        ├── apache-activemq
        │   └── _index.md
        ├── apache-airflow
        │   └── _index.md
        ├── apache-camel
        │   └── _index.md
        ├── apache-cassandra
        │   └── _index.md
        ├── apache-couchdb
        │   └── _index.md
        ├── apache-hadoop
        │   └── _index.md
        ├── apache-hbase
        │   └── _index.md
        ├── apache-http
        │   └── _index.md
        ├── apache-mesos
        │   └── _index.md
        ├── apache-solr
        │   └── _index.md
        ├── apache-tomcat
        │   └── _index.md
        ├── argo-cd-2
        │   └── _index.md
        ├── argocd
        │   └── _index.md
        ├── asterisk
        │   └── _index.md
        ├── awx
        │   └── _index.md
        ├── blackbox_exporter
        │   └── _index.md
        ├── caddy
        │   └── _index.md
        ├── celery
        │   └── _index.md
        ├── ceph
        │   └── _index.md
        ├── cert-manager
        │   └── _index.md
        ├── cilium-enterprise
        │   └── _index.md
        ├── clickhouse
        │   └── _index.md
        ├── cloudflare
        │   └── _index.md
        ├── confluent-kafka
        │   └── _index.md
        ├── consul
        │   └── _index.md
        ├── coredns
        │   └── _index.md
        ├── cortex
        │   └── _index.md
        ├── couchbase
        │   └── _index.md
        ├── discourse
        │   └── _index.md
        ├── django
        │   └── _index.md
        ├── docker
        │   └── _index.md
        ├── elasticsearch
        │   └── _index.md
        ├── envoy
        │   └── _index.md
        ├── etcd
        │   └── _index.md
        ├── f5-bigip
        │   └── _index.md
        ├── gitea
        │   └── _index.md
        ├── gitlab
        │   └── _index.md
        ├── gluster
        │   └── _index.md
        ├── go-runtime
        │   └── _index.md
        ├── grafana
        │   └── _index.md
        ├── haproxy
        │   └── _index.md
        ├── harbor
        │   └── _index.md
        ├── hass
        │   └── _index.md
        ├── ibm-mq
        │   └── _index.md
        ├── influxdb
        │   └── _index.md
        ├── ingress-nginx-mixin
        │   └── _index.md
        ├── istio
        │   └── _index.md
        ├── jaeger
        │   └── _index.md
        ├── jenkins
        │   └── _index.md
        ├── jira
        │   └── _index.md
        ├── jvm
        │   └── _index.md
        ├── kafka
        │   └── _index.md
        ├── kube-cockroachdb
        │   └── _index.md
        ├── kube-state-metrics
        │   └── _index.md
        ├── kubernetes-autoscaling
        │   └── _index.md
        ├── kubernetes
        │   └── _index.md
        ├── loki
        │   └── _index.md
        ├── memcached
        │   └── _index.md
        ├── microsoft-iis
        │   └── _index.md
        ├── mongodb-atlas
        │   └── _index.md
        ├── mongodb
        │   └── _index.md
        ├── mysql
        │   └── _index.md
        ├── nginx
        │   └── _index.md
        ├── node-exporter
        │   └── _index.md
        ├── nodejs
        │   └── _index.md
        ├── nomad
        │   └── _index.md
        ├── nsq
        │   └── _index.md
        ├── openldap
        │   └── _index.md
        ├── opensearch
        │   └── _index.md
        ├── openstack
        │   └── _index.md
        ├── oracledb
        │   └── _index.md
        ├── pgbouncer
        │   └── _index.md
        ├── postgres-exporter
        │   └── _index.md
        ├── presto
        │   └── _index.md
        ├── prometheus-operator
        │   └── _index.md
        ├── prometheus
        │   └── _index.md
        ├── promscale
        │   └── _index.md
        ├── promtail
        │   └── _index.md
        ├── python-runtime
        │   └── _index.md
        ├── rabbitmq
        │   └── _index.md
        ├── rclone
        │   └── _index.md
        ├── redis
        │   └── _index.md
        ├── ruby
        │   └── _index.md
        ├── sap-hana
        │   └── _index.md
        ├── sealed-secrets
        │   └── _index.md
        ├── snmp
        │   └── _index.md
        ├── spark
        │   └── _index.md
        ├── spinnaker
        │   └── _index.md
        ├── spring-boot
        │   └── _index.md
        ├── squid
        │   └── _index.md
        ├── supabase
        │   └── _index.md
        ├── tensorflow
        │   └── _index.md
        ├── thanos
        │   └── _index.md
        ├── traefik
        │   └── _index.md
        ├── ubnt-edgerouter
        │   └── _index.md
        ├── varnish
        │   └── _index.md
        ├── vault
        │   └── _index.md
        ├── velero
        │   └── _index.md
        ├── wildfly
        │   └── _index.md
        ├── windows-active-directory
        │   └── _index.md
        ├── windows
        │   └── _index.md
        ├── wso2-enterprise-integrator
        │   └── _index.md
        └── wso2-streaming-integrator
        │   └── _index.md
    ├── layouts
        └── _default
        │   └── baseof.html
    └── static
        └── mixins.json


/.github/workflows/cron.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Daily content regeneration
 3 | 
 4 | on:
 5 |   schedule:
 6 |     - cron:  '3 3 * * *'
 7 | 
 8 | jobs:
 9 |   update:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |     - uses: actions/checkout@v2
13 |     - uses: actions/setup-go@v5
14 |       with:
15 |         go-version: '^1.23'
16 |     - run: make generate
17 |     - uses: EndBug/add-and-commit@v4
18 |       with:
19 |         add: 'assets/ site/content/'
20 |         author_name: "github-actions[bot]"
21 |         author_email: "github-actions@users.noreply.github.com"
22 |         message: 'assets,site/content: daily assets regeneration'
23 |       env:
24 |         # This is necessary in order to push a commit to the repo
25 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
26 | 
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Pull request CI workflow
 3 | 
 4 | on:
 5 |   pull_request:
 6 |     branches:
 7 |     - master
 8 |   push:
 9 |     branches:
10 |     - master
11 | 
12 | jobs:
13 |   generate:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - uses: actions/setup-go@v5
18 |       with:
19 |         go-version: '^1.23'
20 |     - run: make generate
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tmp/
2 | site/public
3 | site/resources
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "site/themes/ace-documentation"]
2 | 	path = site/themes/ace-documentation
3 | 	url = https://github.com/vantagedesign/ace-documentation.git
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/bash -o pipefail
 2 | 
 3 | BIN_DIR?=$(shell pwd)/tmp/bin
 4 | 
 5 | JB_BIN=$(BIN_DIR)/jb
 6 | GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml
 7 | JSONNET_BIN=$(BIN_DIR)/jsonnet
 8 | TOOLING=$(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN)
 9 | 
10 | .PHONY: all
11 | all: generate
12 | 
13 | .PHONY: generate
14 | generate: $(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN)
15 | 	./hack/generate.sh
16 | 
17 | $(BIN_DIR):
18 | 	mkdir -p $(BIN_DIR)
19 | 
20 | $(TOOLING): $(BIN_DIR)
21 | 	@echo Installing tools from hack/tools.go
22 | 	@cd hack && go list -mod=mod -tags tools -e -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Monitoring Mixins site
 2 | 
 3 | ## Adding new mixin
 4 | 
 5 | 0. Install [required software](#requirements)
 6 | 1. Add new mixin to [mixins.json](mixins.json) file
 7 | 2. Run `make`
 8 | 
 9 | ## Requirements
10 | 
11 | - jq
12 | - make
13 | - git
14 | - golang
15 | 


--------------------------------------------------------------------------------
/assets/MSSQL/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: MSSQLAlerts
 3 |   rules:
 4 |   - alert: MSSQLHighNumberOfDeadlocks
 5 |     annotations:
 6 |       description: '{{ printf "%.2f" $value }} deadlocks have occurred over the last
 7 |         5 minutes on {{$labels.instance}}, which is above threshold of 10 deadlocks.'
 8 |       summary: There are deadlocks ocurring in the database.
 9 |     expr: |
10 |       increase(mssql_deadlocks_total{}[5m]) > 10
11 |     for: 5m
12 |     labels:
13 |       severity: warning
14 |   - alert: MSSQLModerateReadStallTime
15 |     annotations:
16 |       description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on
17 |         {{$labels.instance}}, which is above threshold of 200ms.'
18 |       summary: There is a moderate amount of IO stall for database reads.
19 |     expr: |
20 |       1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 200
21 |     for: 5m
22 |     labels:
23 |       severity: warning
24 |   - alert: MSSQLHighReadStallTime
25 |     annotations:
26 |       description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on
27 |         {{$labels.instance}}, which is above threshold of 400ms.'
28 |       summary: There is a high amount of IO stall for database reads.
29 |     expr: |
30 |       1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 400
31 |     for: 5m
32 |     labels:
33 |       severity: critical
34 |   - alert: MSSQLModerateWriteStallTime
35 |     annotations:
36 |       description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on
37 |         {{$labels.instance}}, which is above threshold of 200ms.'
38 |       summary: There is a moderate amount of IO stall for database writes.
39 |     expr: |
40 |       1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 200
41 |     for: 5m
42 |     labels:
43 |       severity: warning
44 |   - alert: MSSQLHighWriteStallTime
45 |     annotations:
46 |       description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on
47 |         {{$labels.instance}}, which is above threshold of 400ms.'
48 |       summary: There is a high amount of IO stall for database writes.
49 |     expr: |
50 |       1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 400
51 |     for: 5m
52 |     labels:
53 |       severity: critical
54 | 


--------------------------------------------------------------------------------
/assets/MSSQL/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/aerospike/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/alertmanager/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-activemq/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: apache-activemq-alerts
 3 |   rules:
 4 |   - alert: ApacheActiveMQHighTopicMemoryUsage
 5 |     annotations:
 6 |       description: '{{ printf "%.0f" $value }} percent of memory used by topics on
 7 |         {{$labels.instance}} in cluster {{$labels.activemq_cluster}}, which is above
 8 |         the threshold of 70 percent.'
 9 |       summary: Topic destination memory usage is high, which may result in a reduction
10 |         of the rate at which producers send messages.
11 |     expr: |
12 |       sum without (destination) (activemq_topic_memory_percent_usage{destination!~"ActiveMQ.Advisory.*"}) > 70
13 |     for: 5m
14 |     labels:
15 |       severity: warning
16 |   - alert: ApacheActiveMQHighQueueMemoryUsage
17 |     annotations:
18 |       description: '{{ printf "%.0f" $value }} percent of memory used by queues on
19 |         {{$labels.instance}} in cluster {{$labels.activemq_cluster}}, which is above
20 |         the threshold of 70 percent.'
21 |       summary: Queue destination memory usage is high, which may result in a reduction
22 |         of the rate at which producers send messages.
23 |     expr: |
24 |       sum without (destination) (activemq_queue_memory_percent_usage) > 70
25 |     for: 5m
26 |     labels:
27 |       severity: warning
28 |   - alert: ApacheActiveMQHighStoreMemoryUsage
29 |     annotations:
30 |       description: '{{ printf "%.0f" $value }} percent of store memory used on {{$labels.instance}}
31 |         in cluster {{$labels.activemq_cluster}}, which is above the threshold of 70
32 |         percent.'
33 |       summary: Store memory usage is high, which may result in producers unable to
34 |         send messages.
35 |     expr: |
36 |       activemq_store_usage_ratio > 70
37 |     for: 5m
38 |     labels:
39 |       severity: warning
40 |   - alert: ApacheActiveMQHighTemporaryMemoryUsage
41 |     annotations:
42 |       description: '{{ printf "%.0f" $value }} percent of temporary memory used on
43 |         {{$labels.instance}} in cluster {{$labels.activemq_cluster}}, which is above
44 |         the threshold of 70 percent.'
45 |       summary: Temporary memory usage is high, which may result in saturation of messaging
46 |         throughput.
47 |     expr: |
48 |       activemq_temp_usage_ratio > 70
49 |     for: 5m
50 |     labels:
51 |       severity: warning
52 | 


--------------------------------------------------------------------------------
/assets/apache-activemq/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-airflow/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: apache-airflow
 3 |   rules:
 4 |   - alert: ApacheAirflowStarvingPoolTasks
 5 |     annotations:
 6 |       description: |
 7 |         The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0.
 8 |       summary: There are starved tasks detected in the Apache Airflow pool.
 9 |     expr: |
10 |       airflow_pool_starving_tasks > 0
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 |   - alert: ApacheAirflowDAGScheduleDelayWarningLevel
15 |     annotations:
16 |       description: |
17 |         The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10.
18 |       summary: The delay in DAG schedule time to DAG run time has reached the warning
19 |         threshold.
20 |     expr: |
21 |       increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10
22 |     for: 1m
23 |     labels:
24 |       severity: warning
25 |   - alert: ApacheAirflowDAGScheduleDelayCriticalLevel
26 |     annotations:
27 |       description: |
28 |         The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60.
29 |       summary: The delay in DAG schedule time to DAG run time has reached the critical
30 |         threshold.
31 |     expr: |
32 |       increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60
33 |     for: 1m
34 |     labels:
35 |       severity: critical
36 |   - alert: ApacheAirflowDAGFailures
37 |     annotations:
38 |       description: |
39 |         The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0.
40 |       summary: There have been DAG failures detected.
41 |     expr: |
42 |       increase(airflow_dagrun_duration_failed_count[5m]) > 0
43 |     for: 1m
44 |     labels:
45 |       severity: critical
46 | 


--------------------------------------------------------------------------------
/assets/apache-airflow/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-camel/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-camel/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-cassandra/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-couchdb/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-hadoop/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-hbase/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: apache-hbase-alerts
 3 |   rules:
 4 |   - alert: HBaseHighHeapMemUsage
 5 |     annotations:
 6 |       description: The heap memory usage for the JVM on instance {{$labels.instance}}
 7 |         in cluster {{$labels.hbase_cluster}} is {{printf "%.0f" $value}} percent,
 8 |         which is above the threshold of 80 percent
 9 |       summary: There is a limited amount of heap memory available to the JVM.
10 |     expr: |
11 |       100 * sum without(context, hostname, processname) (jvm_metrics_mem_heap_used_m{job="integrations/apache-hbase"} / clamp_min(jvm_metrics_mem_heap_committed_m{job="integrations/apache-hbase"}, 1))  > 80
12 |     for: 5m
13 |     labels:
14 |       severity: warning
15 |   - alert: HBaseDeadRegionServer
16 |     annotations:
17 |       description: '{{$value}} RegionServer(s) in cluster {{$labels.hbase_cluster}}
18 |         are unresponsive, which is above the threshold of 0. The name(s) of the dead
19 |         RegionServer(s) are {{$labels.deadregionservers}}'
20 |       summary: One or more RegionServer(s) has become unresponsive.
21 |     expr: |
22 |       server_num_dead_region_servers > 0
23 |     for: 5m
24 |     labels:
25 |       severity: warning
26 |   - alert: HBaseOldRegionsInTransition
27 |     annotations:
28 |       description: '{{printf "%.0f" $value}} percent of RegionServers in transition
29 |         in cluster {{$labels.hbase_cluster}} are transitioning for longer than expected,
30 |         which is above the threshold of 50 percent'
31 |       summary: RegionServers are in transition for longer than expected.
32 |     expr: |
33 |       100 * assignment_manager_rit_count_over_threshold / clamp_min(assignment_manager_rit_count, 1) > 50
34 |     for: 5m
35 |     labels:
36 |       severity: warning
37 |   - alert: HBaseHighMasterAuthFailRate
38 |     annotations:
39 |       description: '{{printf "%.0f" $value}} percent of authentication attempts to
40 |         the master are failing in cluster {{$labels.hbase_cluster}}, which is above
41 |         the threshold of 35 percent'
42 |       summary: A high percentage of authentication attempts to the master are failing.
43 |     expr: |
44 |       100 * rate(master_authentication_failures[5m]) / (clamp_min(rate(master_authentication_successes[5m]), 1) + clamp_min(rate(master_authentication_failures[5m]), 1)) > 35
45 |     for: 5m
46 |     labels:
47 |       severity: warning
48 |   - alert: HBaseHighRSAuthFailRate
49 |     annotations:
50 |       description: '{{printf "%.0f" $value}} percent of authentication attempts to
51 |         the RegionServer {{$labels.instance}} are failing in cluster {{$labels.hbase_cluster}},
52 |         which is above the threshold of 35 percent'
53 |       summary: A high percentage of authentication attempts to a RegionServer are
54 |         failing.
55 |     expr: |
56 |       100 * rate(region_server_authentication_failures[5m]) / (clamp_min(rate(region_server_authentication_successes[5m]), 1) + clamp_min(rate(region_server_authentication_failures[5m]), 1)) > 35
57 |     for: 5m
58 |     labels:
59 |       severity: warning
60 | 


--------------------------------------------------------------------------------
/assets/apache-hbase/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-http/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: apache-http
 3 |   rules:
 4 |   - alert: ApacheDown
 5 |     annotations:
 6 |       description: Apache is down on {{ $labels.instance }}.
 7 |       summary: Apache is down.
 8 |     expr: apache_up == 0
 9 |     for: 5m
10 |     labels:
11 |       severity: warning
12 |   - alert: ApacheRestart
13 |     annotations:
14 |       description: Apache has just been restarted on {{ $labels.instance }}.
15 |       summary: Apache restart.
16 |     expr: apache_uptime_seconds_total / 60 < 1
17 |     for: "0"
18 |     labels:
19 |       severity: info
20 |   - alert: ApacheWorkersLoad
21 |     annotations:
22 |       description: |
23 |         Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}.
24 |         The current value is {{ $value }}%.
25 |       summary: Apache workers load is too high.
26 |     expr: |
27 |       (sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80
28 |     for: 15m
29 |     labels:
30 |       severity: warning
31 |   - alert: ApacheResponseTimeTooHigh
32 |     annotations:
33 |       description: |
34 |         Apache average response time is above the threshold of 5000 ms on {{ $labels.instance }}.
35 |         The current value is {{ $value }} ms.
36 |       summary: Apache response time is too high.
37 |     expr: |
38 |       increase(apache_duration_ms_total[5m])/increase(apache_accesses_total[5m]) > 5000
39 |     for: 15m
40 |     labels:
41 |       severity: warning
42 | 


--------------------------------------------------------------------------------
/assets/apache-http/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-mesos/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: apache-mesos
 3 |   rules:
 4 |   - alert: ApacheMesosHighMemoryUsage
 5 |     annotations:
 6 |       description: '{{ printf "%.0f" $value }} percent memory usage on {{$labels.mesos_cluster}},
 7 |         which is above the threshold of 90.'
 8 |       summary: There is a high memory usage for the cluster.
 9 |     expr: |
10 |       min without(instance, job, type) (mesos_master_mem{type="percent"}) > 90
11 |     for: 5m
12 |     labels:
13 |       severity: warning
14 |   - alert: ApacheMesosHighDiskUsage
15 |     annotations:
16 |       description: '{{ printf "%.0f" $value }} percent disk usage on {{$labels.mesos_cluster}},
17 |         which is above the threshold of 90.'
18 |       summary: There is a high disk usage for the cluster.
19 |     expr: |
20 |       min without(instance, job, type) (mesos_master_disk{type="percent"}) > 90
21 |     for: 5m
22 |     labels:
23 |       severity: critical
24 |   - alert: ApacheMesosUnreachableTasks
25 |     annotations:
26 |       description: '{{ printf "%.0f" $value }} unreachable tasks on {{$labels.mesos_cluster}},
27 |         which is above the threshold of 3.'
28 |       summary: There are an unusually high number of unreachable tasks.
29 |     expr: |
30 |       max without(instance, job, state) (mesos_master_task_states_current{state="unreachable"}) > 3
31 |     for: 5m
32 |     labels:
33 |       severity: warning
34 |   - alert: ApacheMesosNoLeaderElected
35 |     annotations:
36 |       description: There is no cluster coordinator on {{$labels.mesos_cluster}}.
37 |       summary: There is currently no cluster coordinator.
38 |     expr: |
39 |       max without(instance, job) (mesos_master_elected) == 0
40 |     for: 1m
41 |     labels:
42 |       severity: critical
43 |   - alert: ApacheMesosInactiveAgents
44 |     annotations:
45 |       description: '{{ printf "%.0f" $value }} inactive agent clients over the last
46 |         5m which is above the threshold of 1.'
47 |       summary: There are currently inactive agent clients.
48 |     expr: |
49 |       max without(instance, job, state) (mesos_master_slaves_state{state=~"connected_inactive|disconnected_inactive"}) > 1
50 |     for: 5m
51 |     labels:
52 |       severity: warning
53 | 


--------------------------------------------------------------------------------
/assets/apache-mesos/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-solr/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/apache-tomcat/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: ApacheTomcatAlerts
 3 |   rules:
 4 |   - alert: ApacheTomcatAlertsHighCpuUsage
 5 |     annotations:
 6 |       description: The CPU usage has been at {{ printf "%.0f" $value }} percent over
 7 |         the last 5 minutes on {{$labels.instance}}, which is above the threshold of
 8 |         80 percent.
 9 |       summary: The instance has a CPU usage higher than the configured threshold.
10 |     expr: |
11 |       sum by (job,instance) (jvm_process_cpu_load{job="integrations/tomcat"}) > 80
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: ApacheTomcatAlertsHighMemoryUsage
16 |     annotations:
17 |       description: The memory usage has been at {{ printf "%.0f" $value }} percent
18 |         over the last 5 minutes on {{$labels.instance}}, which is above the threshold
19 |         of 80 percent.
20 |       summary: The instance has a higher memory usage than the configured threshold.
21 |     expr: |
22 |       sum(jvm_memory_usage_used_bytes{job="integrations/tomcat"}) by (job,instance) / sum(jvm_physical_memory_bytes{job="integrations/tomcat"}) by (job,instance) * 100 > 80
23 |     for: 5m
24 |     labels:
25 |       severity: critical
26 |   - alert: ApacheTomcatAlertsHighRequestErrorPercent
27 |     annotations:
28 |       description: The percentage of request errors has been at {{ printf "%.0f" $value
29 |         }} percent over the last 5 minutes on {{$labels.instance}}, which is above
30 |         the threshold of 5 percent.
31 |       summary: There are a high number of request errors.
32 |     expr: |
33 |       sum by (job,instance) (increase(tomcat_errorcount_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m]) * 100) > 5
34 |     for: 5m
35 |     labels:
36 |       severity: critical
37 |   - alert: ApacheTomcatAlertsModeratelyHighProcessingTime
38 |     annotations:
39 |       description: The processing time has been at {{ printf "%.0f" $value }}ms over
40 |         the last 5 minutes on {{$labels.instance}}, which is above the threshold of
41 |         300ms.
42 |       summary: The processing time has been moderately high.
43 |     expr: |
44 |       sum by (job,instance) (increase(tomcat_processingtime_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m])) > 300
45 |     for: 5m
46 |     labels:
47 |       severity: warning
48 | 


--------------------------------------------------------------------------------
/assets/apache-tomcat/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/argo-cd-2/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/argocd/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: ArgoCD
 3 |   rules:
 4 |   - alert: ArgoAppOutOfSync
 5 |     annotations:
 6 |       description: Application {{ $labels.name }} has sync status as {{ $labels.sync_status
 7 |         }}.
 8 |       summary: Application is OutOfSync.
 9 |     expr: argocd_app_info{sync_status="OutOfSync"} == 1
10 |     for: 1m
11 |     labels:
12 |       severity: warning
13 |   - alert: ArgoAppSyncFailed
14 |     annotations:
15 |       description: Application {{ $labels.name }} has sync phase as {{ $labels.phase
16 |         }}.
17 |       summary: Application Sync Failed.
18 |     expr: argocd_app_sync_total{phase!="Succeeded"} == 1
19 |     for: 1m
20 |     labels:
21 |       severity: warning
22 |   - alert: ArgoAppMissing
23 |     annotations:
24 |       description: "ArgoCD has not reported any applications data for the past 15
25 |         minutes which means that it must be down or not functioning properly.  \n"
26 |       summary: No reported applications in ArgoCD.
27 |     expr: absent(argocd_app_info)
28 |     for: 15m
29 |     labels:
30 |       severity: critical
31 | 


--------------------------------------------------------------------------------
/assets/argocd/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/asterisk/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: AsteriskAlerts
 3 |   rules:
 4 |   - alert: AsteriskRestarted
 5 |     annotations:
 6 |       description: |-
 7 |         Asterisk instance restarted in the last minute
 8 |           VALUE = {{ $value }}
 9 |           LABELS = {{ $labels }}
10 |       summary: Asterisk instance restarted in the last minute.
11 |     expr: asterisk_core_uptime_seconds < 60
12 |     for: 5s
13 |     labels:
14 |       severity: critical
15 |   - alert: AsteriskReloaded
16 |     annotations:
17 |       description: |-
18 |         Asterisk instance reloaded in the last minute
19 |           VALUE = {{ $value }}
20 |           LABELS = {{ $labels }}
21 |       summary: Asterisk instance reloaded in the last minute.
22 |     expr: asterisk_core_last_reload_seconds < 60
23 |     for: 5s
24 |     labels:
25 |       severity: warning
26 |   - alert: AsteriskHighScrapeTime
27 |     annotations:
28 |       description: |-
29 |         Asterisk instance core high scrape time (Possible system performance degradation)
30 |           VALUE = {{ $value }}
31 |           LABELS = {{ $labels }}
32 |       summary: Asterisk instance core high scrape time.
33 |     expr: asterisk_core_scrape_time_ms > 100
34 |     for: 10s
35 |     labels:
36 |       severity: critical
37 |   - alert: AsteriskHighActiveCallsCount
38 |     annotations:
39 |       description: |-
40 |         Asterisk high active call count
41 |           VALUE = {{ $value }}
42 |           LABELS = {{ $labels }}
43 |       summary: Asterisk high active call count.
44 |     expr: asterisk_calls_count > 100
45 |     for: 10s
46 |     labels:
47 |       severity: warning
48 | 


--------------------------------------------------------------------------------
/assets/asterisk/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/awx/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/awx/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/blackbox_exporter/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: blackbox-exporter.rules
 3 |   rules:
 4 |   - alert: BlackboxProbeFailed
 5 |     annotations:
 6 |       dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{
 7 |         $labels.instance }}
 8 |       description: The probe failed for the instance {{ $labels.instance }}.
 9 |       summary: Probe has failed for the past 1m interval.
10 |     expr: |
11 |       probe_success{job="blackbox-exporter"} == 0
12 |     for: 1m
13 |     labels:
14 |       severity: critical
15 |   - alert: BlackboxLowUptime30d
16 |     annotations:
17 |       dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{
18 |         $labels.instance }}
19 |       description: The probe has a lower uptime than 99.9% the last 30 days for the
20 |         instance {{ $labels.instance }}.
21 |       summary: Probe uptime is lower than 99.9% for the last 30 days.
22 |     expr: |
23 |       avg_over_time(probe_success{job="blackbox-exporter"}[30d]) * 100 < 99.900000000000006
24 |     labels:
25 |       severity: info
26 |   - alert: BlackboxSslCertificateWillExpireSoon
27 |     annotations:
28 |       dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{
29 |         $labels.instance }}
30 |       description: |
31 |         The SSL certificate of the instance {{ $labels.instance }} is expiring within 21 days.
32 |         Actual time left: {{ $value | humanizeDuration }}.
33 |       summary: SSL certificate will expire soon.
34 |     expr: |
35 |       probe_ssl_earliest_cert_expiry{job="blackbox-exporter"} - time() < 21 * 24 * 3600
36 |     labels:
37 |       severity: warning
38 | 


--------------------------------------------------------------------------------
/assets/blackbox_exporter/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/caddy/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/caddy/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/celery/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: celery
 3 |   rules:
 4 |   - alert: CeleryTaskHighFailRate
 5 |     annotations:
 6 |       dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{
 7 |         $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name
 8 |         }}
 9 |       description: More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name
10 |         }}/{{ $labels.name }} the past 10m.
11 |       summary: Celery high task fail rate.
12 |     expr: |
13 |       sum(
14 |         increase(
15 |           celery_task_failed_total{
16 |             job=~".*celery.*",
17 |             queue_name!~"None",
18 |             name!~"None"
19 |           }[10m]
20 |         )
21 |       )  by (job, namespace, queue_name, name)
22 |       /
23 |       (
24 |         sum(
25 |           increase(
26 |             celery_task_failed_total{
27 |               job=~".*celery.*",
28 |               queue_name!~"None",
29 |               name!~"None"
30 |             }[10m]
31 |           )
32 |         )  by (job, namespace, queue_name, name)
33 |         +
34 |         sum(
35 |           increase(
36 |             celery_task_succeeded_total{
37 |               job=~".*celery.*",
38 |               queue_name!~"None",
39 |               name!~"None"
40 |             }[10m]
41 |           )
42 |         )  by (job, namespace, queue_name, name)
43 |       )
44 |       * 100 > 5
45 |     for: 1m
46 |     labels:
47 |       severity: warning
48 |   - alert: CeleryHighQueueLength
49 |     annotations:
50 |       dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
51 |         $labels.job }}&var-queue_name={{ $labels.queue_name }}
52 |       description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name
53 |         }} the past 20m.
54 |       summary: Celery high queue length.
55 |     expr: |
56 |       sum(
57 |         celery_queue_length{
58 |           job=~".*celery.*",
59 |           queue_name!~"None"
60 |         }
61 |       )  by (job, namespace, queue_name)
62 |       > 100
63 |     for: 20m
64 |     labels:
65 |       severity: warning
66 |   - alert: CeleryWorkerDown
67 |     annotations:
68 |       dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
69 |         $labels.job }}
70 |       description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline.
71 |       summary: A Celery worker is offline.
72 |     expr: |
73 |       celery_worker_up{job=~".*celery.*"} == 0
74 |     for: 15m
75 |     labels:
76 |       severity: warning
77 | 


--------------------------------------------------------------------------------
/assets/celery/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ceph/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/cert-manager/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: cert-manager
 3 |   rules:
 4 |   - alert: CertManagerAbsent
 5 |     annotations:
 6 |       description: New certificates will not be able to be minted, and existing ones
 7 |         can't be renewed until cert-manager is back.
 8 |       runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent
 9 |       summary: Cert Manager has disappeared from Prometheus service discovery.
10 |     expr: absent(up{job="cert-manager"})
11 |     for: 10m
12 |     labels:
13 |       severity: critical
14 | - name: certificates
15 |   rules:
16 |   - alert: CertManagerCertExpirySoon
17 |     annotations:
18 |       dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
19 |       description: The domain that this cert covers will be unavailable after {{ $value
20 |         | humanizeDuration }}. Clients using endpoints that this cert protects will
21 |         start to fail in {{ $value | humanizeDuration }}.
22 |       runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon
23 |       summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from
24 |         expiry, it should have renewed over a week ago.
25 |     expr: |
26 |       avg by (exported_namespace, namespace, name) (
27 |         certmanager_certificate_expiration_timestamp_seconds - time()
28 |       ) < (21 * 24 * 3600) # 21 days in seconds
29 |     for: 1h
30 |     labels:
31 |       severity: warning
32 |   - alert: CertManagerCertNotReady
33 |     annotations:
34 |       dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
35 |       description: This certificate has not been ready to serve traffic for at least
36 |         10m. If the cert is being renewed or there is another valid cert, the ingress
37 |         controller _may_ be able to serve that instead.
38 |       runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready
39 |       summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
40 |     expr: |
41 |       max by (name, exported_namespace, namespace, condition) (
42 |         certmanager_certificate_ready_status{condition!="True"} == 1
43 |       )
44 |     for: 10m
45 |     labels:
46 |       severity: critical
47 |   - alert: CertManagerHittingRateLimits
48 |     annotations:
49 |       dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
50 |       description: Depending on the rate limit, cert-manager may be unable to generate
51 |         certificates for up to a week.
52 |       runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits
53 |       summary: Cert manager hitting LetsEncrypt rate limits.
54 |     expr: |
55 |       sum by (host) (
56 |         rate(certmanager_http_acme_client_request_count{status="429"}[5m])
57 |       ) > 0
58 |     for: 5m
59 |     labels:
60 |       severity: critical
61 | 


--------------------------------------------------------------------------------
/assets/cert-manager/rules.yaml:
--------------------------------------------------------------------------------
1 | groups: []
2 | 


--------------------------------------------------------------------------------
/assets/cilium-enterprise/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/clickhouse/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: ClickHouseAlerts
 3 |   rules:
 4 |   - alert: ClickHouseReplicationQueueBackingUp
 5 |     annotations:
 6 |       description: |
 7 |         ClickHouse replication tasks are processing slower than expected on {{ $labels.instance }} causing replication queue size to back up at {{ $value }} exceeding the threshold value of 99.
 8 |       summary: ClickHouse replica max queue size backing up.
 9 |     expr: |
10 |       ClickHouseAsyncMetrics_ReplicasMaxQueueSize > 99
11 |     for: 5m
12 |     keep_firing_for: 5m
13 |     labels:
14 |       severity: warning
15 |   - alert: ClickHouseRejectedInserts
16 |     annotations:
17 |       description: ClickHouse inserts are being rejected on {{ $labels.instance }}
18 |         as items are being inserted faster than ClickHouse is able to merge them.
19 |       summary: ClickHouse has too many rejected inserts.
20 |     expr: ClickHouseProfileEvents_RejectedInserts > 1
21 |     for: 5m
22 |     keep_firing_for: 5m
23 |     labels:
24 |       severity: critical
25 |   - alert: ClickHouseZookeeperSessions
26 |     annotations:
27 |       description: |
28 |         ClickHouse has more than one connection to a Zookeeper on {{ $labels.instance }} which can lead to bugs due to stale reads in Zookeepers consistency model.
29 |       summary: ClickHouse has too many Zookeeper sessions.
30 |     expr: ClickHouseMetrics_ZooKeeperSession > 1
31 |     for: 5m
32 |     keep_firing_for: 5m
33 |     labels:
34 |       severity: critical
35 |   - alert: ClickHouseReplicasInReadOnly
36 |     annotations:
37 |       description: |
38 |         ClickHouse has replicas in a read only state on {{ $labels.instance }} after losing connection to Zookeeper or at startup.
39 |       summary: ClickHouse has too many replicas in read only state.
40 |     expr: ClickHouseMetrics_ReadonlyReplica > 0
41 |     for: 5m
42 |     keep_firing_for: 5m
43 |     labels:
44 |       severity: critical
45 | 


--------------------------------------------------------------------------------
/assets/clickhouse/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/cloudflare/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: cloudflare-alerts
 3 |   rules:
 4 |   - alert: CloudflareHighThreatCount
 5 |     annotations:
 6 |       description: The number of detected threats targeting the zone {{$labels.zone}}
 7 |         is {{ printf "%.0f" $value }} which is greater than the threshold of 3.
 8 |       summary: There are detected threats targeting the zone.
 9 |     expr: |
10 |       sum without (instance) (increase(cloudflare_zone_threats_total[5m])) > 3
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 |   - alert: CloudflareHighRequestRate
15 |     annotations:
16 |       description: The rate of requests to {{$labels.zone}} is {{ printf "%.0f" $value
17 |         }}% of the prior 50 minute baseline which is above the threshold of 150%.
18 |       summary: A high spike in requests is occurring which may indicate an attack
19 |         or unexpected load.
20 |     expr: |
21 |       sum without (instance) (100 * (rate(cloudflare_zone_requests_total[10m]) / clamp_min(rate(cloudflare_zone_requests_total[50m] offset 10m), 1))) > 150
22 |     for: 5m
23 |     labels:
24 |       severity: warning
25 |   - alert: CloudflareHighHTTPErrorCodes
26 |     annotations:
27 |       description: The number of {{$labels.status}} HTTP status codes occurring in
28 |         the zone {{$labels.zone}} is {{ printf "%.0f" $value }} which is greater than
29 |         the threshold of 100.
30 |       summary: A high number of 4xx or 5xx HTTP status codes are occurring.
31 |     expr: |
32 |       sum without (instance) (increase(cloudflare_zone_requests_status{status=~"4.*|5.*"}[5m])) > 100
33 |     for: 5m
34 |     labels:
35 |       severity: warning
36 |   - alert: CloudflareUnhealthyPools
37 |     annotations:
38 |       description: The pool {{$labels.pool_name}} in zone {{$labels.zone}} is currently
39 |         down and unhealthy.
40 |       summary: There are unhealthy pools.
41 |     expr: |
42 |       sum without (instance, load_balancer_name) (cloudflare_zone_pool_health_status) == 0
43 |     for: 5m
44 |     labels:
45 |       severity: critical
46 |   - alert: CloudflareMetricsDown
47 |     annotations:
48 |       description: Grafana is no longer receiving metrics for the Cloudflare integration
49 |         from instance {{$labels.instance}}.
50 |       summary: Cloudflare metrics are down.
51 |     expr: |
52 |       up{job="integrations/cloudflare"} == 0
53 |     for: 5m
54 |     labels:
55 |       severity: critical
56 | 


--------------------------------------------------------------------------------
/assets/cloudflare/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/confluent-kafka/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/confluent-kafka/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/consul/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: consul
 3 |   rules:
 4 |   - alert: ConsulUp
 5 |     annotations:
 6 |       description: Consul '{{ $labels.job }}' is not up.
 7 |       summary: Consul is not up.
 8 |     expr: |
 9 |       consul_up != 1
10 |     for: 1m
11 |     labels:
12 |       severity: critical
13 |   - alert: ConsulMaster
14 |     annotations:
15 |       description: Consul '{{ $labels.job }}' has no master.
16 |       summary: Consul has no master.
17 |     expr: |
18 |       consul_raft_leader != 1
19 |     for: 1m
20 |     labels:
21 |       severity: critical
22 |   - alert: ConsulPeers
23 |     annotations:
24 |       description: Consul '{{ $labels.job }}' does not have 3 peers.
25 |       summary: Consul does not have peers.
26 |     expr: |
27 |       consul_raft_peers != 3
28 |     for: 10m
29 |     labels:
30 |       severity: critical
31 | 


--------------------------------------------------------------------------------
/assets/consul/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/coredns/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/couchbase/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: couchbase
 3 |   rules:
 4 |   - alert: CouchbaseHighCPUUsage
 5 |     annotations:
 6 |       description: '{{ printf "%.0f" $value }} percent CPU usage on node {{$labels.instance}}
 7 |         and on cluster {{$labels.couchbase_cluster}}, which is above the threshold
 8 |         of 85.'
 9 |       summary: The node CPU usage has exceeded the critical threshold.
10 |     expr: |
11 |       (sys_cpu_utilization_rate) > 85
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: CouchbaseHighMemoryUsage
16 |     annotations:
17 |       description: '{{ printf "%.0f" $value }} percent memory usage on node {{$labels.instance}}
18 |         and on cluster {{$labels.couchbase_cluster}}, which is above the threshold
19 |         of 85.'
20 |       summary: There is a limited amount of memory available for a node.
21 |     expr: |
22 |       100 * (sys_mem_actual_used / clamp_min(sys_mem_actual_used + sys_mem_actual_free, 1)) > 85
23 |     for: 5m
24 |     labels:
25 |       severity: critical
26 |   - alert: CouchbaseMemoryEvictionRate
27 |     annotations:
28 |       description: '{{ printf "%.0f" $value }} evictions in bucket {{$labels.bucket}},
29 |         on node {{$labels.instance}}, and on cluster {{$labels.couchbase_cluster}},
30 |         which is above the threshold of 10.'
31 |       summary: There is a spike in evictions in a bucket, which indicates high memory
32 |         pressure.
33 |     expr: |
34 |       (kv_ep_num_value_ejects) > 10
35 |     for: 5m
36 |     labels:
37 |       severity: warning
38 |   - alert: CouchbaseInvalidRequestVolume
39 |     annotations:
40 |       description: '{{ printf "%.0f" $value }} invalid requests to {{$labels.couchbase_cluster}},
41 |         which is above the threshold of 1000.'
42 |       summary: There is a high volume of incoming invalid requests, which may indicate
43 |         a DOS or injection attack.
44 |     expr: |
45 |       sum without(instance, job) (rate(n1ql_invalid_requests[2m])) > 1000
46 |     for: 2m
47 |     labels:
48 |       severity: warning
49 | 


--------------------------------------------------------------------------------
/assets/couchbase/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/discourse/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: DiscourseAlerts
 3 |   rules:
 4 |   - alert: DiscourseRequestsHigh5xxErrors
 5 |     annotations:
 6 |       description: '{{ printf "%.2f" $value }}% of all requests are resulting in 500
 7 |         status codes, which is above the threshold 10%, indicating a potentially larger
 8 |         issue for {{$labels.instance}}'
 9 |       summary: More than 10% of all requests result in a 5XX.
10 |     expr: |
11 |       100 * rate(discourse_http_requests{status="500"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 10
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: DiscourseRequestsHigh4xxErrors
16 |     annotations:
17 |       description: '{{ printf "%.2f" $value }}% of all requests are resulting in 400
18 |         status code, which is above the threshold 30%, indicating a potentially larger
19 |         issue for {{$labels.instance}}'
20 |       summary: More than 30% of all requests result in a 4XX.
21 |     expr: |
22 |       100 * rate(discourse_http_requests{status=~"^4.*"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 30
23 |     for: 5m
24 |     labels:
25 |       severity: warning
26 | 


--------------------------------------------------------------------------------
/assets/discourse/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/django/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/docker/alerts.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/docker/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/elasticsearch/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/elasticsearch/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/envoy/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/envoy/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/etcd/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/f5-bigip/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: bigip-alerts
 3 |   rules:
 4 |   - alert: BigIPLowNodeAvailabilityStatus
 5 |     annotations:
 6 |       description: '{{ printf "%.0f" $value }} percent of available nodes, which is
 7 |         below the threshold of 95.'
 8 |       summary: Detecting a significant number of unavailable nodes which can causes
 9 |         potential downtime or degraded performance.
10 |     expr: |
11 |       100 * (sum(bigip_node_status_availability_state) / clamp_min(count(bigip_node_status_availability_state), 1)) < 95
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: BigIPServerSideConnectionLimit
16 |     annotations:
17 |       description: '{{ printf "%.0f" $value }} percent of the max number of connections
18 |         in use on node {{$labels.node}}, which is above the threshold of 80 percent.'
19 |       summary: Approaching the connection limit may lead to rejecting new connections,
20 |         impacting availability.
21 |     expr: |
22 |       max without(instance, job) (100 * bigip_node_serverside_cur_conns / clamp_min(bigip_node_serverside_max_conns, 1)) > 80
23 |     for: 5m
24 |     labels:
25 |       severity: warning
26 |   - alert: BigIPHighRequestRate
27 |     annotations:
28 |       description: '{{ printf "%.0f" $value }} percent increase in requests on pool
29 |         {{$labels.pool}}, which is above the threshold of 150.'
30 |       summary: An unexpected spike in requests might indicate an issue like a DDoS
31 |         attack or unexpected high load.
32 |     expr: |
33 |       max without(instance, job) (100 * rate(bigip_pool_tot_requests[10m]) / clamp_min(rate(bigip_pool_tot_requests[50m] offset 10m), 1)) > 150
34 |     for: 10m
35 |     labels:
36 |       severity: warning
37 |   - alert: BigIPHighConnectionQueueDepth
38 |     annotations:
39 |       description: '{{ printf "%.0f" $value }} percent increase in connection queue
40 |         depth on node {{$labels.pool}}, which is above the threshold of 75.'
41 |       summary: A sudden spike or sustained high queue depth may indicate a bottleneck
42 |         in handling incoming connections.
43 |     expr: |
44 |       max without(instance, job) (100 * rate(bigip_pool_connq_depth[5m])) / clamp_min(rate(bigip_pool_connq_depth[50m] offset 10m), 1) > 75
45 |     for: 5m
46 |     labels:
47 |       severity: warning
48 | 


--------------------------------------------------------------------------------
/assets/f5-bigip/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/gitea/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/gitea/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/gitlab/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: GitLabAlerts
 3 |   rules:
 4 |   - alert: GitLabHighJobRegistrationFailures
 5 |     annotations:
 6 |       description: '{{ printf "%.2f" $value }}% of job registrations have failed on
 7 |         {{$labels.instance}}, which is above threshold of 10%.'
 8 |       summary: Large percentage of failed attempts to register a job.
 9 |     expr: "100 * rate(job_register_attempts_failed_total{}[5m]) / rate(job_register_attempts_total{}[5m])
10 |       \n> 10\n"
11 |     for: 5m
12 |     labels:
13 |       severity: warning
14 |   - alert: GitLabHighRunnerAuthFailure
15 |     annotations:
16 |       description: '{{ printf "%.2f" $value }}% of GitLab runner authentication attempts
17 |         are failing on {{$labels.instance}}, which is above the threshold of 10%.'
18 |       summary: Large percentage of runner authentication failures.
19 |     expr: "100 * sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m]))
20 |       \ / \n(sum by (instance) (rate(gitlab_ci_runner_authentication_success_total{}[5m]))
21 |       \ + sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m])))\n>
22 |       10\n"
23 |     for: 5m
24 |     labels:
25 |       severity: warning
26 |   - alert: GitLabHigh5xxResponses
27 |     annotations:
28 |       description: '{{ printf "%.2f" $value }}% of all requests returned 5XX HTTP
29 |         responses, which is above the threshold 10%, indicating a system issue on
30 |         {{$labels.instance}}.'
31 |       summary: Large rate of HTTP 5XX errors.
32 |     expr: "100 * sum by (instance) (rate(http_requests_total{status=~\"^5.*\"}[5m]))
33 |       / sum by (instance) (rate(http_requests_total{}[5m])) \n> 10\n"
34 |     for: 5m
35 |     labels:
36 |       severity: critical
37 |   - alert: GitLabHigh4xxResponses
38 |     annotations:
39 |       description: '{{ printf "%.2f" $value }}% of all requests returned 4XX HTTP
40 |         responses, which is above the threshold 10%, indicating many failed requests
41 |         on {{$labels.instance}}.'
42 |       summary: Large rate of HTTP 4XX errors.
43 |     expr: |
44 |       100 * sum by (instance) (rate(http_requests_total{status=~"^4.*"}[5m])) / sum by (instance) (rate(http_requests_total{}[5m]))
45 |       > 10
46 |     for: 5m
47 |     labels:
48 |       severity: warning
49 | 


--------------------------------------------------------------------------------
/assets/gitlab/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/gluster/rules.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: gluster-volume.rules
 3 |   rules:
 4 |   - expr: |
 5 |       sum(max(gluster_subvol_capacity_used_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume)
 6 |     record: gluster:volume_capacity_used_bytes_total:sum
 7 |   - expr: |
 8 |       sum(max(gluster_subvol_capacity_total_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume)
 9 |     record: gluster:volume_capacity_total_bytes:sum
10 | 


--------------------------------------------------------------------------------
/assets/go-runtime/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/go-runtime/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/grafana/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: GrafanaAlerts
 3 |   rules:
 4 |   - alert: GrafanaRequestsFailing
 5 |     annotations:
 6 |       message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is
 7 |         experiencing {{ $value | humanize }}% errors'
 8 |     expr: |
 9 |       100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."})
10 |       /
11 |       sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
12 |       > 50
13 |     for: 5m
14 |     labels:
15 |       severity: warning
16 | 


--------------------------------------------------------------------------------
/assets/grafana/rules.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: grafana_rules
3 |   rules:
4 |   - expr: |
5 |       sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
6 |     record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
7 | 


--------------------------------------------------------------------------------
/assets/haproxy/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: HAProxyAlerts
 3 |   rules:
 4 |   - alert: HAProxyDroppingLogs
 5 |     annotations:
 6 |       description: HAProxy {{$labels.job}} on {{$labels.instance}} is dropping logs.
 7 |       summary: HAProxy is dropping logs.
 8 |     expr: rate(haproxy_process_dropped_logs_total[5m]) != 0
 9 |     for: 10m
10 |     labels:
11 |       severity: critical
12 |   - alert: HAProxyBackendCheckFlapping
13 |     annotations:
14 |       description: HAProxy {{$labels.job}} backend {{$labels.proxy}} on {{$labels.instance}}
15 |         has flapping checks.
16 |       summary: HAProxy backend checks are flapping.
17 |     expr: rate(haproxy_backend_check_up_down_total[5m]) != 0
18 |     for: 10m
19 |     labels:
20 |       severity: critical
21 |   - alert: HAProxyServerCheckFlapping
22 |     annotations:
23 |       description: HAProxy {{$labels.job}} server {{$labels.server}} on {{$labels.instance}}
24 |         has flapping checks.
25 |       summary: HAProxy server checks are flapping.
26 |     expr: rate(haproxy_server_check_up_down_total[5m]) != 0
27 |     for: 10m
28 |     labels:
29 |       severity: critical
30 | 


--------------------------------------------------------------------------------
/assets/haproxy/rules.yaml:
--------------------------------------------------------------------------------
1 | groups: []
2 | 


--------------------------------------------------------------------------------
/assets/harbor/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: Harbor
 3 |   rules:
 4 |   - alert: HarborComponentStatus
 5 |     annotations:
 6 |       description: Harbor {{ $labels.component }} has been down for more than 5 minutes
 7 |       summary: Harbor Component is Down.
 8 |     expr: |
 9 |       harbor_up == 0
10 |     for: 5m
11 |     labels:
12 |       severity: critical
13 |   - alert: HarborProjectQuataExceeded
14 |     annotations:
15 |       description: Harbor project {{ $labels.project_name }} has exceeded the configured
16 |         disk usage quota for the past 15 minutes
17 |       summary: Harbor project exceeds disk usage quota.
18 |     expr: |
19 |       harbor_project_quota_usage_byte > harbor_project_quota_byte and on(harbor_project_quota_usage_byte) harbor_project_quota_byte != -1
20 |     for: 15m
21 |     labels:
22 |       severity: warning
23 |   - alert: HarborHighErrorRate
24 |     annotations:
25 |       description: HTTP Requests of {{ $labels.instance }} are having a high Error
26 |         rate
27 |       summary: Harbor high error rate.
28 |     expr: sum(rate(harbor_core_http_request_total{code=~"4..|5.."}[5m]))/sum(rate(harbor_core_http_request_total[5m]))
29 |       > 0.15
30 |     for: 5m
31 |     labels:
32 |       severity: warning
33 | 


--------------------------------------------------------------------------------
/assets/harbor/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/hass/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/hass/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ibm-mq/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: ibm-mq-alerts
 3 |   rules:
 4 |   - alert: IBMMQExpiredMessages
 5 |     annotations:
 6 |       description: The number of expired messages in the {{$labels.qmgr}} is {{$labels.value}}
 7 |         which is above the threshold of 2.
 8 |       summary: There are expired messages, which imply that application resilience
 9 |         is failing.
10 |     expr: |
11 |       sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_expired_message_count) > 2
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: IBMMQStaleMessages
16 |     annotations:
17 |       description: A stale message with an age of {{$labels.value}} has been sitting
18 |         in the {{$labels.queue}} which is above the threshold of 300s.
19 |       summary: Stale messages have been detected.
20 |     expr: |
21 |       sum without (description,instance,job,platform) (ibmmq_queue_oldest_message_age) >= 300
22 |     for: 5m
23 |     labels:
24 |       severity: warning
25 |   - alert: IBMMQLowDiskSpace
26 |     annotations:
27 |       description: The amount of disk space available for {{$labels.qmgr}} is at {{$labels.value}}%
28 |         which is below the threshold of 5%.
29 |       summary: There is limited disk available for a queue manager.
30 |     expr: |
31 |       sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_queue_manager_file_system_free_space_percentage) <= 5
32 |     for: 5m
33 |     labels:
34 |       severity: critical
35 |   - alert: IBMMQHighQueueManagerCpuUsage
36 |     annotations:
37 |       description: The amount of CPU usage for the queue manager {{$labels.qmgr}}
38 |         is at {{$labels.value}}% which is above the threshold of 85%.
39 |       summary: There is a high CPU usage estimate for a queue manager.
40 |     expr: |
41 |       sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_user_cpu_time_estimate_for_queue_manager_percentage) >= 85
42 |     for: 5m
43 |     labels:
44 |       severity: critical
45 | 


--------------------------------------------------------------------------------
/assets/ibm-mq/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/influxdb/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ingress-nginx-mixin/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: nginx.rules
 3 |   rules:
 4 |   - alert: NginxConfigReloadFailed
 5 |     annotations:
 6 |       dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{
 7 |         $labels.job }}&var-controller_class={{ $labels.controller_class }}
 8 |       description: Nginx config reload failed for the controller with the class {{
 9 |         $labels.controller_class }}.
10 |       summary: Nginx config reload failed.
11 |     expr: |
12 |       sum(
13 |         nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"}
14 |       ) by (cluster, job, controller_class)
15 |       == 0
16 |     for: 5m
17 |     labels:
18 |       severity: warning
19 |   - alert: NginxHighHttp4xxErrorRate
20 |     annotations:
21 |       dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
22 |         $labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
23 |       description: More than 5% HTTP requests with status 4xx for {{ $labels.exported_namespace
24 |         }}/{{ $labels.ingress }} the past 5m.
25 |       summary: Nginx high HTTP 4xx error rate.
26 |     expr: |
27 |       (
28 |         sum(
29 |           rate(
30 |             nginx_ingress_controller_requests{
31 |               job=~"ingress-nginx-controller-metrics",
32 |               status=~"^4.*",
33 |               ingress!~""
34 |             }[5m]
35 |           )
36 |         ) by (cluster, exported_namespace, ingress)
37 |         /
38 |         sum(
39 |           rate(
40 |             nginx_ingress_controller_requests{
41 |               job=~"ingress-nginx-controller-metrics",
42 |               ingress!~""
43 |             }[5m]
44 |           )
45 |         ) by (cluster, exported_namespace, ingress)
46 |         * 100
47 |       ) > 5
48 |     for: 1m
49 |     labels:
50 |       severity: info
51 |   - alert: NginxHighHttp5xxErrorRate
52 |     annotations:
53 |       dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
54 |         $labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
55 |       description: More than 5% HTTP requests with status 5xx for {{ $labels.exported_namespace
56 |         }}/{{ $labels.ingress }} the past 5m.
57 |       summary: Nginx high HTTP 5xx error rate.
58 |     expr: |
59 |       (
60 |         sum(
61 |           rate(
62 |             nginx_ingress_controller_requests{
63 |               job=~"ingress-nginx-controller-metrics",
64 |               status=~"^5.*",
65 |               ingress!~""
66 |             }[5m]
67 |           )
68 |         ) by (cluster, exported_namespace, ingress)
69 |         /
70 |         sum(
71 |           rate(
72 |             nginx_ingress_controller_requests{
73 |               job=~"ingress-nginx-controller-metrics",
74 |               ingress!~""
75 |             }[5m]
76 |           )
77 |         ) by (cluster, exported_namespace, ingress)
78 |         * 100
79 |       ) > 5
80 |     for: 1m
81 |     labels:
82 |       severity: warning
83 | 


--------------------------------------------------------------------------------
/assets/ingress-nginx-mixin/rules.yaml:
--------------------------------------------------------------------------------
1 | groups: []
2 | 


--------------------------------------------------------------------------------
/assets/istio/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/jaeger/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/jenkins/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/jenkins/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/jira/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: alert.rules
 3 |   rules:
 4 |   - alert: LicenseExpired
 5 |     annotations:
 6 |       description: The JIRA license has expired.
 7 |       summary: JIRA license expired.
 8 |     expr: jira_license_expiry_days_gauge <= 0
 9 |     for: 1m
10 |     labels:
11 |       severity: critical
12 |   - alert: LicenseWarning
13 |     annotations:
14 |       description: The JIRA license will expire in less than one week.
15 |       summary: License expiring soon.
16 |     expr: jira_license_expiry_days_gauge <= 7 and jira_license_expiry_days_gauge >
17 |       0
18 |     for: 1m
19 |     labels:
20 |       severity: warning
21 |   - alert: NoUserCapacity
22 |     annotations:
23 |       description: There is no more capacity for additional users to be added to the
24 |         system.
25 |       summary: All available accounts are taken.
26 |     expr: jira_all_users_gauge/jira_allowed_users_gauge == 1
27 |     for: 1m
28 |     labels:
29 |       severity: critical
30 |   - alert: EmailErrorsHigh
31 |     annotations:
32 |       description: More than 1% of emails have resulted in an error in the past minute.
33 |       summary: Email errors are high.
34 |     expr: jira_mail_queue_error_gauge /jira_mail_queue_gauge > 0.01
35 |     for: 1m
36 |     labels:
37 |       severity: critical
38 | 


--------------------------------------------------------------------------------
/assets/jira/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/jvm/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: jvm-jvm-alerts
 3 |   rules:
 4 |   - alert: JvmMemoryFillingUp
 5 |     annotations:
 6 |       description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
 7 |         last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
 8 |       summary: JVM heap memory filling up.
 9 |     expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without
10 |       (id) (jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80
11 |     for: 5m
12 |     keep_firing_for: 5m
13 |     labels:
14 |       severity: warning
15 |   - alert: JvmThreadsDeadlocked
16 |     annotations:
17 |       description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
18 |         are in a cyclic dependency with each other. The restart is required to resolve
19 |         the deadlock.'
20 |       summary: JVM deadlock detected.
21 |     expr: (jvm_threads_deadlocked{}) > 0
22 |     for: 2m
23 |     keep_firing_for: 5m
24 |     labels:
25 |       severity: critical
26 | 


--------------------------------------------------------------------------------
/assets/jvm/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/kafka/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/kube-cockroachdb/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: cockroachdb
 3 |   rules:
 4 |   - alert: CockroachInstanceFlapping
 5 |     annotations:
 6 |       description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
 7 |         {{ $value }} time(s) in 10m.'
 8 |       summary: CockroachDB instances have restarted in the last 10 minutes.
 9 |     expr: |
10 |       resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
11 |     for: 1m
12 |     labels:
13 |       severity: warning
14 |   - alert: CockroachLivenessMismatch
15 |     annotations:
16 |       description: Liveness mismatch for {{ $labels.instance }}
17 |       summary: CockroachDB has liveness mismatches.
18 |     expr: |
19 |       (cockroachdb_liveness_livenodes{job="cockroachdb-public"})
20 |         !=
21 |       ignoring(instance) group_left() (count by(cluster, job) (up{job="cockroachdb-public"} == 1))
22 |     for: 5m
23 |     labels:
24 |       severity: warning
25 |   - alert: CockroachVersionMismatch
26 |     annotations:
27 |       description: Cluster {{ $labels.cluster }} running {{ $value }} different versions
28 |       summary: CockroachDB cluster is running different versions.
29 |     expr: |
30 |       count by(cluster) (count_values by(tag, cluster) ("version", cockroachdb_build_timestamp{job="cockroachdb-public"})) > 1
31 |     for: 1h
32 |     labels:
33 |       severity: warning
34 |   - alert: CockroachStoreDiskLow
35 |     annotations:
36 |       description: Store {{ $labels.store }} on node {{ $labels.instance }} at {{
37 |         $value }} available disk fraction
38 |       summary: CockroachDB is at low disk capacity.
39 |     expr: |
40 |       :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
41 |     for: 30m
42 |     labels:
43 |       severity: critical
44 |   - alert: CockroachClusterDiskLow
45 |     annotations:
46 |       description: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
47 |       summary: CockroachDB cluster is at critically low disk capacity.
48 |     expr: |
49 |       cluster:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.2
50 |     for: 30m
51 |     labels:
52 |       severity: critical
53 |   - alert: CockroachUnavailableRanges
54 |     annotations:
55 |       description: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
56 |       summary: CockroachDB has unavailable ranges.
57 |     expr: |
58 |       (sum by(instance, cluster) (cockroachdb_ranges_unavailable{job="cockroachdb-public"})) > 0
59 |     for: 10m
60 |     labels:
61 |       severity: critical
62 |   - alert: CockroachNoLeaseRanges
63 |     annotations:
64 |       description: Instance {{ $labels.instance }} has {{ $value }} ranges without
65 |         leases
66 |       summary: CockroachDB has ranges without leases.
67 |     expr: |
68 |       (sum by(instance, cluster) (cockroachdb_replicas_leaders_not_leaseholders{job="cockroachdb-public"})) > 0
69 |     for: 10m
70 |     labels:
71 |       severity: warning
72 |   - alert: CockroachHighOpenFDCount
73 |     annotations:
74 |       description: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
75 |         }} fraction used'
76 |       summary: CockroachDB has too many open file descriptors.
77 |     expr: |
78 |       cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
79 |     for: 10m
80 |     labels:
81 |       severity: warning
82 | 


--------------------------------------------------------------------------------
/assets/kube-cockroachdb/rules.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: cockroachdb.rules
 3 |   rules:
 4 |   - expr: |
 5 |       sum without(store) (cockroachdb_capacity{job="cockroachdb-public"})
 6 |     record: node:cockroachdb_capacity:sum
 7 |   - expr: |
 8 |       sum without(instance) (node:cockroachdb_capacity:sum{job="cockroachdb-public"})
 9 |     record: cluster:cockroachdb_capacity:sum
10 |   - expr: |
11 |       sum without(store) (cockroachdb_capacity_available{job="cockroachdb-public"})
12 |     record: node:cockroachdb_capacity_available:sum
13 |   - expr: |
14 |       sum without(instance) (node:cockroachdb_capacity_available:sum{job="cockroachdb-public"})
15 |     record: cluster:cockroachdb_capacity_available:sum
16 |   - expr: |
17 |       cockroachdb_capacity_available{job="cockroachdb-public"} / cockroachdb_capacity{job="cockroachdb-public"}
18 |     record: :cockroachdb_capacity_available:ratio
19 |   - expr: |
20 |       node:cockroachdb_capacity_available:sum{job="cockroachdb-public"} / node:cockroachdb_capacity:sum{job="cockroachdb-public"}
21 |     record: node:cockroachdb_capacity_available:ratio
22 |   - expr: |
23 |       cluster:cockroachdb_capacity_available:sum{job="cockroachdb-public"} / cluster:cockroachdb_capacity:sum{job="cockroachdb-public"}
24 |     record: cluster:cockroachdb_capacity_available:ratio
25 | 


--------------------------------------------------------------------------------
/assets/kube-state-metrics/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: kube-state-metrics
 3 |   rules:
 4 |   - alert: KubeStateMetricsListErrors
 5 |     annotations:
 6 |       description: kube-state-metrics is experiencing errors at an elevated rate in
 7 |         list operations. This is likely causing it to not be able to expose metrics
 8 |         about Kubernetes objects correctly or at all.
 9 |       summary: kube-state-metrics is experiencing errors in list operations.
10 |     expr: |
11 |       (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
12 |         /
13 |       sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
14 |       > 0.01
15 |     for: 15m
16 |     labels:
17 |       severity: critical
18 |   - alert: KubeStateMetricsWatchErrors
19 |     annotations:
20 |       description: kube-state-metrics is experiencing errors at an elevated rate in
21 |         watch operations. This is likely causing it to not be able to expose metrics
22 |         about Kubernetes objects correctly or at all.
23 |       summary: kube-state-metrics is experiencing errors in watch operations.
24 |     expr: |
25 |       (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
26 |         /
27 |       sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
28 |       > 0.01
29 |     for: 15m
30 |     labels:
31 |       severity: critical
32 |   - alert: KubeStateMetricsShardingMismatch
33 |     annotations:
34 |       description: kube-state-metrics pods are running with different --total-shards
35 |         configuration, some Kubernetes objects may be exposed multiple times or not
36 |         exposed at all.
37 |       summary: kube-state-metrics sharding is misconfigured.
38 |     expr: |
39 |       stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
40 |     for: 15m
41 |     labels:
42 |       severity: critical
43 |   - alert: KubeStateMetricsShardsMissing
44 |     annotations:
45 |       description: kube-state-metrics shards are missing, some Kubernetes objects
46 |         are not being exposed.
47 |       summary: kube-state-metrics shards are missing.
48 |     expr: |
49 |       2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
50 |         -
51 |       sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
52 |       != 0
53 |     for: 15m
54 |     labels:
55 |       severity: critical
56 | 


--------------------------------------------------------------------------------
/assets/kube-state-metrics/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/kubernetes-autoscaling/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/memcached/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: memcached
 3 |   rules:
 4 |   - alert: MemcachedDown
 5 |     annotations:
 6 |       description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} is
 7 |         down for more than 15 minutes.
 8 |       summary: Memcached instance is down.
 9 |     expr: |
10 |       memcached_up == 0
11 |     for: 15m
12 |     labels:
13 |       severity: critical
14 |   - alert: MemcachedConnectionLimitApproaching
15 |     annotations:
16 |       description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection
17 |         usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes.
18 |       summary: Memcached max connection limit is approaching.
19 |     expr: |
20 |       (memcached_current_connections / memcached_max_connections * 100) > 80
21 |     for: 15m
22 |     labels:
23 |       severity: warning
24 |   - alert: MemcachedConnectionLimitApproaching
25 |     annotations:
26 |       description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection
27 |         usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes.
28 |       summary: Memcached connections at critical level.
29 |     expr: |
30 |       (memcached_current_connections / memcached_max_connections * 100) > 95
31 |     for: 15m
32 |     labels:
33 |       severity: critical
34 |   - alert: MemcachedOutOfMemoryErrors
35 |     annotations:
36 |       description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} has
37 |         OutOfMemory errors for at least 15 minutes, current rate is {{ printf "%0.0f"
38 |         $value }}
39 |       summary: Memcached has OutOfMemory errors.
40 |     expr: |
41 |       sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0
42 |     for: 15m
43 |     labels:
44 |       severity: warning
45 | 


--------------------------------------------------------------------------------
/assets/memcached/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/microsoft-iis/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: microsoft-iis
 3 |   rules:
 4 |   - alert: MicrosoftIISHighNumberOfRejectedAsyncIORequests
 5 |     annotations:
 6 |       description: |
 7 |         The number of rejected async IO requests is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.site }} which is above the threshold of 20.
 8 |       summary: There are a high number of rejected async I/O requests for a site.
 9 |     expr: |
10 |       increase(windows_iis_rejected_async_io_requests_total[5m]) > 20
11 |     for: 5m
12 |     labels:
13 |       severity: warning
14 |   - alert: MicrosoftIISHighNumberOf5xxRequestErrors
15 |     annotations:
16 |       description: |
17 |         The number of 5xx request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 5.
18 |       summary: There are a high number of 5xx request errors for an application.
19 |     expr: |
20 |       sum without (pid, status_code)(increase(windows_iis_worker_request_errors_total{status_code=~"5.*"}[5m])) > 5
21 |     for: 5m
22 |     labels:
23 |       severity: critical
24 |   - alert: MicrosoftIISLowSuccessRateForWebsocketConnections
25 |     annotations:
26 |       description: |
27 |         The success rate for websocket connections is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 80.
28 |       summary: There is a low success rate for websocket connections for an application.
29 |     expr: |
30 |       sum without (pid)  (increase(windows_iis_worker_websocket_connection_accepted_total[5m]) / clamp_min(increase(windows_iis_worker_websocket_connection_attempts_total[5m]),1)) * 100 > 80
31 |     for: 5m
32 |     labels:
33 |       severity: critical
34 |   - alert: MicrosoftIISThreadpoolUtilizationNearingMax
35 |     annotations:
36 |       description: |
37 |         The threadpool utilization is at {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 90.
38 |       summary: The thread pool utilization is nearing max capacity.
39 |     expr: |
40 |       sum without (pid, state)(windows_iis_worker_threads / windows_iis_worker_max_threads) * 100 > 90
41 |     for: 5m
42 |     labels:
43 |       severity: critical
44 |   - alert: MicrosoftIISHighNumberOfWorkerProcessFailures
45 |     annotations:
46 |       description: |
47 |         The number of worker process failures is at {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of 10.
48 |       summary: There are a high number of worker process failures for an application.
49 |     expr: |
50 |       increase(windows_iis_total_worker_process_failures[5m]) > 10
51 |     for: 5m
52 |     labels:
53 |       severity: warning
54 | 


--------------------------------------------------------------------------------
/assets/microsoft-iis/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/mongodb-atlas/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/mongodb/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/mysql/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/mysql/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/nginx/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/nginx/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/node-exporter/rules.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: node-exporter.rules
 3 |   rules:
 4 |   - expr: |
 5 |       count without (cpu, mode) (
 6 |         node_cpu_seconds_total{job="node",mode="idle"}
 7 |       )
 8 |     record: instance:node_num_cpu:sum
 9 |   - expr: |
10 |       1 - avg without (cpu) (
11 |         sum without (mode) (rate(node_cpu_seconds_total{job="node", mode=~"idle|iowait|steal"}[5m]))
12 |       )
13 |     record: instance:node_cpu_utilisation:rate5m
14 |   - expr: |
15 |       (
16 |         node_load1{job="node"}
17 |       /
18 |         instance:node_num_cpu:sum{job="node"}
19 |       )
20 |     record: instance:node_load1_per_cpu:ratio
21 |   - expr: |
22 |       1 - (
23 |         (
24 |           node_memory_MemAvailable_bytes{job="node"}
25 |           or
26 |           (
27 |             node_memory_Buffers_bytes{job="node"}
28 |             +
29 |             node_memory_Cached_bytes{job="node"}
30 |             +
31 |             node_memory_MemFree_bytes{job="node"}
32 |             +
33 |             node_memory_Slab_bytes{job="node"}
34 |           )
35 |         )
36 |       /
37 |         node_memory_MemTotal_bytes{job="node"}
38 |       )
39 |     record: instance:node_memory_utilisation:ratio
40 |   - expr: |
41 |       rate(node_vmstat_pgmajfault{job="node"}[5m])
42 |     record: instance:node_vmstat_pgmajfault:rate5m
43 |   - expr: |
44 |       rate(node_disk_io_time_seconds_total{job="node", device!=""}[5m])
45 |     record: instance_device:node_disk_io_time_seconds:rate5m
46 |   - expr: |
47 |       rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m])
48 |     record: instance_device:node_disk_io_time_weighted_seconds:rate5m
49 |   - expr: |
50 |       sum without (device) (
51 |         rate(node_network_receive_bytes_total{job="node", device!="lo"}[5m])
52 |       )
53 |     record: instance:node_network_receive_bytes_excluding_lo:rate5m
54 |   - expr: |
55 |       sum without (device) (
56 |         rate(node_network_transmit_bytes_total{job="node", device!="lo"}[5m])
57 |       )
58 |     record: instance:node_network_transmit_bytes_excluding_lo:rate5m
59 |   - expr: |
60 |       sum without (device) (
61 |         rate(node_network_receive_drop_total{job="node", device!="lo"}[5m])
62 |       )
63 |     record: instance:node_network_receive_drop_excluding_lo:rate5m
64 |   - expr: |
65 |       sum without (device) (
66 |         rate(node_network_transmit_drop_total{job="node", device!="lo"}[5m])
67 |       )
68 |     record: instance:node_network_transmit_drop_excluding_lo:rate5m
69 | 


--------------------------------------------------------------------------------
/assets/nodejs/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: NodejsAlerts
 3 |   rules:
 4 |   - alert: NodejsDown
 5 |     annotations:
 6 |       description: Node.js {{$labels.job}} on {{$labels.instance}} is not up.
 7 |       summary: Node.js not up.
 8 |     expr: absent(nodejs_version_info) or (sum by (version) (nodejs_version_info) <
 9 |       1)
10 |     for: 0m
11 |     labels:
12 |       severity: critical
13 | 


--------------------------------------------------------------------------------
/assets/nodejs/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/nomad/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/nomad/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/nsq/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: nsq
 3 |   rules:
 4 |   - alert: NsqTopicDepthIncreasing
 5 |     annotations:
 6 |       description: |
 7 |         Topic {{ $labels.topic }} depth is higher than 100. The current queue is {{ $value }}.
 8 |       summary: Topic depth is increasing.
 9 |     expr: |
10 |       sum by (topic) (nsq_topic_depth) > 100
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 |   - alert: NsqChannelDepthIncreasing
15 |     annotations:
16 |       description: |
17 |         Channel {{ $labels.channel }} depth in topic {{ $labels.topic }} is higher than 100. The current queue is {{ $value }}.
18 |       summary: Topic channel depth is increasing.
19 |     expr: |
20 |       sum by (topic) (nsq_topic_channel_backend_depth) > 100
21 |     for: 5m
22 |     labels:
23 |       severity: critical
24 | 


--------------------------------------------------------------------------------
/assets/nsq/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/openldap/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: openldap-alerts
 3 |   rules:
 4 |   - alert: OpenLDAPConnectionSpike
 5 |     annotations:
 6 |       description: There are {{ printf "%.0f" $value }} OpenLDAP connections on instance
 7 |         {{$labels.instance}}, which is above the threshold of 100.
 8 |       summary: A sudden spike in OpenLDAP connections indicates potential high usage
 9 |         or security issues.
10 |     expr: |
11 |       increase(openldap_monitor_counter_object{dn="cn=Current,cn=Connections,cn=Monitor"}[5m]) > 100
12 |     for: 5m
13 |     labels:
14 |       severity: warning
15 |   - alert: OpenLDAPHighSearchOperationRateSpike
16 |     annotations:
17 |       description: The rate of search operations in OpenLDAP on instance {{$labels.instance}}
18 |         has increased by {{ printf "%.0f" $value }} percent in the last 5 minutes,
19 |         compared to the average over the last 15 minutes, which is above the threshold
20 |         of 200 percent.
21 |       summary: A significant spike in OpenLDAP search operations indicates inefficient
22 |         queries, potential abuse, or unintended heavy load.
23 |     expr: "100 * (\n  rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[5m])
24 |       \n  / \n  clamp_min(rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[15m]
25 |       offset 5m), 0.0001)\n) > 200\n"
26 |     for: 5m
27 |     labels:
28 |       severity: warning
29 |   - alert: OpenLDAPDialFailures
30 |     annotations:
31 |       description: LDAP dial failures on instance {{$labels.instance}} have increased
32 |         by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold
33 |         of 10.
34 |       summary: Significant increase in LDAP dial failures indicates network issues,
35 |         problems with the LDAP service, or configuration errors that may lead to service
36 |         unavailability.
37 |     expr: |
38 |       increase(openldap_dial{result!="ok"}[10m]) > 10
39 |     for: 10m
40 |     labels:
41 |       severity: warning
42 |   - alert: OpenLDAPBindFailureRateIncrease
43 |     annotations:
44 |       description: LDAP bind failures on instance {{$labels.instance}} have increased
45 |         by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold
46 |         of 10.
47 |       summary: Significant increase in LDAP bind failures indicates authentication
48 |         issues, potential security threats or problems with user directories.
49 |     expr: |
50 |       increase(openldap_bind{result!="ok"}[10m]) > 10
51 |     for: 10m
52 |     labels:
53 |       severity: warning
54 | 


--------------------------------------------------------------------------------
/assets/openldap/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/opensearch/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/openstack/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/oracledb/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: OracleDBAlerts
 3 |   rules:
 4 |   - alert: OracledbReachingSessionLimit
 5 |     annotations:
 6 |       description: '{{ printf "%.2f" $value }}% of sessions are being utilized which
 7 |         is above the threshold 85%. This could mean that {{$labels.instance}} is being
 8 |         overutilized.'
 9 |       summary: The number of sessions being utilized exceeded 85%.
10 |     expr: |
11 |       oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: OracledbReachingProcessLimit
16 |     annotations:
17 |       description: '{{ printf "%.2f" $value }} of processes are being utilized which
18 |         is above thethreshold 85%. This could potentially mean that {{$labels.instance}}
19 |         runs out of processes it can spin up.'
20 |       summary: The number of processess being utilized exceeded the threshold of 85%.
21 |     expr: |
22 |       oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85
23 |     for: 5m
24 |     labels:
25 |       severity: critical
26 |   - alert: OracledbTablespaceReachingCapacity
27 |     annotations:
28 |       description: '{{ printf "%.2f" $value }}% of bytes are being utilized by the
29 |         tablespace {{$labels.tablespace}} on the instance {{$labels.instance}}, which
30 |         is above the threshold 85%.'
31 |       summary: A tablespace is exceeding more than 85% of its maximum allotted space.
32 |     expr: |
33 |       oracledb_tablespace_bytes / oracledb_tablespace_max_bytes * 100 > 85
34 |     for: 5m
35 |     labels:
36 |       severity: critical
37 | 


--------------------------------------------------------------------------------
/assets/oracledb/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/pgbouncer/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: pgbouncer
 3 |   rules:
 4 |   - alert: PGBouncerHighNumberClientWaitingConnections
 5 |     annotations:
 6 |       description: |
 7 |         The number of clients waiting for connections on {{ $labels.instance }} is now above 20. The current value is {{ $value | printf "%.2f" }}.
 8 |       summary: May indicate a bottleneck in connection pooling where too many clients
 9 |         are waiting for available server connections.
10 |     expr: |
11 |       pgbouncer_pools_client_waiting_connections{job="integrations/pgbouncer"} > 20
12 |     for: 5m
13 |     labels:
14 |       severity: warning
15 |   - alert: PGBouncerHighClientWaitTime
16 |     annotations:
17 |       description: |
18 |         The wait time for user connections on {{ $labels.instance }}, is above 15. The current value is {{ $value | printf "%.2f" }}.
19 |       summary: Clients are experiencing significant delays, which could indicate issues
20 |         with connection pool saturation or server performance.
21 |     expr: |
22 |       pgbouncer_pools_client_maxwait_seconds{job="integrations/pgbouncer"} > 15
23 |     for: 5m
24 |     labels:
25 |       severity: warning
26 |   - alert: PGBouncerHighServerConnectionSaturationWarning
27 |     annotations:
28 |       description: |
29 |         User connection capacity on {{ $labels.instance }}, is above 80%. The current value is {{ $value | printf "%.2f" }}.
30 |       summary: PGBouncer is nearing user connection capacity.
31 |     expr: |
32 |       100 * (sum without (database, user) (pgbouncer_pools_server_active_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_idle_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_used_connections{job="integrations/pgbouncer"}) / clamp_min(pgbouncer_config_max_user_connections{job="integrations/pgbouncer"},1)) > 80
33 |     for: 5m
34 |     labels:
35 |       severity: warning
36 |   - alert: PGBouncerHighServerConnectionSaturationCritical
37 |     annotations:
38 |       description: |
39 |         User connection capacity on {{ $labels.instance }}, is above 90%. The current value is {{ $value | printf "%.2f" }}.
40 |       summary: PGBouncer is nearing critical levels of user connection capacity.
41 |     expr: |
42 |       100 * (sum without (database, user) (pgbouncer_pools_server_active_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_idle_connections{job="integrations/pgbouncer"} + pgbouncer_pools_server_used_connections{job="integrations/pgbouncer"}) / clamp_min(pgbouncer_config_max_user_connections{job="integrations/pgbouncer"},1)) > 90
43 |     for: 5m
44 |     labels:
45 |       severity: critical
46 | 


--------------------------------------------------------------------------------
/assets/pgbouncer/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/postgres-exporter/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/presto/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/prometheus-operator/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/prometheus/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/promscale/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/promtail/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: promtail_alerts
 3 |   rules:
 4 |   - alert: PromtailRequestsErrors
 5 |     annotations:
 6 |       description: |
 7 |         {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
 8 |       summary: Promtail request error rate is high.
 9 |     expr: |
10 |       100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
11 |         /
12 |       sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
13 |         > 10
14 |     for: 15m
15 |     labels:
16 |       severity: critical
17 |   - alert: PromtailRequestLatency
18 |     annotations:
19 |       description: |
20 |         {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
21 |       summary: Promtail request latency P99 is high.
22 |     expr: |
23 |       job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
24 |     for: 15m
25 |     labels:
26 |       severity: critical
27 |   - alert: PromtailFileMissing
28 |     annotations:
29 |       description: |
30 |         {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
31 |       summary: Promtail cannot find a file it should be tailing.
32 |     expr: |
33 |       promtail_file_bytes_total unless promtail_read_bytes_total
34 |     for: 15m
35 |     labels:
36 |       severity: warning
37 | 


--------------------------------------------------------------------------------
/assets/promtail/rules.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: promtail_rules
 3 |   rules:
 4 |   - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
 5 |       by (le, job))
 6 |     record: job:promtail_request_duration_seconds:99quantile
 7 |   - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
 8 |       by (le, job))
 9 |     record: job:promtail_request_duration_seconds:50quantile
10 |   - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
11 |       by (job)
12 |     record: job:promtail_request_duration_seconds:avg
13 |   - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
14 |     record: job:promtail_request_duration_seconds_bucket:sum_rate
15 |   - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
16 |     record: job:promtail_request_duration_seconds_sum:sum_rate
17 |   - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
18 |     record: job:promtail_request_duration_seconds_count:sum_rate
19 |   - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
20 |       by (le, job, namespace))
21 |     record: job_namespace:promtail_request_duration_seconds:99quantile
22 |   - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
23 |       by (le, job, namespace))
24 |     record: job_namespace:promtail_request_duration_seconds:50quantile
25 |   - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
26 |       / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
27 |     record: job_namespace:promtail_request_duration_seconds:avg
28 |   - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
29 |     record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
30 |   - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
31 |     record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
32 |   - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
33 |     record: job_namespace:promtail_request_duration_seconds_count:sum_rate
34 |   - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
35 |       by (le, job, status_code, namespace))
36 |     record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
37 |   - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
38 |       by (le, job, status_code, namespace))
39 |     record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
40 |   - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
41 |       namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
42 |       status_code, namespace)
43 |     record: job_status_code_namespace:promtail_request_duration_seconds:avg
44 |   - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
45 |       namespace)
46 |     record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
47 |   - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
48 |       namespace)
49 |     record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
50 |   - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
51 |       namespace)
52 |     record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
53 | 


--------------------------------------------------------------------------------
/assets/python-runtime/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/python-runtime/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/rabbitmq/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: RabbitMQClusterAlerts
 3 |   rules:
 4 |   - alert: RabbitMQMemoryHigh
 5 |     annotations:
 6 |       description: A node {{ $labels.instance }} is using more than 90% of allocated
 7 |         RAM.
 8 |       summary: RabbitMQ memory usage is high.
 9 |     expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes
10 |       * 100 > 90
11 |     for: 5m
12 |     labels:
13 |       severity: warning
14 |   - alert: RabbitMQFileDescriptorsUsage
15 |     annotations:
16 |       description: A node {{ $labels.instance }} is using more than 90% of file descriptors.
17 |       summary: RabbitMQ file descriptors usage is high.
18 |     expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90
19 |     for: 5m
20 |     labels:
21 |       severity: warning
22 |   - alert: RabbitMQUnroutableMessages
23 |     annotations:
24 |       description: A queue has unroutable messages on {{ $labels.instance }}.
25 |       summary: A RabbitMQ queue has unroutable messages.
26 |     expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or
27 |       increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0
28 |     for: 5m
29 |     labels:
30 |       severity: warning
31 |   - alert: RabbitMQNodeNotDistributed
32 |     annotations:
33 |       description: "Distribution link state is not 'up' on {{ $labels.instance }},
34 |         current value is {{ $value }}. \nNote: The state is represented as a numerical
35 |         value where pending=1, up_pending=2 and up=3."
36 |       summary: RabbitMQ node not distributed, link state is down.
37 |     expr: erlang_vm_dist_node_state{rabbitmq_cluster!=""} < 3
38 |     for: 5m
39 |     labels:
40 |       severity: critical
41 | 


--------------------------------------------------------------------------------
/assets/rabbitmq/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/rclone/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/rclone/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/redis/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/redis/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ruby/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ruby/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/sap-hana/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/sealed-secrets/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: sealed-secrets
 3 |   rules:
 4 |   - alert: SealedSecretsUnsealErrorHigh
 5 |     annotations:
 6 |       description: High number of errors during unsealing Sealed Secrets in {{ $labels.namespace
 7 |         }} namespace.
 8 |       runbook_url: https://github.com/bitnami-labs/sealed-secrets
 9 |       summary: Sealed Secrets Unseal Error High
10 |     expr: |
11 |       sum by (reason, namespace) (rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0
12 |     labels:
13 |       severity: warning
14 | 


--------------------------------------------------------------------------------
/assets/sealed-secrets/rules.yaml:
--------------------------------------------------------------------------------
1 | groups: []
2 | 


--------------------------------------------------------------------------------
/assets/snmp/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/spark/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/spark/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/spinnaker/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: igor
 3 |   rules:
 4 |   - alert: PollingMonitorItemsOverThreshold
 5 |     annotations:
 6 |       description: '{{ $labels.monitor }} polling monitor for {{ $labels.partition
 7 |         }} threshold exceeded, preventing pipeline triggers.'
 8 |       runbook_url: https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds
 9 |       summary: Polling monitor item threshold exceeded.
10 |     expr: sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 | 


--------------------------------------------------------------------------------
/assets/spinnaker/rules.yaml:
--------------------------------------------------------------------------------
1 | groups: []
2 | 


--------------------------------------------------------------------------------
/assets/spring-boot/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: jvm-micrometer-jvm-alerts
 3 |   rules:
 4 |   - alert: JvmMemoryFillingUp
 5 |     annotations:
 6 |       description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
 7 |         last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
 8 |       summary: JVM heap memory filling up.
 9 |     expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
10 |       (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
11 |     for: 5m
12 |     keep_firing_for: 5m
13 |     labels:
14 |       severity: warning
15 | 


--------------------------------------------------------------------------------
/assets/spring-boot/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/squid/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: squid
 3 |   rules:
 4 |   - alert: SquidHighPercentageOfHTTPServerRequestErrors
 5 |     annotations:
 6 |       description: |
 7 |         The percentage of HTTP server request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5.
 8 |       summary: There are a high number of HTTP server errors.
 9 |     expr: |
10 |       rate(squid_server_http_errors_total[5m]) / clamp_min(rate(squid_server_http_requests_total[5m]),1) * 100 > 5
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 |   - alert: SquidHighPercentageOfFTPServerRequestErrors
15 |     annotations:
16 |       description: |
17 |         The percentage of FTP server request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5.
18 |       summary: There are a high number of FTP server request errors.
19 |     expr: |
20 |       rate(squid_server_ftp_errors_total[5m]) / clamp_min(rate(squid_server_ftp_requests_total[5m]),1) * 100 > 5
21 |     for: 5m
22 |     labels:
23 |       severity: critical
24 |   - alert: SquidHighPercentageOfOtherServerRequestErrors
25 |     annotations:
26 |       description: |
27 |         The percentage of other server request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5.
28 |       summary: There are a high number of other server request errors.
29 |     expr: |
30 |       rate(squid_server_other_errors_total[5m]) / clamp_min(rate(squid_server_other_requests_total[5m]),1) * 100 > 5
31 |     for: 5m
32 |     labels:
33 |       severity: critical
34 |   - alert: SquidHighPercentageOfClientRequestErrors
35 |     annotations:
36 |       description: |
37 |         The percentage of HTTP client request errors is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} which is above the threshold of 5.
38 |       summary: There are a high number of HTTP client request errors.
39 |     expr: |
40 |       rate(squid_client_http_errors_total[5m]) / clamp_min(rate(squid_client_http_requests_total[5m]),1) * 100 > 5
41 |     for: 5m
42 |     labels:
43 |       severity: critical
44 |   - alert: SquidLowCacheHitRatio
45 |     annotations:
46 |       description: |
47 |         The cache hit ratio is {{ printf "%.0f" $value }} over the last 10m on {{ $labels.instance }} which is below the threshold of 85.
48 |       summary: The cache hit ratio has fallen below the configured threshold (%).
49 |     expr: |
50 |       rate(squid_client_http_hits_total[10m]) / clamp_min(rate(squid_client_http_requests_total[10m]),1) * 100 < 85
51 |     for: 10m
52 |     labels:
53 |       severity: warning
54 | 


--------------------------------------------------------------------------------
/assets/squid/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/supabase/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/supabase/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/tensorflow/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: TensorFlowServingAlerts
 3 |   rules:
 4 |   - alert: TensorFlowModelRequestHighErrorRate
 5 |     annotations:
 6 |       description: '{{ printf "%.2f" $value }}% of all model requests are not successful,
 7 |         which is above the threshold 30%, indicating a potentially larger issue for
 8 |         {{$labels.instance}}'
 9 |       summary: More than 30% of all model requests are not successful.
10 |     expr: |
11 |       100 * sum(rate(:tensorflow:serving:request_count{status!="OK"}[5m])) by (instance) / sum(rate(:tensorflow:serving:request_count[5m])) by (instance) > 30
12 |     for: 5m
13 |     labels:
14 |       severity: critical
15 |   - alert: TensorFlowServingHighBatchQueuingLatency
16 |     annotations:
17 |       description: Batch queuing latency greater than {{ printf "%.2f" $value }}µs,
18 |         which is above the threshold 5000000µs, indicating a potentially larger issue
19 |         for {{$labels.instance}}
20 |       summary: Batch queuing latency more than 5000000µs.
21 |     expr: |
22 |       increase(:tensorflow:serving:batching_session:queuing_latency_sum[2m]) / increase(:tensorflow:serving:batching_session:queuing_latency_count[2m]) > 5000000
23 |     for: 5m
24 |     labels:
25 |       severity: warning
26 | 


--------------------------------------------------------------------------------
/assets/tensorflow/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/traefik/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/traefik/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ubnt-edgerouter/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/ubnt-edgerouter/rules.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: ubnt.rules
3 |   rules:
4 |   - expr: label_join(ifAdminStatus,"nicename", ":", "ifName", "ifAlias")
5 |     record: ifNiceName
6 | 


--------------------------------------------------------------------------------
/assets/varnish/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: varnish-cache
 3 |   rules:
 4 |   - alert: VarnishCacheLowCacheHitRate
 5 |     annotations:
 6 |       description: The Cache hit rate is {{ printf "%.0f" $value }} percent over the
 7 |         last 5 minutes on {{$labels.instance}}, which is below the threshold of 80
 8 |         percent.
 9 |       summary: Cache is not answering a sufficient percentage of read requests.
10 |     expr: |
11 |       increase(varnish_main_cache_hit[10m]) / (clamp_min((increase(varnish_main_cache_hit[10m]) + increase(varnish_main_cache_miss[10m])), 1)) * 100 < 80 and (increase(varnish_main_cache_hit[10m]) + increase(varnish_main_cache_miss[10m]) > 0)
12 |     for: 10m
13 |     labels:
14 |       severity: warning
15 |   - alert: VarnishCacheHighMemoryUsage
16 |     annotations:
17 |       description: Current Memory Usage is {{ printf "%.0f" $value }} percent on {{$labels.instance}},
18 |         which is above the threshold of 90 percent.
19 |       summary: Varnish Cache is running low on available memory.
20 |     expr: |
21 |       (varnish_sma_g_bytes{type="s0"} / (varnish_sma_g_bytes{type="s0"} + varnish_sma_g_space{type="s0"})) * 100 > 90
22 |     for: 5m
23 |     labels:
24 |       severity: warning
25 |   - alert: VarnishCacheHighCacheEvictionRate
26 |     annotations:
27 |       description: The Cache has evicted {{ printf "%.0f" $value }} objects over the
28 |         last 5 minutes on {{$labels.instance}}, which is above the threshold of 0.
29 |       summary: The cache is evicting too many objects.
30 |     expr: |
31 |       increase(varnish_main_n_lru_nuked[5m]) > 0
32 |     for: 5m
33 |     labels:
34 |       severity: critical
35 |   - alert: VarnishCacheHighSaturation
36 |     annotations:
37 |       description: The thread queue length is {{ printf "%.0f" $value }} over the
38 |         last 5 minutes on {{$labels.instance}}, which is above the threshold of 0.
39 |       summary: There are too many threads in queue, Varnish is saturated and responses
40 |         are slowed.
41 |     expr: |
42 |       varnish_main_thread_queue_len > 0
43 |     for: 5m
44 |     labels:
45 |       severity: warning
46 |   - alert: VarnishCacheSessionsDropping
47 |     annotations:
48 |       description: The amount of sessions dropped is {{ printf "%.0f" $value }} over
49 |         the last 5 minutes on {{$labels.instance}}, which is above the threshold of
50 |         0.
51 |       summary: Incoming requests are being dropped due to a lack of free worker threads.
52 |     expr: |
53 |       increase(varnish_main_sessions{type="dropped"}[5m]) > 0
54 |     for: 5m
55 |     labels:
56 |       severity: critical
57 |   - alert: VarnishCacheBackendUnhealthy
58 |     annotations:
59 |       description: The amount of unhealthy backend statuses detected is {{ printf
60 |         "%.0f" $value }} over the last 5 minutes on {{$labels.instance}}, which is
61 |         above the threshold of 0.
62 |       summary: Backend has been marked as unhealthy due to slow 200 responses.
63 |     expr: |
64 |       increase(varnish_main_backend_unhealthy[5m]) > 0
65 |     for: 5m
66 |     labels:
67 |       severity: critical
68 | 


--------------------------------------------------------------------------------
/assets/varnish/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/vault/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/vault/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/velero/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: velero
 3 |   rules:
 4 |   - alert: VeleroBackupFailure
 5 |     annotations:
 6 |       description: |
 7 |         Backup failures detected on {{ $labels.instance }}. This could lead to data loss or inability to recover in case of a disaster.
 8 |       summary: Velero backup failures detected.
 9 |     expr: |
10 |       increase(velero_backup_failure_total{job="integrations/velero"}[5m]) > 0
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 |   - alert: VeleroHighBackupDuration
15 |     annotations:
16 |       description: |
17 |         Backup duration on {{ $labels.instance }} is higher than the average duration over the past 48 hours. This could indicate performance issues or network congestion. The current value is {{ $value | printf "%.2f" }} seconds.
18 |       summary: Velero backups taking longer than usual.
19 |     expr: |
20 |       histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[5m])) by (le, schedule)) > 1.2 * 1.2 * avg_over_time(histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[48h])) by (le, schedule))[5m:])
21 |     for: 5m
22 |     labels:
23 |       severity: warning
24 |   - alert: VeleroHighRestoreFailureRate
25 |     annotations:
26 |       description: |
27 |         Restore failures detected on {{ $labels.instance }}. This could prevent timely data recovery and business continuity.
28 |       summary: Velero restore failures detected.
29 |     expr: |
30 |       increase(velero_restore_failed_total{job="integrations/velero"}[5m]) > 0
31 |     for: 5m
32 |     labels:
33 |       severity: critical
34 |   - alert: VeleroUpStatus
35 |     annotations:
36 |       description: "Cannot find any metrics related to Velero on {{ $labels.instance
37 |         }}. This may indicate further issues with Velero or the scraping agent. \n"
38 |       summary: Velero is down.
39 |     expr: |
40 |       up{job="integrations/velero"} != 0
41 |     for: 5m
42 |     labels:
43 |       severity: critical
44 | 


--------------------------------------------------------------------------------
/assets/velero/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/wildfly/alerts.yaml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: wildfly
 3 |   rules:
 4 |   - alert: HighPercentageOfErrorResponses
 5 |     annotations:
 6 |       description: |
 7 |         The percentage of error responses is {{ printf "%.2f" $value }} on {{ $labels.instance }} - {{ $labels.server }} which is higher than {{30 }}.
 8 |       summary: Large percentage of requests are resulting in 5XX responses.
 9 |     expr: |
10 |       sum by (job, instance, server) (increase(wildfly_undertow_error_count_total{}[5m]) / increase(wildfly_undertow_request_count_total{}[5m])) * 100 > 30
11 |     for: 5m
12 |     labels:
13 |       severity: critical
14 |   - alert: HighNumberOfRejectedSessionsForDeployment
15 |     annotations:
16 |       description: |
17 |         Deployemnt {{ $labels.deployment }} on {{ $labels.instance }} is exceeding the threshold for rejected sessions {{ printf "%.0f" $value }} is higher than 20.
18 |       summary: Large number of sessions are being rejected for a deployment.
19 |     expr: |
20 |       sum by (deployment, instance, job) (increase(wildfly_undertow_rejected_sessions_total{}[5m])) > 20
21 |     for: 5m
22 |     labels:
23 |       severity: critical
24 | 


--------------------------------------------------------------------------------
/assets/wildfly/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/windows-active-directory/rules.yaml:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/assets/windows/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/wso2-enterprise-integrator/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/wso2-enterprise-integrator/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/wso2-streaming-integrator/alerts.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/assets/wso2-streaming-integrator/rules.yaml:
--------------------------------------------------------------------------------
1 | null
2 | 


--------------------------------------------------------------------------------
/hack/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/monitoring-mixins/website/hack
 2 | 
 3 | go 1.23
 4 | 
 5 | require (
 6 | 	github.com/brancz/gojsontoyaml v0.0.0-20191212081931-bf2969bbd742
 7 | 	github.com/google/go-jsonnet v0.20.0
 8 | 	github.com/jsonnet-bundler/jsonnet-bundler v0.6.0
 9 | )
10 | 
11 | require (
12 | 	github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
13 | 	github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect
14 | 	github.com/fatih/color v1.13.0 // indirect
15 | 	github.com/ghodss/yaml v1.0.0 // indirect
16 | 	github.com/mattn/go-colorable v0.1.12 // indirect
17 | 	github.com/mattn/go-isatty v0.0.14 // indirect
18 | 	github.com/pkg/errors v0.9.1 // indirect
19 | 	golang.org/x/sys v0.1.0 // indirect
20 | 	gopkg.in/alecthomas/kingpin.v2 v2.2.6 // indirect
21 | 	gopkg.in/yaml.v2 v2.2.7 // indirect
22 | 	sigs.k8s.io/yaml v1.1.0 // indirect
23 | )
24 | 


--------------------------------------------------------------------------------
/hack/tools.go:
--------------------------------------------------------------------------------
 1 | //+build tools
 2 | 
 3 | // Package tools tracks dependencies for tools that used in the build process.
 4 | // See https://github.com/golang/go/wiki/Modules
 5 | package hack
 6 | 
 7 | import (
 8 | 	_ "github.com/brancz/gojsontoyaml"
 9 | 	_ "github.com/google/go-jsonnet/cmd/jsonnet"
10 | 	_ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb"
11 | )
12 | 


--------------------------------------------------------------------------------
/netlify.toml:
--------------------------------------------------------------------------------
 1 | [build]
 2 | base = "site/"
 3 | publish = "public"
 4 | command = "hugo --gc --minify"
 5 | 
 6 | [context.production.environment]
 7 | HUGO_VERSION = "0.70.0"
 8 | HUGO_ENV = "production"
 9 | HUGO_ENABLEGITINFO = "true"
10 | 
11 | [context.split1]
12 | command = "hugo --gc --minify --enableGitInfo"
13 | 
14 | [context.split1.environment]
15 | HUGO_VERSION = "0.70.0"
16 | HUGO_ENV = "production"
17 | 
18 | [context.deploy-preview]
19 | command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL"
20 | 
21 | [context.deploy-preview.environment]
22 | HUGO_VERSION = "0.70.0"
23 | 
24 | [context.branch-deploy]
25 | command = "hugo --gc --minify -b $DEPLOY_PRIME_URL"
26 | 
27 | [context.branch-deploy.environment]
28 | HUGO_VERSION = "0.70.0"
29 | 
30 | [context.next.environment]
31 | HUGO_ENABLEGITINFO = "true"
32 | 


--------------------------------------------------------------------------------
/site/config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | baseURL: "https://monitoring.mixins.dev/"
 3 | # baseURL: ""
 4 | languageCode: "en-us"
 5 | title: "Monitoring Mixins"
 6 | 
 7 | theme: 'ace-documentation'
 8 | 
 9 | # Google analytics
10 | # googleAnalytics: UA-123456789-1
11 | 
12 | permalinks:
13 |   post: /:filename/
14 | 
15 | params:
16 |   project_name: Monitoring Mixins
17 | 
18 |   project_tagline: Combination of alerts, recording rules, and dashboards for prometheus exporters
19 | 
20 |   disableSearch: true
21 |   disableReadmoreNav: true
22 | 
23 | markup:
24 |   highlight:
25 |     style: monokailight
26 | 
27 | menu:
28 |   shortcuts:
29 |     - name: Homepage
30 |       url: /
31 |       weight: 1
32 |     - name: About mixins
33 |       url: "https://github.com/monitoring-mixins/docs"
34 |       weight: 2
35 |     - name: "GitHub"
36 |       url: "https://github.com/monitoring-mixins/website"
37 |       weight: 3
38 | 
39 | 


--------------------------------------------------------------------------------
/site/content/MSSQL/_index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: MSSQL
  3 | ---
  4 | 
  5 | ## Overview
  6 | 
  7 | 
  8 | 
  9 | {{< panel style="danger" >}}
 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/mssql-mixin)
 11 | {{< /panel >}}
 12 | 
 13 | ## Alerts
 14 | 
 15 | {{< panel style="warning" >}}
 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/MSSQL/alerts.yaml).
 17 | {{< /panel >}}
 18 | 
 19 | ### MSSQLAlerts
 20 | 
 21 | ##### MSSQLHighNumberOfDeadlocks
 22 | 
 23 | {{< code lang="yaml" >}}
 24 | alert: MSSQLHighNumberOfDeadlocks
 25 | annotations:
 26 |   description: '{{ printf "%.2f" $value }} deadlocks have occurred over the last 5
 27 |     minutes on {{$labels.instance}}, which is above threshold of 10 deadlocks.'
 28 |   summary: There are deadlocks ocurring in the database.
 29 | expr: |
 30 |   increase(mssql_deadlocks_total{}[5m]) > 10
 31 | for: 5m
 32 | labels:
 33 |   severity: warning
 34 | {{< /code >}}
 35 |  
 36 | ##### MSSQLModerateReadStallTime
 37 | 
 38 | {{< code lang="yaml" >}}
 39 | alert: MSSQLModerateReadStallTime
 40 | annotations:
 41 |   description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on {{$labels.instance}},
 42 |     which is above threshold of 200ms.'
 43 |   summary: There is a moderate amount of IO stall for database reads.
 44 | expr: |
 45 |   1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 200
 46 | for: 5m
 47 | labels:
 48 |   severity: warning
 49 | {{< /code >}}
 50 |  
 51 | ##### MSSQLHighReadStallTime
 52 | 
 53 | {{< code lang="yaml" >}}
 54 | alert: MSSQLHighReadStallTime
 55 | annotations:
 56 |   description: '{{ printf "%.2f" $value }}ms of IO read stall has occurred on {{$labels.instance}},
 57 |     which is above threshold of 400ms.'
 58 |   summary: There is a high amount of IO stall for database reads.
 59 | expr: |
 60 |   1000 * increase(mssql_io_stall_seconds_total{operation="read"}[5m]) > 400
 61 | for: 5m
 62 | labels:
 63 |   severity: critical
 64 | {{< /code >}}
 65 |  
 66 | ##### MSSQLModerateWriteStallTime
 67 | 
 68 | {{< code lang="yaml" >}}
 69 | alert: MSSQLModerateWriteStallTime
 70 | annotations:
 71 |   description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on {{$labels.instance}},
 72 |     which is above threshold of 200ms.'
 73 |   summary: There is a moderate amount of IO stall for database writes.
 74 | expr: |
 75 |   1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 200
 76 | for: 5m
 77 | labels:
 78 |   severity: warning
 79 | {{< /code >}}
 80 |  
 81 | ##### MSSQLHighWriteStallTime
 82 | 
 83 | {{< code lang="yaml" >}}
 84 | alert: MSSQLHighWriteStallTime
 85 | annotations:
 86 |   description: '{{ printf "%.2f" $value }}ms of IO write stall has occurred on {{$labels.instance}},
 87 |     which is above threshold of 400ms.'
 88 |   summary: There is a high amount of IO stall for database writes.
 89 | expr: |
 90 |   1000 * increase(mssql_io_stall_seconds_total{operation="write"}[5m]) > 400
 91 | for: 5m
 92 | labels:
 93 |   severity: critical
 94 | {{< /code >}}
 95 |  
 96 | ## Dashboards
 97 | Following dashboards are generated from mixins and hosted on github:
 98 | 
 99 | 
100 | - [mssql-overview](https://github.com/monitoring-mixins/website/blob/master/assets/MSSQL/dashboards/mssql-overview.json)
101 | - [mssql-pages](https://github.com/monitoring-mixins/website/blob/master/assets/MSSQL/dashboards/mssql-pages.json)
102 | 


--------------------------------------------------------------------------------
/site/content/apache-airflow/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: apache-airflow
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-airflow-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-airflow/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### apache-airflow
20 | 
21 | ##### ApacheAirflowStarvingPoolTasks
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: ApacheAirflowStarvingPoolTasks
25 | annotations:
26 |   description: |
27 |     The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0.
28 |   summary: There are starved tasks detected in the Apache Airflow pool.
29 | expr: |
30 |   airflow_pool_starving_tasks > 0
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### ApacheAirflowDAGScheduleDelayWarningLevel
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: ApacheAirflowDAGScheduleDelayWarningLevel
40 | annotations:
41 |   description: |
42 |     The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10.
43 |   summary: The delay in DAG schedule time to DAG run time has reached the warning
44 |     threshold.
45 | expr: |
46 |   increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10
47 | for: 1m
48 | labels:
49 |   severity: warning
50 | {{< /code >}}
51 |  
52 | ##### ApacheAirflowDAGScheduleDelayCriticalLevel
53 | 
54 | {{< code lang="yaml" >}}
55 | alert: ApacheAirflowDAGScheduleDelayCriticalLevel
56 | annotations:
57 |   description: |
58 |     The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60.
59 |   summary: The delay in DAG schedule time to DAG run time has reached the critical
60 |     threshold.
61 | expr: |
62 |   increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60
63 | for: 1m
64 | labels:
65 |   severity: critical
66 | {{< /code >}}
67 |  
68 | ##### ApacheAirflowDAGFailures
69 | 
70 | {{< code lang="yaml" >}}
71 | alert: ApacheAirflowDAGFailures
72 | annotations:
73 |   description: |
74 |     The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0.
75 |   summary: There have been DAG failures detected.
76 | expr: |
77 |   increase(airflow_dagrun_duration_failed_count[5m]) > 0
78 | for: 1m
79 | labels:
80 |   severity: critical
81 | {{< /code >}}
82 |  
83 | ## Dashboards
84 | Following dashboards are generated from mixins and hosted on github:
85 | 
86 | 
87 | - [apache-airflow-overview](https://github.com/monitoring-mixins/website/blob/master/assets/apache-airflow/dashboards/apache-airflow-overview.json)
88 | 


--------------------------------------------------------------------------------
/site/content/apache-camel/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: apache-camel
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-camel-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [apache-camel-micrometer](https://github.com/monitoring-mixins/website/blob/master/assets/apache-camel/dashboards/apache-camel-micrometer.json)
18 | 


--------------------------------------------------------------------------------
/site/content/apache-http/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: apache-http
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-http-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-http/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### apache-http
20 | 
21 | ##### ApacheDown
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: ApacheDown
25 | annotations:
26 |   description: Apache is down on {{ $labels.instance }}.
27 |   summary: Apache is down.
28 | expr: apache_up == 0
29 | for: 5m
30 | labels:
31 |   severity: warning
32 | {{< /code >}}
33 |  
34 | ##### ApacheRestart
35 | 
36 | {{< code lang="yaml" >}}
37 | alert: ApacheRestart
38 | annotations:
39 |   description: Apache has just been restarted on {{ $labels.instance }}.
40 |   summary: Apache restart.
41 | expr: apache_uptime_seconds_total / 60 < 1
42 | for: "0"
43 | labels:
44 |   severity: info
45 | {{< /code >}}
46 |  
47 | ##### ApacheWorkersLoad
48 | 
49 | {{< code lang="yaml" >}}
50 | alert: ApacheWorkersLoad
51 | annotations:
52 |   description: |
53 |     Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}.
54 |     The current value is {{ $value }}%.
55 |   summary: Apache workers load is too high.
56 | expr: |
57 |   (sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80
58 | for: 15m
59 | labels:
60 |   severity: warning
61 | {{< /code >}}
62 |  
63 | ##### ApacheResponseTimeTooHigh
64 | 
65 | {{< code lang="yaml" >}}
66 | alert: ApacheResponseTimeTooHigh
67 | annotations:
68 |   description: |
69 |     Apache average response time is above the threshold of 5000 ms on {{ $labels.instance }}.
70 |     The current value is {{ $value }} ms.
71 |   summary: Apache response time is too high.
72 | expr: |
73 |   increase(apache_duration_ms_total[5m])/increase(apache_accesses_total[5m]) > 5000
74 | for: 15m
75 | labels:
76 |   severity: warning
77 | {{< /code >}}
78 |  
79 | ## Dashboards
80 | Following dashboards are generated from mixins and hosted on github:
81 | 
82 | 
83 | - [apache-http](https://github.com/monitoring-mixins/website/blob/master/assets/apache-http/dashboards/apache-http.json)
84 | 


--------------------------------------------------------------------------------
/site/content/apache-mesos/_index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: apache-mesos
  3 | ---
  4 | 
  5 | ## Overview
  6 | 
  7 | 
  8 | 
  9 | {{< panel style="danger" >}}
 10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-mesos-mixin)
 11 | {{< /panel >}}
 12 | 
 13 | ## Alerts
 14 | 
 15 | {{< panel style="warning" >}}
 16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-mesos/alerts.yaml).
 17 | {{< /panel >}}
 18 | 
 19 | ### apache-mesos
 20 | 
 21 | ##### ApacheMesosHighMemoryUsage
 22 | 
 23 | {{< code lang="yaml" >}}
 24 | alert: ApacheMesosHighMemoryUsage
 25 | annotations:
 26 |   description: '{{ printf "%.0f" $value }} percent memory usage on {{$labels.mesos_cluster}},
 27 |     which is above the threshold of 90.'
 28 |   summary: There is a high memory usage for the cluster.
 29 | expr: |
 30 |   min without(instance, job, type) (mesos_master_mem{type="percent"}) > 90
 31 | for: 5m
 32 | labels:
 33 |   severity: warning
 34 | {{< /code >}}
 35 |  
 36 | ##### ApacheMesosHighDiskUsage
 37 | 
 38 | {{< code lang="yaml" >}}
 39 | alert: ApacheMesosHighDiskUsage
 40 | annotations:
 41 |   description: '{{ printf "%.0f" $value }} percent disk usage on {{$labels.mesos_cluster}},
 42 |     which is above the threshold of 90.'
 43 |   summary: There is a high disk usage for the cluster.
 44 | expr: |
 45 |   min without(instance, job, type) (mesos_master_disk{type="percent"}) > 90
 46 | for: 5m
 47 | labels:
 48 |   severity: critical
 49 | {{< /code >}}
 50 |  
 51 | ##### ApacheMesosUnreachableTasks
 52 | 
 53 | {{< code lang="yaml" >}}
 54 | alert: ApacheMesosUnreachableTasks
 55 | annotations:
 56 |   description: '{{ printf "%.0f" $value }} unreachable tasks on {{$labels.mesos_cluster}},
 57 |     which is above the threshold of 3.'
 58 |   summary: There are an unusually high number of unreachable tasks.
 59 | expr: |
 60 |   max without(instance, job, state) (mesos_master_task_states_current{state="unreachable"}) > 3
 61 | for: 5m
 62 | labels:
 63 |   severity: warning
 64 | {{< /code >}}
 65 |  
 66 | ##### ApacheMesosNoLeaderElected
 67 | 
 68 | {{< code lang="yaml" >}}
 69 | alert: ApacheMesosNoLeaderElected
 70 | annotations:
 71 |   description: There is no cluster coordinator on {{$labels.mesos_cluster}}.
 72 |   summary: There is currently no cluster coordinator.
 73 | expr: |
 74 |   max without(instance, job) (mesos_master_elected) == 0
 75 | for: 1m
 76 | labels:
 77 |   severity: critical
 78 | {{< /code >}}
 79 |  
 80 | ##### ApacheMesosInactiveAgents
 81 | 
 82 | {{< code lang="yaml" >}}
 83 | alert: ApacheMesosInactiveAgents
 84 | annotations:
 85 |   description: '{{ printf "%.0f" $value }} inactive agent clients over the last 5m
 86 |     which is above the threshold of 1.'
 87 |   summary: There are currently inactive agent clients.
 88 | expr: |
 89 |   max without(instance, job, state) (mesos_master_slaves_state{state=~"connected_inactive|disconnected_inactive"}) > 1
 90 | for: 5m
 91 | labels:
 92 |   severity: warning
 93 | {{< /code >}}
 94 |  
 95 | ## Dashboards
 96 | Following dashboards are generated from mixins and hosted on github:
 97 | 
 98 | 
 99 | - [apache-mesos-overview](https://github.com/monitoring-mixins/website/blob/master/assets/apache-mesos/dashboards/apache-mesos-overview.json)
100 | 


--------------------------------------------------------------------------------
/site/content/apache-tomcat/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: apache-tomcat
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/apache-tomcat-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/apache-tomcat/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### ApacheTomcatAlerts
20 | 
21 | ##### ApacheTomcatAlertsHighCpuUsage
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: ApacheTomcatAlertsHighCpuUsage
25 | annotations:
26 |   description: The CPU usage has been at {{ printf "%.0f" $value }} percent over the
27 |     last 5 minutes on {{$labels.instance}}, which is above the threshold of 80 percent.
28 |   summary: The instance has a CPU usage higher than the configured threshold.
29 | expr: |
30 |   sum by (job,instance) (jvm_process_cpu_load{job="integrations/tomcat"}) > 80
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### ApacheTomcatAlertsHighMemoryUsage
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: ApacheTomcatAlertsHighMemoryUsage
40 | annotations:
41 |   description: The memory usage has been at {{ printf "%.0f" $value }} percent over
42 |     the last 5 minutes on {{$labels.instance}}, which is above the threshold of 80
43 |     percent.
44 |   summary: The instance has a higher memory usage than the configured threshold.
45 | expr: |
46 |   sum(jvm_memory_usage_used_bytes{job="integrations/tomcat"}) by (job,instance) / sum(jvm_physical_memory_bytes{job="integrations/tomcat"}) by (job,instance) * 100 > 80
47 | for: 5m
48 | labels:
49 |   severity: critical
50 | {{< /code >}}
51 |  
52 | ##### ApacheTomcatAlertsHighRequestErrorPercent
53 | 
54 | {{< code lang="yaml" >}}
55 | alert: ApacheTomcatAlertsHighRequestErrorPercent
56 | annotations:
57 |   description: The percentage of request errors has been at {{ printf "%.0f" $value
58 |     }} percent over the last 5 minutes on {{$labels.instance}}, which is above the
59 |     threshold of 5 percent.
60 |   summary: There are a high number of request errors.
61 | expr: |
62 |   sum by (job,instance) (increase(tomcat_errorcount_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m]) * 100) > 5
63 | for: 5m
64 | labels:
65 |   severity: critical
66 | {{< /code >}}
67 |  
68 | ##### ApacheTomcatAlertsModeratelyHighProcessingTime
69 | 
70 | {{< code lang="yaml" >}}
71 | alert: ApacheTomcatAlertsModeratelyHighProcessingTime
72 | annotations:
73 |   description: The processing time has been at {{ printf "%.0f" $value }}ms over the
74 |     last 5 minutes on {{$labels.instance}}, which is above the threshold of 300ms.
75 |   summary: The processing time has been moderately high.
76 | expr: |
77 |   sum by (job,instance) (increase(tomcat_processingtime_total{job="integrations/tomcat"}[5m]) / increase(tomcat_requestcount_total{job="integrations/tomcat"}[5m])) > 300
78 | for: 5m
79 | labels:
80 |   severity: warning
81 | {{< /code >}}
82 |  
83 | ## Dashboards
84 | Following dashboards are generated from mixins and hosted on github:
85 | 
86 | 
87 | - [apache-tomcat-hosts](https://github.com/monitoring-mixins/website/blob/master/assets/apache-tomcat/dashboards/apache-tomcat-hosts.json)
88 | - [apache-tomcat-overview](https://github.com/monitoring-mixins/website/blob/master/assets/apache-tomcat/dashboards/apache-tomcat-overview.json)
89 | 


--------------------------------------------------------------------------------
/site/content/argocd/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: argocd
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/argocd-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/argocd/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### ArgoCD
20 | 
21 | ##### ArgoAppOutOfSync
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: ArgoAppOutOfSync
25 | annotations:
26 |   description: Application {{ $labels.name }} has sync status as {{ $labels.sync_status
27 |     }}.
28 |   summary: Application is OutOfSync.
29 | expr: argocd_app_info{sync_status="OutOfSync"} == 1
30 | for: 1m
31 | labels:
32 |   severity: warning
33 | {{< /code >}}
34 |  
35 | ##### ArgoAppSyncFailed
36 | 
37 | {{< code lang="yaml" >}}
38 | alert: ArgoAppSyncFailed
39 | annotations:
40 |   description: Application {{ $labels.name }} has sync phase as {{ $labels.phase }}.
41 |   summary: Application Sync Failed.
42 | expr: argocd_app_sync_total{phase!="Succeeded"} == 1
43 | for: 1m
44 | labels:
45 |   severity: warning
46 | {{< /code >}}
47 |  
48 | ##### ArgoAppMissing
49 | 
50 | {{< code lang="yaml" >}}
51 | alert: ArgoAppMissing
52 | annotations:
53 |   description: "ArgoCD has not reported any applications data for the past 15 minutes
54 |     which means that it must be down or not functioning properly.  
55 | "
56 |   summary: No reported applications in ArgoCD.
57 | expr: absent(argocd_app_info)
58 | for: 15m
59 | labels:
60 |   severity: critical
61 | {{< /code >}}
62 |  
63 | ## Dashboards
64 | Following dashboards are generated from mixins and hosted on github:
65 | 
66 | 
67 | - [argocd-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argocd/dashboards/argocd-overview.json)
68 | 


--------------------------------------------------------------------------------
/site/content/asterisk/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: asterisk
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/asterisk-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/asterisk/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### AsteriskAlerts
20 | 
21 | ##### AsteriskRestarted
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: AsteriskRestarted
25 | annotations:
26 |   description: |-
27 |     Asterisk instance restarted in the last minute
28 |       VALUE = {{ $value }}
29 |       LABELS = {{ $labels }}
30 |   summary: Asterisk instance restarted in the last minute.
31 | expr: asterisk_core_uptime_seconds < 60
32 | for: 5s
33 | labels:
34 |   severity: critical
35 | {{< /code >}}
36 |  
37 | ##### AsteriskReloaded
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: AsteriskReloaded
41 | annotations:
42 |   description: |-
43 |     Asterisk instance reloaded in the last minute
44 |       VALUE = {{ $value }}
45 |       LABELS = {{ $labels }}
46 |   summary: Asterisk instance reloaded in the last minute.
47 | expr: asterisk_core_last_reload_seconds < 60
48 | for: 5s
49 | labels:
50 |   severity: warning
51 | {{< /code >}}
52 |  
53 | ##### AsteriskHighScrapeTime
54 | 
55 | {{< code lang="yaml" >}}
56 | alert: AsteriskHighScrapeTime
57 | annotations:
58 |   description: |-
59 |     Asterisk instance core high scrape time (Possible system performance degradation)
60 |       VALUE = {{ $value }}
61 |       LABELS = {{ $labels }}
62 |   summary: Asterisk instance core high scrape time.
63 | expr: asterisk_core_scrape_time_ms > 100
64 | for: 10s
65 | labels:
66 |   severity: critical
67 | {{< /code >}}
68 |  
69 | ##### AsteriskHighActiveCallsCount
70 | 
71 | {{< code lang="yaml" >}}
72 | alert: AsteriskHighActiveCallsCount
73 | annotations:
74 |   description: |-
75 |     Asterisk high active call count
76 |       VALUE = {{ $value }}
77 |       LABELS = {{ $labels }}
78 |   summary: Asterisk high active call count.
79 | expr: asterisk_calls_count > 100
80 | for: 10s
81 | labels:
82 |   severity: warning
83 | {{< /code >}}
84 |  
85 | ## Dashboards
86 | Following dashboards are generated from mixins and hosted on github:
87 | 
88 | 
89 | - [asterisk-logs](https://github.com/monitoring-mixins/website/blob/master/assets/asterisk/dashboards/asterisk-logs.json)
90 | - [asterisk-overview](https://github.com/monitoring-mixins/website/blob/master/assets/asterisk/dashboards/asterisk-overview.json)
91 | 


--------------------------------------------------------------------------------
/site/content/awx/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: awx
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/awx-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [awx](https://github.com/monitoring-mixins/website/blob/master/assets/awx/dashboards/awx.json)
18 | 


--------------------------------------------------------------------------------
/site/content/blackbox_exporter/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: blackbox_exporter
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/adinhodovic/blackbox-exporter-mixin/](https://github.com/adinhodovic/blackbox-exporter-mixin/)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/blackbox_exporter/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### blackbox-exporter.rules
20 | 
21 | ##### BlackboxProbeFailed
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: BlackboxProbeFailed
25 | annotations:
26 |   dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{
27 |     $labels.instance }}
28 |   description: The probe failed for the instance {{ $labels.instance }}.
29 |   summary: Probe has failed for the past 1m interval.
30 | expr: |
31 |   probe_success{job="blackbox-exporter"} == 0
32 | for: 1m
33 | labels:
34 |   severity: critical
35 | {{< /code >}}
36 |  
37 | ##### BlackboxLowUptime30d
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: BlackboxLowUptime30d
41 | annotations:
42 |   dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{
43 |     $labels.instance }}
44 |   description: The probe has a lower uptime than 99.9% the last 30 days for the instance
45 |     {{ $labels.instance }}.
46 |   summary: Probe uptime is lower than 99.9% for the last 30 days.
47 | expr: |
48 |   avg_over_time(probe_success{job="blackbox-exporter"}[30d]) * 100 < 99.900000000000006
49 | labels:
50 |   severity: info
51 | {{< /code >}}
52 |  
53 | ##### BlackboxSslCertificateWillExpireSoon
54 | 
55 | {{< code lang="yaml" >}}
56 | alert: BlackboxSslCertificateWillExpireSoon
57 | annotations:
58 |   dashboard_url: https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?var-instance={{
59 |     $labels.instance }}
60 |   description: |
61 |     The SSL certificate of the instance {{ $labels.instance }} is expiring within 21 days.
62 |     Actual time left: {{ $value | humanizeDuration }}.
63 |   summary: SSL certificate will expire soon.
64 | expr: |
65 |   probe_ssl_earliest_cert_expiry{job="blackbox-exporter"} - time() < 21 * 24 * 3600
66 | labels:
67 |   severity: warning
68 | {{< /code >}}
69 |  
70 | ## Dashboards
71 | Following dashboards are generated from mixins and hosted on github:
72 | 
73 | 
74 | - [blackbox-exporter](https://github.com/monitoring-mixins/website/blob/master/assets/blackbox_exporter/dashboards/blackbox-exporter.json)
75 | 


--------------------------------------------------------------------------------
/site/content/caddy/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: caddy
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/caddy-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [caddy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/caddy/dashboards/caddy-overview.json)
18 | 


--------------------------------------------------------------------------------
/site/content/clickhouse/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: clickhouse
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/clickhouse-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### ClickHouseAlerts
20 | 
21 | ##### ClickHouseReplicationQueueBackingUp
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: ClickHouseReplicationQueueBackingUp
25 | annotations:
26 |   description: |
27 |     ClickHouse replication tasks are processing slower than expected on {{ $labels.instance }} causing replication queue size to back up at {{ $value }} exceeding the threshold value of 99.
28 |   summary: ClickHouse replica max queue size backing up.
29 | expr: |
30 |   ClickHouseAsyncMetrics_ReplicasMaxQueueSize > 99
31 | for: 5m
32 | keep_firing_for: 5m
33 | labels:
34 |   severity: warning
35 | {{< /code >}}
36 |  
37 | ##### ClickHouseRejectedInserts
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: ClickHouseRejectedInserts
41 | annotations:
42 |   description: ClickHouse inserts are being rejected on {{ $labels.instance }} as
43 |     items are being inserted faster than ClickHouse is able to merge them.
44 |   summary: ClickHouse has too many rejected inserts.
45 | expr: ClickHouseProfileEvents_RejectedInserts > 1
46 | for: 5m
47 | keep_firing_for: 5m
48 | labels:
49 |   severity: critical
50 | {{< /code >}}
51 |  
52 | ##### ClickHouseZookeeperSessions
53 | 
54 | {{< code lang="yaml" >}}
55 | alert: ClickHouseZookeeperSessions
56 | annotations:
57 |   description: |
58 |     ClickHouse has more than one connection to a Zookeeper on {{ $labels.instance }} which can lead to bugs due to stale reads in Zookeepers consistency model.
59 |   summary: ClickHouse has too many Zookeeper sessions.
60 | expr: ClickHouseMetrics_ZooKeeperSession > 1
61 | for: 5m
62 | keep_firing_for: 5m
63 | labels:
64 |   severity: critical
65 | {{< /code >}}
66 |  
67 | ##### ClickHouseReplicasInReadOnly
68 | 
69 | {{< code lang="yaml" >}}
70 | alert: ClickHouseReplicasInReadOnly
71 | annotations:
72 |   description: |
73 |     ClickHouse has replicas in a read only state on {{ $labels.instance }} after losing connection to Zookeeper or at startup.
74 |   summary: ClickHouse has too many replicas in read only state.
75 | expr: ClickHouseMetrics_ReadonlyReplica > 0
76 | for: 5m
77 | keep_firing_for: 5m
78 | labels:
79 |   severity: critical
80 | {{< /code >}}
81 |  
82 | ## Dashboards
83 | Following dashboards are generated from mixins and hosted on github:
84 | 
85 | 
86 | - [clickhouse-latency](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-latency.json)
87 | - [clickhouse-logs](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-logs.json)
88 | - [clickhouse-overview](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-overview.json)
89 | - [clickhouse-replica](https://github.com/monitoring-mixins/website/blob/master/assets/clickhouse/dashboards/clickhouse-replica.json)
90 | 


--------------------------------------------------------------------------------
/site/content/confluent-kafka/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: confluent-kafka
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/confluent-kafka-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [confluent-kafka-overview](https://github.com/monitoring-mixins/website/blob/master/assets/confluent-kafka/dashboards/confluent-kafka-overview.json)
18 | 


--------------------------------------------------------------------------------
/site/content/consul/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: consul
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | Grafana dashboards and Prometheus alerts for operating Consul, in the form of a monitoring mixin.
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/consul-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/consul/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### consul
20 | 
21 | ##### ConsulUp
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: ConsulUp
25 | annotations:
26 |   description: Consul '{{ $labels.job }}' is not up.
27 |   summary: Consul is not up.
28 | expr: |
29 |   consul_up != 1
30 | for: 1m
31 | labels:
32 |   severity: critical
33 | {{< /code >}}
34 |  
35 | ##### ConsulMaster
36 | 
37 | {{< code lang="yaml" >}}
38 | alert: ConsulMaster
39 | annotations:
40 |   description: Consul '{{ $labels.job }}' has no master.
41 |   summary: Consul has no master.
42 | expr: |
43 |   consul_raft_leader != 1
44 | for: 1m
45 | labels:
46 |   severity: critical
47 | {{< /code >}}
48 |  
49 | ##### ConsulPeers
50 | 
51 | {{< code lang="yaml" >}}
52 | alert: ConsulPeers
53 | annotations:
54 |   description: Consul '{{ $labels.job }}' does not have 3 peers.
55 |   summary: Consul does not have peers.
56 | expr: |
57 |   consul_raft_peers != 3
58 | for: 10m
59 | labels:
60 |   severity: critical
61 | {{< /code >}}
62 |  
63 | ## Dashboards
64 | Following dashboards are generated from mixins and hosted on github:
65 | 
66 | 
67 | - [consul-overview](https://github.com/monitoring-mixins/website/blob/master/assets/consul/dashboards/consul-overview.json)
68 | 


--------------------------------------------------------------------------------
/site/content/couchbase/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: couchbase
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/couchbase-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### couchbase
20 | 
21 | ##### CouchbaseHighCPUUsage
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: CouchbaseHighCPUUsage
25 | annotations:
26 |   description: '{{ printf "%.0f" $value }} percent CPU usage on node {{$labels.instance}}
27 |     and on cluster {{$labels.couchbase_cluster}}, which is above the threshold of
28 |     85.'
29 |   summary: The node CPU usage has exceeded the critical threshold.
30 | expr: |
31 |   (sys_cpu_utilization_rate) > 85
32 | for: 5m
33 | labels:
34 |   severity: critical
35 | {{< /code >}}
36 |  
37 | ##### CouchbaseHighMemoryUsage
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: CouchbaseHighMemoryUsage
41 | annotations:
42 |   description: '{{ printf "%.0f" $value }} percent memory usage on node {{$labels.instance}}
43 |     and on cluster {{$labels.couchbase_cluster}}, which is above the threshold of
44 |     85.'
45 |   summary: There is a limited amount of memory available for a node.
46 | expr: |
47 |   100 * (sys_mem_actual_used / clamp_min(sys_mem_actual_used + sys_mem_actual_free, 1)) > 85
48 | for: 5m
49 | labels:
50 |   severity: critical
51 | {{< /code >}}
52 |  
53 | ##### CouchbaseMemoryEvictionRate
54 | 
55 | {{< code lang="yaml" >}}
56 | alert: CouchbaseMemoryEvictionRate
57 | annotations:
58 |   description: '{{ printf "%.0f" $value }} evictions in bucket {{$labels.bucket}},
59 |     on node {{$labels.instance}}, and on cluster {{$labels.couchbase_cluster}}, which
60 |     is above the threshold of 10.'
61 |   summary: There is a spike in evictions in a bucket, which indicates high memory
62 |     pressure.
63 | expr: |
64 |   (kv_ep_num_value_ejects) > 10
65 | for: 5m
66 | labels:
67 |   severity: warning
68 | {{< /code >}}
69 |  
70 | ##### CouchbaseInvalidRequestVolume
71 | 
72 | {{< code lang="yaml" >}}
73 | alert: CouchbaseInvalidRequestVolume
74 | annotations:
75 |   description: '{{ printf "%.0f" $value }} invalid requests to {{$labels.couchbase_cluster}},
76 |     which is above the threshold of 1000.'
77 |   summary: There is a high volume of incoming invalid requests, which may indicate
78 |     a DOS or injection attack.
79 | expr: |
80 |   sum without(instance, job) (rate(n1ql_invalid_requests[2m])) > 1000
81 | for: 2m
82 | labels:
83 |   severity: warning
84 | {{< /code >}}
85 |  
86 | ## Dashboards
87 | Following dashboards are generated from mixins and hosted on github:
88 | 
89 | 
90 | - [couchbase-bucket-overview](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/dashboards/couchbase-bucket-overview.json)
91 | - [couchbase-cluster-overview](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/dashboards/couchbase-cluster-overview.json)
92 | - [couchbase-node-overview](https://github.com/monitoring-mixins/website/blob/master/assets/couchbase/dashboards/couchbase-node-overview.json)
93 | 


--------------------------------------------------------------------------------
/site/content/discourse/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: discourse
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/discourse-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/discourse/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### DiscourseAlerts
20 | 
21 | ##### DiscourseRequestsHigh5xxErrors
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: DiscourseRequestsHigh5xxErrors
25 | annotations:
26 |   description: '{{ printf "%.2f" $value }}% of all requests are resulting in 500 status
27 |     codes, which is above the threshold 10%, indicating a potentially larger issue
28 |     for {{$labels.instance}}'
29 |   summary: More than 10% of all requests result in a 5XX.
30 | expr: |
31 |   100 * rate(discourse_http_requests{status="500"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 10
32 | for: 5m
33 | labels:
34 |   severity: critical
35 | {{< /code >}}
36 |  
37 | ##### DiscourseRequestsHigh4xxErrors
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: DiscourseRequestsHigh4xxErrors
41 | annotations:
42 |   description: '{{ printf "%.2f" $value }}% of all requests are resulting in 400 status
43 |     code, which is above the threshold 30%, indicating a potentially larger issue
44 |     for {{$labels.instance}}'
45 |   summary: More than 30% of all requests result in a 4XX.
46 | expr: |
47 |   100 * rate(discourse_http_requests{status=~"^4.*"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 30
48 | for: 5m
49 | labels:
50 |   severity: warning
51 | {{< /code >}}
52 |  
53 | ## Dashboards
54 | Following dashboards are generated from mixins and hosted on github:
55 | 
56 | 
57 | - [discourse-jobs](https://github.com/monitoring-mixins/website/blob/master/assets/discourse/dashboards/discourse-jobs.json)
58 | - [discourse-overview](https://github.com/monitoring-mixins/website/blob/master/assets/discourse/dashboards/discourse-overview.json)
59 | 


--------------------------------------------------------------------------------
/site/content/docker/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: docker
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | A set of Grafana dashboards for Docker (based on cadvisor).
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/docker-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [docker](https://github.com/monitoring-mixins/website/blob/master/assets/docker/dashboards/docker.json)
18 | 


--------------------------------------------------------------------------------
/site/content/elasticsearch/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: elasticsearch
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/dockelasticsearcher-mixin)
11 | {{< /panel >}}
12 | 
13 | 


--------------------------------------------------------------------------------
/site/content/envoy/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: envoy
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/envoy-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [envoy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/envoy/dashboards/envoy-overview.json)
18 | 


--------------------------------------------------------------------------------
/site/content/gitea/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: gitea
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/go-gitea/gitea](https://github.com/go-gitea/gitea/tree/master/contrib/gitea-monitoring-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [gitea-overview](https://github.com/monitoring-mixins/website/blob/master/assets/gitea/dashboards/gitea-overview.json)
18 | 


--------------------------------------------------------------------------------
/site/content/gitlab/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: gitlab
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/gitlab-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/gitlab/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### GitLabAlerts
20 | 
21 | ##### GitLabHighJobRegistrationFailures
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: GitLabHighJobRegistrationFailures
25 | annotations:
26 |   description: '{{ printf "%.2f" $value }}% of job registrations have failed on {{$labels.instance}},
27 |     which is above threshold of 10%.'
28 |   summary: Large percentage of failed attempts to register a job.
29 | expr: "100 * rate(job_register_attempts_failed_total{}[5m]) / rate(job_register_attempts_total{}[5m])
30 |   
31 | > 10
32 | "
33 | for: 5m
34 | labels:
35 |   severity: warning
36 | {{< /code >}}
37 |  
38 | ##### GitLabHighRunnerAuthFailure
39 | 
40 | {{< code lang="yaml" >}}
41 | alert: GitLabHighRunnerAuthFailure
42 | annotations:
43 |   description: '{{ printf "%.2f" $value }}% of GitLab runner authentication attempts
44 |     are failing on {{$labels.instance}}, which is above the threshold of 10%.'
45 |   summary: Large percentage of runner authentication failures.
46 | expr: "100 * sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m]))
47 |   \ / 
48 | (sum by (instance) (rate(gitlab_ci_runner_authentication_success_total{}[5m]))
49 |   \ + sum by (instance) (rate(gitlab_ci_runner_authentication_failure_total{}[5m])))
50 | >
51 |   10
52 | "
53 | for: 5m
54 | labels:
55 |   severity: warning
56 | {{< /code >}}
57 |  
58 | ##### GitLabHigh5xxResponses
59 | 
60 | {{< code lang="yaml" >}}
61 | alert: GitLabHigh5xxResponses
62 | annotations:
63 |   description: '{{ printf "%.2f" $value }}% of all requests returned 5XX HTTP responses,
64 |     which is above the threshold 10%, indicating a system issue on {{$labels.instance}}.'
65 |   summary: Large rate of HTTP 5XX errors.
66 | expr: "100 * sum by (instance) (rate(http_requests_total{status=~\"^5.*\"}[5m])) /
67 |   sum by (instance) (rate(http_requests_total{}[5m])) 
68 | > 10
69 | "
70 | for: 5m
71 | labels:
72 |   severity: critical
73 | {{< /code >}}
74 |  
75 | ##### GitLabHigh4xxResponses
76 | 
77 | {{< code lang="yaml" >}}
78 | alert: GitLabHigh4xxResponses
79 | annotations:
80 |   description: '{{ printf "%.2f" $value }}% of all requests returned 4XX HTTP responses,
81 |     which is above the threshold 10%, indicating many failed requests on {{$labels.instance}}.'
82 |   summary: Large rate of HTTP 4XX errors.
83 | expr: |
84 |   100 * sum by (instance) (rate(http_requests_total{status=~"^4.*"}[5m])) / sum by (instance) (rate(http_requests_total{}[5m]))
85 |   > 10
86 | for: 5m
87 | labels:
88 |   severity: warning
89 | {{< /code >}}
90 |  
91 | ## Dashboards
92 | Following dashboards are generated from mixins and hosted on github:
93 | 
94 | 
95 | - [gitlab-overview](https://github.com/monitoring-mixins/website/blob/master/assets/gitlab/dashboards/gitlab-overview.json)
96 | 


--------------------------------------------------------------------------------
/site/content/go-runtime/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: go-runtime
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/go-runtime-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [go-runtime](https://github.com/monitoring-mixins/website/blob/master/assets/go-runtime/dashboards/go-runtime.json)
18 | 


--------------------------------------------------------------------------------
/site/content/grafana/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: grafana
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/grafana](https://github.com/grafana/grafana/tree/master/grafana-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### GrafanaAlerts
20 | 
21 | ##### GrafanaRequestsFailing
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: GrafanaRequestsFailing
25 | annotations:
26 |   message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing
27 |     {{ $value | humanize }}% errors'
28 | expr: |
29 |   100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."})
30 |   /
31 |   sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
32 |   > 50
33 | for: 5m
34 | labels:
35 |   severity: warning
36 | {{< /code >}}
37 |  
38 | ## Recording rules
39 | 
40 | {{< panel style="warning" >}}
41 | Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/rules.yaml).
42 | {{< /panel >}}
43 | 
44 | ### grafana_rules
45 | 
46 | ##### namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
47 | 
48 | {{< code lang="yaml" >}}
49 | expr: |
50 |   sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
51 | record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
52 | {{< /code >}}
53 |  
54 | ## Dashboards
55 | Following dashboards are generated from mixins and hosted on github:
56 | 
57 | 
58 | - [grafana-overview](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/dashboards/grafana-overview.json)
59 | 


--------------------------------------------------------------------------------
/site/content/haproxy/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: haproxy
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/haproxy-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### HAProxyAlerts
20 | 
21 | ##### HAProxyDroppingLogs
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: HAProxyDroppingLogs
25 | annotations:
26 |   description: HAProxy {{$labels.job}} on {{$labels.instance}} is dropping logs.
27 |   summary: HAProxy is dropping logs.
28 | expr: rate(haproxy_process_dropped_logs_total[5m]) != 0
29 | for: 10m
30 | labels:
31 |   severity: critical
32 | {{< /code >}}
33 |  
34 | ##### HAProxyBackendCheckFlapping
35 | 
36 | {{< code lang="yaml" >}}
37 | alert: HAProxyBackendCheckFlapping
38 | annotations:
39 |   description: HAProxy {{$labels.job}} backend {{$labels.proxy}} on {{$labels.instance}}
40 |     has flapping checks.
41 |   summary: HAProxy backend checks are flapping.
42 | expr: rate(haproxy_backend_check_up_down_total[5m]) != 0
43 | for: 10m
44 | labels:
45 |   severity: critical
46 | {{< /code >}}
47 |  
48 | ##### HAProxyServerCheckFlapping
49 | 
50 | {{< code lang="yaml" >}}
51 | alert: HAProxyServerCheckFlapping
52 | annotations:
53 |   description: HAProxy {{$labels.job}} server {{$labels.server}} on {{$labels.instance}}
54 |     has flapping checks.
55 |   summary: HAProxy server checks are flapping.
56 | expr: rate(haproxy_server_check_up_down_total[5m]) != 0
57 | for: 10m
58 | labels:
59 |   severity: critical
60 | {{< /code >}}
61 |  
62 | ## Dashboards
63 | Following dashboards are generated from mixins and hosted on github:
64 | 
65 | 
66 | - [haproxy-backend](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-backend.json)
67 | - [haproxy-frontend](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-frontend.json)
68 | - [haproxy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-overview.json)
69 | - [haproxy-server](https://github.com/monitoring-mixins/website/blob/master/assets/haproxy/dashboards/haproxy-server.json)
70 | 


--------------------------------------------------------------------------------
/site/content/harbor/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: harbor
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/harbor-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/harbor/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### Harbor
20 | 
21 | ##### HarborComponentStatus
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: HarborComponentStatus
25 | annotations:
26 |   description: Harbor {{ $labels.component }} has been down for more than 5 minutes
27 |   summary: Harbor Component is Down.
28 | expr: |
29 |   harbor_up == 0
30 | for: 5m
31 | labels:
32 |   severity: critical
33 | {{< /code >}}
34 |  
35 | ##### HarborProjectQuataExceeded
36 | 
37 | {{< code lang="yaml" >}}
38 | alert: HarborProjectQuataExceeded
39 | annotations:
40 |   description: Harbor project {{ $labels.project_name }} has exceeded the configured
41 |     disk usage quota for the past 15 minutes
42 |   summary: Harbor project exceeds disk usage quota.
43 | expr: |
44 |   harbor_project_quota_usage_byte > harbor_project_quota_byte and on(harbor_project_quota_usage_byte) harbor_project_quota_byte != -1
45 | for: 15m
46 | labels:
47 |   severity: warning
48 | {{< /code >}}
49 |  
50 | ##### HarborHighErrorRate
51 | 
52 | {{< code lang="yaml" >}}
53 | alert: HarborHighErrorRate
54 | annotations:
55 |   description: HTTP Requests of {{ $labels.instance }} are having a high Error rate
56 |   summary: Harbor high error rate.
57 | expr: sum(rate(harbor_core_http_request_total{code=~"4..|5.."}[5m]))/sum(rate(harbor_core_http_request_total[5m]))
58 |   > 0.15
59 | for: 5m
60 | labels:
61 |   severity: warning
62 | {{< /code >}}
63 |  
64 | ## Dashboards
65 | Following dashboards are generated from mixins and hosted on github:
66 | 
67 | 
68 | - [harbor-overview](https://github.com/monitoring-mixins/website/blob/master/assets/harbor/dashboards/harbor-overview.json)
69 | 


--------------------------------------------------------------------------------
/site/content/hass/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: hass
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/hass-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [hass](https://github.com/monitoring-mixins/website/blob/master/assets/hass/dashboards/hass.json)
18 | 


--------------------------------------------------------------------------------
/site/content/ibm-mq/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: ibm-mq
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/ibm-mq-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### ibm-mq-alerts
20 | 
21 | ##### IBMMQExpiredMessages
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: IBMMQExpiredMessages
25 | annotations:
26 |   description: The number of expired messages in the {{$labels.qmgr}} is {{$labels.value}}
27 |     which is above the threshold of 2.
28 |   summary: There are expired messages, which imply that application resilience is
29 |     failing.
30 | expr: |
31 |   sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_expired_message_count) > 2
32 | for: 5m
33 | labels:
34 |   severity: critical
35 | {{< /code >}}
36 |  
37 | ##### IBMMQStaleMessages
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: IBMMQStaleMessages
41 | annotations:
42 |   description: A stale message with an age of {{$labels.value}} has been sitting in
43 |     the {{$labels.queue}} which is above the threshold of 300s.
44 |   summary: Stale messages have been detected.
45 | expr: |
46 |   sum without (description,instance,job,platform) (ibmmq_queue_oldest_message_age) >= 300
47 | for: 5m
48 | labels:
49 |   severity: warning
50 | {{< /code >}}
51 |  
52 | ##### IBMMQLowDiskSpace
53 | 
54 | {{< code lang="yaml" >}}
55 | alert: IBMMQLowDiskSpace
56 | annotations:
57 |   description: The amount of disk space available for {{$labels.qmgr}} is at {{$labels.value}}%
58 |     which is below the threshold of 5%.
59 |   summary: There is limited disk available for a queue manager.
60 | expr: |
61 |   sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_queue_manager_file_system_free_space_percentage) <= 5
62 | for: 5m
63 | labels:
64 |   severity: critical
65 | {{< /code >}}
66 |  
67 | ##### IBMMQHighQueueManagerCpuUsage
68 | 
69 | {{< code lang="yaml" >}}
70 | alert: IBMMQHighQueueManagerCpuUsage
71 | annotations:
72 |   description: The amount of CPU usage for the queue manager {{$labels.qmgr}} is at
73 |     {{$labels.value}}% which is above the threshold of 85%.
74 |   summary: There is a high CPU usage estimate for a queue manager.
75 | expr: |
76 |   sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_user_cpu_time_estimate_for_queue_manager_percentage) >= 85
77 | for: 5m
78 | labels:
79 |   severity: critical
80 | {{< /code >}}
81 |  
82 | ## Dashboards
83 | Following dashboards are generated from mixins and hosted on github:
84 | 
85 | 
86 | - [ibm-mq-cluster-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-cluster-overview.json)
87 | - [ibm-mq-queue-manager-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-queue-manager-overview.json)
88 | - [ibm-mq-queue-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-queue-overview.json)
89 | - [ibm-mq-topics-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ibm-mq/dashboards/ibm-mq-topics-overview.json)
90 | 


--------------------------------------------------------------------------------
/site/content/jenkins/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: jenkins
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jenkins-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [jenkins](https://github.com/monitoring-mixins/website/blob/master/assets/jenkins/dashboards/jenkins.json)
18 | 


--------------------------------------------------------------------------------
/site/content/jira/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: jira
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jira-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/jira/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### alert.rules
20 | 
21 | ##### LicenseExpired
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: LicenseExpired
25 | annotations:
26 |   description: The JIRA license has expired.
27 |   summary: JIRA license expired.
28 | expr: jira_license_expiry_days_gauge <= 0
29 | for: 1m
30 | labels:
31 |   severity: critical
32 | {{< /code >}}
33 |  
34 | ##### LicenseWarning
35 | 
36 | {{< code lang="yaml" >}}
37 | alert: LicenseWarning
38 | annotations:
39 |   description: The JIRA license will expire in less than one week.
40 |   summary: License expiring soon.
41 | expr: jira_license_expiry_days_gauge <= 7 and jira_license_expiry_days_gauge > 0
42 | for: 1m
43 | labels:
44 |   severity: warning
45 | {{< /code >}}
46 |  
47 | ##### NoUserCapacity
48 | 
49 | {{< code lang="yaml" >}}
50 | alert: NoUserCapacity
51 | annotations:
52 |   description: There is no more capacity for additional users to be added to the system.
53 |   summary: All available accounts are taken.
54 | expr: jira_all_users_gauge/jira_allowed_users_gauge == 1
55 | for: 1m
56 | labels:
57 |   severity: critical
58 | {{< /code >}}
59 |  
60 | ##### EmailErrorsHigh
61 | 
62 | {{< code lang="yaml" >}}
63 | alert: EmailErrorsHigh
64 | annotations:
65 |   description: More than 1% of emails have resulted in an error in the past minute.
66 |   summary: Email errors are high.
67 | expr: jira_mail_queue_error_gauge /jira_mail_queue_gauge > 0.01
68 | for: 1m
69 | labels:
70 |   severity: critical
71 | {{< /code >}}
72 |  
73 | ## Dashboards
74 | Following dashboards are generated from mixins and hosted on github:
75 | 
76 | 
77 | - [jira-overview](https://github.com/monitoring-mixins/website/blob/master/assets/jira/dashboards/jira-overview.json)
78 | 


--------------------------------------------------------------------------------
/site/content/jvm/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: jvm
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jvm-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/jvm/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### jvm-jvm-alerts
20 | 
21 | ##### JvmMemoryFillingUp
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: JvmMemoryFillingUp
25 | annotations:
26 |   description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
27 |     5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
28 |   summary: JVM heap memory filling up.
29 | expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without (id)
30 |   (jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80
31 | for: 5m
32 | keep_firing_for: 5m
33 | labels:
34 |   severity: warning
35 | {{< /code >}}
36 |  
37 | ##### JvmThreadsDeadlocked
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: JvmThreadsDeadlocked
41 | annotations:
42 |   description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
43 |     are in a cyclic dependency with each other. The restart is required to resolve
44 |     the deadlock.'
45 |   summary: JVM deadlock detected.
46 | expr: (jvm_threads_deadlocked{}) > 0
47 | for: 2m
48 | keep_firing_for: 5m
49 | labels:
50 |   severity: critical
51 | {{< /code >}}
52 |  
53 | ## Dashboards
54 | Following dashboards are generated from mixins and hosted on github:
55 | 
56 | 
57 | - [jvm-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/jvm/dashboards/jvm-dashboard.json)
58 | 


--------------------------------------------------------------------------------
/site/content/kube-state-metrics/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: kube-state-metrics
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/kubernetes/kube-state-metrics](https://github.com/kubernetes/kube-state-metrics/tree/master/jsonnet/kube-state-metrics-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kube-state-metrics/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### kube-state-metrics
20 | 
21 | ##### KubeStateMetricsListErrors
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: KubeStateMetricsListErrors
25 | annotations:
26 |   description: kube-state-metrics is experiencing errors at an elevated rate in list
27 |     operations. This is likely causing it to not be able to expose metrics about Kubernetes
28 |     objects correctly or at all.
29 |   summary: kube-state-metrics is experiencing errors in list operations.
30 | expr: |
31 |   (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
32 |     /
33 |   sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
34 |   > 0.01
35 | for: 15m
36 | labels:
37 |   severity: critical
38 | {{< /code >}}
39 |  
40 | ##### KubeStateMetricsWatchErrors
41 | 
42 | {{< code lang="yaml" >}}
43 | alert: KubeStateMetricsWatchErrors
44 | annotations:
45 |   description: kube-state-metrics is experiencing errors at an elevated rate in watch
46 |     operations. This is likely causing it to not be able to expose metrics about Kubernetes
47 |     objects correctly or at all.
48 |   summary: kube-state-metrics is experiencing errors in watch operations.
49 | expr: |
50 |   (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
51 |     /
52 |   sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
53 |   > 0.01
54 | for: 15m
55 | labels:
56 |   severity: critical
57 | {{< /code >}}
58 |  
59 | ##### KubeStateMetricsShardingMismatch
60 | 
61 | {{< code lang="yaml" >}}
62 | alert: KubeStateMetricsShardingMismatch
63 | annotations:
64 |   description: kube-state-metrics pods are running with different --total-shards configuration,
65 |     some Kubernetes objects may be exposed multiple times or not exposed at all.
66 |   summary: kube-state-metrics sharding is misconfigured.
67 | expr: |
68 |   stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
69 | for: 15m
70 | labels:
71 |   severity: critical
72 | {{< /code >}}
73 |  
74 | ##### KubeStateMetricsShardsMissing
75 | 
76 | {{< code lang="yaml" >}}
77 | alert: KubeStateMetricsShardsMissing
78 | annotations:
79 |   description: kube-state-metrics shards are missing, some Kubernetes objects are
80 |     not being exposed.
81 |   summary: kube-state-metrics shards are missing.
82 | expr: |
83 |   2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
84 |     -
85 |   sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
86 |   != 0
87 | for: 15m
88 | labels:
89 |   severity: critical
90 | {{< /code >}}
91 |  
92 | 


--------------------------------------------------------------------------------
/site/content/memcached/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: memcached
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | Grafana dashboard for operating Memcached, in the form of a monitoring mixin.
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/memcached-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/memcached/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### memcached
20 | 
21 | ##### MemcachedDown
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: MemcachedDown
25 | annotations:
26 |   description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} is down
27 |     for more than 15 minutes.
28 |   summary: Memcached instance is down.
29 | expr: |
30 |   memcached_up == 0
31 | for: 15m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### MemcachedConnectionLimitApproaching
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: MemcachedConnectionLimitApproaching
40 | annotations:
41 |   description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection
42 |     usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes.
43 |   summary: Memcached max connection limit is approaching.
44 | expr: |
45 |   (memcached_current_connections / memcached_max_connections * 100) > 80
46 | for: 15m
47 | labels:
48 |   severity: warning
49 | {{< /code >}}
50 |  
51 | ##### MemcachedConnectionLimitApproaching
52 | 
53 | {{< code lang="yaml" >}}
54 | alert: MemcachedConnectionLimitApproaching
55 | annotations:
56 |   description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection
57 |     usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes.
58 |   summary: Memcached connections at critical level.
59 | expr: |
60 |   (memcached_current_connections / memcached_max_connections * 100) > 95
61 | for: 15m
62 | labels:
63 |   severity: critical
64 | {{< /code >}}
65 |  
66 | ##### MemcachedOutOfMemoryErrors
67 | 
68 | {{< code lang="yaml" >}}
69 | alert: MemcachedOutOfMemoryErrors
70 | annotations:
71 |   description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} has OutOfMemory
72 |     errors for at least 15 minutes, current rate is {{ printf "%0.0f" $value }}
73 |   summary: Memcached has OutOfMemory errors.
74 | expr: |
75 |   sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0
76 | for: 15m
77 | labels:
78 |   severity: warning
79 | {{< /code >}}
80 |  
81 | ## Dashboards
82 | Following dashboards are generated from mixins and hosted on github:
83 | 
84 | 
85 | - [memcached-overview](https://github.com/monitoring-mixins/website/blob/master/assets/memcached/dashboards/memcached-overview.json)
86 | 


--------------------------------------------------------------------------------
/site/content/mysql/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: mysql
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/prometheus/mysqld_exporter](https://github.com/prometheus/mysqld_exporter/tree/master/mysqld-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [mysql-overview](https://github.com/monitoring-mixins/website/blob/master/assets/mysql/dashboards/mysql-overview.json)
18 | 


--------------------------------------------------------------------------------
/site/content/nginx/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: nginx
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nginx-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [nginx-logs](https://github.com/monitoring-mixins/website/blob/master/assets/nginx/dashboards/nginx-logs.json)
18 | - [nginx-metrics](https://github.com/monitoring-mixins/website/blob/master/assets/nginx/dashboards/nginx-metrics.json)
19 | 


--------------------------------------------------------------------------------
/site/content/nodejs/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: nodejs
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nodejs-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/nodejs/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### NodejsAlerts
20 | 
21 | ##### NodejsDown
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: NodejsDown
25 | annotations:
26 |   description: Node.js {{$labels.job}} on {{$labels.instance}} is not up.
27 |   summary: Node.js not up.
28 | expr: absent(nodejs_version_info) or (sum by (version) (nodejs_version_info) < 1)
29 | for: 0m
30 | labels:
31 |   severity: critical
32 | {{< /code >}}
33 |  
34 | ## Dashboards
35 | Following dashboards are generated from mixins and hosted on github:
36 | 
37 | 
38 | - [nodejs-overview](https://github.com/monitoring-mixins/website/blob/master/assets/nodejs/dashboards/nodejs-overview.json)
39 | 


--------------------------------------------------------------------------------
/site/content/nomad/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: nomad
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nomad-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [nomad-cluster](https://github.com/monitoring-mixins/website/blob/master/assets/nomad/dashboards/nomad-cluster.json)
18 | - [nomad-jobs](https://github.com/monitoring-mixins/website/blob/master/assets/nomad/dashboards/nomad-jobs.json)
19 | 


--------------------------------------------------------------------------------
/site/content/nsq/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: nsq
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/nsq-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/nsq/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### nsq
20 | 
21 | ##### NsqTopicDepthIncreasing
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: NsqTopicDepthIncreasing
25 | annotations:
26 |   description: |
27 |     Topic {{ $labels.topic }} depth is higher than 100. The current queue is {{ $value }}.
28 |   summary: Topic depth is increasing.
29 | expr: |
30 |   sum by (topic) (nsq_topic_depth) > 100
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### NsqChannelDepthIncreasing
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: NsqChannelDepthIncreasing
40 | annotations:
41 |   description: |
42 |     Channel {{ $labels.channel }} depth in topic {{ $labels.topic }} is higher than 100. The current queue is {{ $value }}.
43 |   summary: Topic channel depth is increasing.
44 | expr: |
45 |   sum by (topic) (nsq_topic_channel_backend_depth) > 100
46 | for: 5m
47 | labels:
48 |   severity: critical
49 | {{< /code >}}
50 |  
51 | ## Dashboards
52 | Following dashboards are generated from mixins and hosted on github:
53 | 
54 | 
55 | - [nsq-instances](https://github.com/monitoring-mixins/website/blob/master/assets/nsq/dashboards/nsq-instances.json)
56 | - [nsq-topics](https://github.com/monitoring-mixins/website/blob/master/assets/nsq/dashboards/nsq-topics.json)
57 | 


--------------------------------------------------------------------------------
/site/content/openldap/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: openldap
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/openldap-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/openldap/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### openldap-alerts
20 | 
21 | ##### OpenLDAPConnectionSpike
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: OpenLDAPConnectionSpike
25 | annotations:
26 |   description: There are {{ printf "%.0f" $value }} OpenLDAP connections on instance
27 |     {{$labels.instance}}, which is above the threshold of 100.
28 |   summary: A sudden spike in OpenLDAP connections indicates potential high usage or
29 |     security issues.
30 | expr: |
31 |   increase(openldap_monitor_counter_object{dn="cn=Current,cn=Connections,cn=Monitor"}[5m]) > 100
32 | for: 5m
33 | labels:
34 |   severity: warning
35 | {{< /code >}}
36 |  
37 | ##### OpenLDAPHighSearchOperationRateSpike
38 | 
39 | {{< code lang="yaml" >}}
40 | alert: OpenLDAPHighSearchOperationRateSpike
41 | annotations:
42 |   description: The rate of search operations in OpenLDAP on instance {{$labels.instance}}
43 |     has increased by {{ printf "%.0f" $value }} percent in the last 5 minutes, compared
44 |     to the average over the last 15 minutes, which is above the threshold of 200 percent.
45 |   summary: A significant spike in OpenLDAP search operations indicates inefficient
46 |     queries, potential abuse, or unintended heavy load.
47 | expr: "100 * (
48 |   rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[5m])
49 |   
50 |   / 
51 |   clamp_min(rate(openldap_monitor_operation{dn=\"cn=Search,cn=Operations,cn=Monitor\"}[15m]
52 |   offset 5m), 0.0001)
53 | ) > 200
54 | "
55 | for: 5m
56 | labels:
57 |   severity: warning
58 | {{< /code >}}
59 |  
60 | ##### OpenLDAPDialFailures
61 | 
62 | {{< code lang="yaml" >}}
63 | alert: OpenLDAPDialFailures
64 | annotations:
65 |   description: LDAP dial failures on instance {{$labels.instance}} have increased
66 |     by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold
67 |     of 10.
68 |   summary: Significant increase in LDAP dial failures indicates network issues, problems
69 |     with the LDAP service, or configuration errors that may lead to service unavailability.
70 | expr: |
71 |   increase(openldap_dial{result!="ok"}[10m]) > 10
72 | for: 10m
73 | labels:
74 |   severity: warning
75 | {{< /code >}}
76 |  
77 | ##### OpenLDAPBindFailureRateIncrease
78 | 
79 | {{< code lang="yaml" >}}
80 | alert: OpenLDAPBindFailureRateIncrease
81 | annotations:
82 |   description: LDAP bind failures on instance {{$labels.instance}} have increased
83 |     by {{ printf "%.0f" $value }} in the last 10 minutes, which is above the threshold
84 |     of 10.
85 |   summary: Significant increase in LDAP bind failures indicates authentication issues,
86 |     potential security threats or problems with user directories.
87 | expr: |
88 |   increase(openldap_bind{result!="ok"}[10m]) > 10
89 | for: 10m
90 | labels:
91 |   severity: warning
92 | {{< /code >}}
93 |  
94 | ## Dashboards
95 | Following dashboards are generated from mixins and hosted on github:
96 | 
97 | 
98 | - [*](https://github.com/monitoring-mixins/website/blob/master/assets/openldap/dashboards/*.json)
99 | 


--------------------------------------------------------------------------------
/site/content/oracledb/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: oracledb
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/oracledb-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/oracledb/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### OracleDBAlerts
20 | 
21 | ##### OracledbReachingSessionLimit
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: OracledbReachingSessionLimit
25 | annotations:
26 |   description: '{{ printf "%.2f" $value }}% of sessions are being utilized which is
27 |     above the threshold 85%. This could mean that {{$labels.instance}} is being overutilized.'
28 |   summary: The number of sessions being utilized exceeded 85%.
29 | expr: |
30 |   oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### OracledbReachingProcessLimit
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: OracledbReachingProcessLimit
40 | annotations:
41 |   description: '{{ printf "%.2f" $value }} of processes are being utilized which is
42 |     above thethreshold 85%. This could potentially mean that {{$labels.instance}}
43 |     runs out of processes it can spin up.'
44 |   summary: The number of processess being utilized exceeded the threshold of 85%.
45 | expr: |
46 |   oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85
47 | for: 5m
48 | labels:
49 |   severity: critical
50 | {{< /code >}}
51 |  
52 | ##### OracledbTablespaceReachingCapacity
53 | 
54 | {{< code lang="yaml" >}}
55 | alert: OracledbTablespaceReachingCapacity
56 | annotations:
57 |   description: '{{ printf "%.2f" $value }}% of bytes are being utilized by the tablespace
58 |     {{$labels.tablespace}} on the instance {{$labels.instance}}, which is above the
59 |     threshold 85%.'
60 |   summary: A tablespace is exceeding more than 85% of its maximum allotted space.
61 | expr: |
62 |   oracledb_tablespace_bytes / oracledb_tablespace_max_bytes * 100 > 85
63 | for: 5m
64 | labels:
65 |   severity: critical
66 | {{< /code >}}
67 |  
68 | ## Dashboards
69 | Following dashboards are generated from mixins and hosted on github:
70 | 
71 | 
72 | - [oracledb-overview](https://github.com/monitoring-mixins/website/blob/master/assets/oracledb/dashboards/oracledb-overview.json)
73 | 


--------------------------------------------------------------------------------
/site/content/python-runtime/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: python-runtime
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/python-runtime-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [python-runtime](https://github.com/monitoring-mixins/website/blob/master/assets/python-runtime/dashboards/python-runtime.json)
18 | 


--------------------------------------------------------------------------------
/site/content/rabbitmq/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: rabbitmq
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/rabbitmq-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/rabbitmq/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### RabbitMQClusterAlerts
20 | 
21 | ##### RabbitMQMemoryHigh
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: RabbitMQMemoryHigh
25 | annotations:
26 |   description: A node {{ $labels.instance }} is using more than 90% of allocated RAM.
27 |   summary: RabbitMQ memory usage is high.
28 | expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes
29 |   * 100 > 90
30 | for: 5m
31 | labels:
32 |   severity: warning
33 | {{< /code >}}
34 |  
35 | ##### RabbitMQFileDescriptorsUsage
36 | 
37 | {{< code lang="yaml" >}}
38 | alert: RabbitMQFileDescriptorsUsage
39 | annotations:
40 |   description: A node {{ $labels.instance }} is using more than 90% of file descriptors.
41 |   summary: RabbitMQ file descriptors usage is high.
42 | expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90
43 | for: 5m
44 | labels:
45 |   severity: warning
46 | {{< /code >}}
47 |  
48 | ##### RabbitMQUnroutableMessages
49 | 
50 | {{< code lang="yaml" >}}
51 | alert: RabbitMQUnroutableMessages
52 | annotations:
53 |   description: A queue has unroutable messages on {{ $labels.instance }}.
54 |   summary: A RabbitMQ queue has unroutable messages.
55 | expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m])
56 |   > 0
57 | for: 5m
58 | labels:
59 |   severity: warning
60 | {{< /code >}}
61 |  
62 | ##### RabbitMQNodeNotDistributed
63 | 
64 | {{< code lang="yaml" >}}
65 | alert: RabbitMQNodeNotDistributed
66 | annotations:
67 |   description: "Distribution link state is not 'up' on {{ $labels.instance }}, current
68 |     value is {{ $value }}. 
69 | Note: The state is represented as a numerical value where
70 |     pending=1, up_pending=2 and up=3."
71 |   summary: RabbitMQ node not distributed, link state is down.
72 | expr: erlang_vm_dist_node_state{rabbitmq_cluster!=""} < 3
73 | for: 5m
74 | labels:
75 |   severity: critical
76 | {{< /code >}}
77 |  
78 | ## Dashboards
79 | Following dashboards are generated from mixins and hosted on github:
80 | 
81 | 
82 | - [erlang-memory-allocators](https://github.com/monitoring-mixins/website/blob/master/assets/rabbitmq/dashboards/erlang-memory-allocators.json)
83 | - [rabbitmq-overview](https://github.com/monitoring-mixins/website/blob/master/assets/rabbitmq/dashboards/rabbitmq-overview.json)
84 | 


--------------------------------------------------------------------------------
/site/content/rclone/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: rclone
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/rclone-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [rclone](https://github.com/monitoring-mixins/website/blob/master/assets/rclone/dashboards/rclone.json)
18 | 


--------------------------------------------------------------------------------
/site/content/redis/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: redis
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/redis-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [*](https://github.com/monitoring-mixins/website/blob/master/assets/redis/dashboards/*.json)
18 | 


--------------------------------------------------------------------------------
/site/content/ruby/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: ruby
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/ruby-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [ruby-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ruby/dashboards/ruby-overview.json)
18 | 


--------------------------------------------------------------------------------
/site/content/sealed-secrets/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: sealed-secrets
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/bitnami-labs/sealed-secrets](https://github.com/bitnami-labs/sealed-secrets/tree/master/contrib/prometheus-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/sealed-secrets/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### sealed-secrets
20 | 
21 | ##### SealedSecretsUnsealErrorHigh
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: SealedSecretsUnsealErrorHigh
25 | annotations:
26 |   description: High number of errors during unsealing Sealed Secrets in {{ $labels.namespace
27 |     }} namespace.
28 |   runbook_url: https://github.com/bitnami-labs/sealed-secrets
29 |   summary: Sealed Secrets Unseal Error High
30 | expr: |
31 |   sum by (reason, namespace) (rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0
32 | labels:
33 |   severity: warning
34 | {{< /code >}}
35 |  
36 | ## Dashboards
37 | Following dashboards are generated from mixins and hosted on github:
38 | 
39 | 
40 | - [sealed-secrets-controller](https://github.com/monitoring-mixins/website/blob/master/assets/sealed-secrets/dashboards/sealed-secrets-controller.json)
41 | 


--------------------------------------------------------------------------------
/site/content/spark/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: spark
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/spark-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [spark-metrics](https://github.com/monitoring-mixins/website/blob/master/assets/spark/dashboards/spark-metrics.json)
18 | 


--------------------------------------------------------------------------------
/site/content/spinnaker/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: spinnaker
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/uneeq-oss/spinnaker-mixin.git](https://github.com/uneeq-oss/spinnaker-mixin.git)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### igor
20 | 
21 | ##### PollingMonitorItemsOverThreshold
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: PollingMonitorItemsOverThreshold
25 | annotations:
26 |   description: '{{ $labels.monitor }} polling monitor for {{ $labels.partition }}
27 |     threshold exceeded, preventing pipeline triggers.'
28 |   runbook_url: https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds
29 |   summary: Polling monitor item threshold exceeded.
30 | expr: sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ## Dashboards
37 | Following dashboards are generated from mixins and hosted on github:
38 | 
39 | 
40 | - [clouddriver](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/clouddriver.json)
41 | - [deck](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/deck.json)
42 | - [echo](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/echo.json)
43 | - [fiat](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/fiat.json)
44 | - [front50](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/front50.json)
45 | - [gate](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/gate.json)
46 | - [igor](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/igor.json)
47 | - [orca](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/orca.json)
48 | - [rosco](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/rosco.json)
49 | - [spinnaker-application-details](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-application-details.json)
50 | - [spinnaker-aws-platform](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-aws-platform.json)
51 | - [spinnaker-google-platform](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-google-platform.json)
52 | - [spinnaker-key-metrics](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-key-metrics.json)
53 | - [spinnaker-kubernetes-platform](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-kubernetes-platform.json)
54 | - [spinnaker-minimalist](https://github.com/monitoring-mixins/website/blob/master/assets/spinnaker/dashboards/spinnaker-minimalist.json)
55 | 


--------------------------------------------------------------------------------
/site/content/spring-boot/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: spring-boot
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/spring-boot-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/spring-boot/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### jvm-micrometer-jvm-alerts
20 | 
21 | ##### JvmMemoryFillingUp
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: JvmMemoryFillingUp
25 | annotations:
26 |   description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
27 |     5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
28 |   summary: JVM heap memory filling up.
29 | expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
30 |   (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
31 | for: 5m
32 | keep_firing_for: 5m
33 | labels:
34 |   severity: warning
35 | {{< /code >}}
36 |  
37 | ## Dashboards
38 | Following dashboards are generated from mixins and hosted on github:
39 | 
40 | 
41 | - [jvm-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/spring-boot/dashboards/jvm-dashboard.json)
42 | 


--------------------------------------------------------------------------------
/site/content/supabase/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: supabase
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/supabase-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [supabase](https://github.com/monitoring-mixins/website/blob/master/assets/supabase/dashboards/supabase.json)
18 | 


--------------------------------------------------------------------------------
/site/content/tensorflow/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: tensorflow
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/tensorflow-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/tensorflow/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### TensorFlowServingAlerts
20 | 
21 | ##### TensorFlowModelRequestHighErrorRate
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: TensorFlowModelRequestHighErrorRate
25 | annotations:
26 |   description: '{{ printf "%.2f" $value }}% of all model requests are not successful,
27 |     which is above the threshold 30%, indicating a potentially larger issue for {{$labels.instance}}'
28 |   summary: More than 30% of all model requests are not successful.
29 | expr: |
30 |   100 * sum(rate(:tensorflow:serving:request_count{status!="OK"}[5m])) by (instance) / sum(rate(:tensorflow:serving:request_count[5m])) by (instance) > 30
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### TensorFlowServingHighBatchQueuingLatency
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: TensorFlowServingHighBatchQueuingLatency
40 | annotations:
41 |   description: Batch queuing latency greater than {{ printf "%.2f" $value }}µs, which
42 |     is above the threshold 5000000µs, indicating a potentially larger issue for {{$labels.instance}}
43 |   summary: Batch queuing latency more than 5000000µs.
44 | expr: |
45 |   increase(:tensorflow:serving:batching_session:queuing_latency_sum[2m]) / increase(:tensorflow:serving:batching_session:queuing_latency_count[2m]) > 5000000
46 | for: 5m
47 | labels:
48 |   severity: warning
49 | {{< /code >}}
50 |  
51 | ## Dashboards
52 | Following dashboards are generated from mixins and hosted on github:
53 | 
54 | 
55 | - [tensorflow-overview](https://github.com/monitoring-mixins/website/blob/master/assets/tensorflow/dashboards/tensorflow-overview.json)
56 | 


--------------------------------------------------------------------------------
/site/content/traefik/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: traefik
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/traefik-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [traefikdash](https://github.com/monitoring-mixins/website/blob/master/assets/traefik/dashboards/traefikdash.json)
18 | 


--------------------------------------------------------------------------------
/site/content/ubnt-edgerouter/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: ubnt-edgerouter
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/ubnt-edgerouter-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Recording rules
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ubnt-edgerouter/rules.yaml).
17 | {{< /panel >}}
18 | 
19 | ### ubnt.rules
20 | 
21 | ##### ifNiceName
22 | 
23 | {{< code lang="yaml" >}}
24 | expr: label_join(ifAdminStatus,"nicename", ":", "ifName", "ifAlias")
25 | record: ifNiceName
26 | {{< /code >}}
27 |  
28 | ## Dashboards
29 | Following dashboards are generated from mixins and hosted on github:
30 | 
31 | 
32 | - [ubnt-edgrouterx-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ubnt-edgerouter/dashboards/ubnt-edgrouterx-overview.json)
33 | 


--------------------------------------------------------------------------------
/site/content/vault/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: vault
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/vault-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [vault](https://github.com/monitoring-mixins/website/blob/master/assets/vault/dashboards/vault.json)
18 | 


--------------------------------------------------------------------------------
/site/content/velero/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: velero
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/velero-2-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/velero/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### velero
20 | 
21 | ##### VeleroBackupFailure
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: VeleroBackupFailure
25 | annotations:
26 |   description: |
27 |     Backup failures detected on {{ $labels.instance }}. This could lead to data loss or inability to recover in case of a disaster.
28 |   summary: Velero backup failures detected.
29 | expr: |
30 |   increase(velero_backup_failure_total{job="integrations/velero"}[5m]) > 0
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### VeleroHighBackupDuration
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: VeleroHighBackupDuration
40 | annotations:
41 |   description: |
42 |     Backup duration on {{ $labels.instance }} is higher than the average duration over the past 48 hours. This could indicate performance issues or network congestion. The current value is {{ $value | printf "%.2f" }} seconds.
43 |   summary: Velero backups taking longer than usual.
44 | expr: |
45 |   histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[5m])) by (le, schedule)) > 1.2 * 1.2 * avg_over_time(histogram_quantile(0.5, sum(rate(velero_backup_duration_seconds_bucket{job="integrations/velero"}[48h])) by (le, schedule))[5m:])
46 | for: 5m
47 | labels:
48 |   severity: warning
49 | {{< /code >}}
50 |  
51 | ##### VeleroHighRestoreFailureRate
52 | 
53 | {{< code lang="yaml" >}}
54 | alert: VeleroHighRestoreFailureRate
55 | annotations:
56 |   description: |
57 |     Restore failures detected on {{ $labels.instance }}. This could prevent timely data recovery and business continuity.
58 |   summary: Velero restore failures detected.
59 | expr: |
60 |   increase(velero_restore_failed_total{job="integrations/velero"}[5m]) > 0
61 | for: 5m
62 | labels:
63 |   severity: critical
64 | {{< /code >}}
65 |  
66 | ##### VeleroUpStatus
67 | 
68 | {{< code lang="yaml" >}}
69 | alert: VeleroUpStatus
70 | annotations:
71 |   description: "Cannot find any metrics related to Velero on {{ $labels.instance }}.
72 |     This may indicate further issues with Velero or the scraping agent. 
73 | "
74 |   summary: Velero is down.
75 | expr: |
76 |   up{job="integrations/velero"} != 0
77 | for: 5m
78 | labels:
79 |   severity: critical
80 | {{< /code >}}
81 |  
82 | ## Dashboards
83 | Following dashboards are generated from mixins and hosted on github:
84 | 
85 | 
86 | - [*](https://github.com/monitoring-mixins/website/blob/master/assets/velero/dashboards/*.json)
87 | 


--------------------------------------------------------------------------------
/site/content/wildfly/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: wildfly
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/wildfly-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Alerts
14 | 
15 | {{< panel style="warning" >}}
16 | Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/wildfly/alerts.yaml).
17 | {{< /panel >}}
18 | 
19 | ### wildfly
20 | 
21 | ##### HighPercentageOfErrorResponses
22 | 
23 | {{< code lang="yaml" >}}
24 | alert: HighPercentageOfErrorResponses
25 | annotations:
26 |   description: |
27 |     The percentage of error responses is {{ printf "%.2f" $value }} on {{ $labels.instance }} - {{ $labels.server }} which is higher than {{30 }}.
28 |   summary: Large percentage of requests are resulting in 5XX responses.
29 | expr: |
30 |   sum by (job, instance, server) (increase(wildfly_undertow_error_count_total{}[5m]) / increase(wildfly_undertow_request_count_total{}[5m])) * 100 > 30
31 | for: 5m
32 | labels:
33 |   severity: critical
34 | {{< /code >}}
35 |  
36 | ##### HighNumberOfRejectedSessionsForDeployment
37 | 
38 | {{< code lang="yaml" >}}
39 | alert: HighNumberOfRejectedSessionsForDeployment
40 | annotations:
41 |   description: |
42 |     Deployemnt {{ $labels.deployment }} on {{ $labels.instance }} is exceeding the threshold for rejected sessions {{ printf "%.0f" $value }} is higher than 20.
43 |   summary: Large number of sessions are being rejected for a deployment.
44 | expr: |
45 |   sum by (deployment, instance, job) (increase(wildfly_undertow_rejected_sessions_total{}[5m])) > 20
46 | for: 5m
47 | labels:
48 |   severity: critical
49 | {{< /code >}}
50 |  
51 | ## Dashboards
52 | Following dashboards are generated from mixins and hosted on github:
53 | 
54 | 
55 | - [wildfly-datasource](https://github.com/monitoring-mixins/website/blob/master/assets/wildfly/dashboards/wildfly-datasource.json)
56 | - [wildfly-overview](https://github.com/monitoring-mixins/website/blob/master/assets/wildfly/dashboards/wildfly-overview.json)
57 | 


--------------------------------------------------------------------------------
/site/content/wso2-enterprise-integrator/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: wso2-enterprise-integrator
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/wso2-enterprise-integrator-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [API_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/API_Metrics.json)
18 | - [Cluster_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Cluster_Metrics.json)
19 | - [Inbound_Endpoint_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Inbound_Endpoint_Metrics.json)
20 | - [Node_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Node_Metrics.json)
21 | - [Proxy_Service_Metrics](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-enterprise-integrator/dashboards/Proxy_Service_Metrics.json)
22 | 


--------------------------------------------------------------------------------
/site/content/wso2-streaming-integrator/_index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: wso2-streaming-integrator
 3 | ---
 4 | 
 5 | ## Overview
 6 | 
 7 | 
 8 | 
 9 | {{< panel style="danger" >}}
10 | Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/wso2-streaming-integrator-mixin)
11 | {{< /panel >}}
12 | 
13 | ## Dashboards
14 | Following dashboards are generated from mixins and hosted on github:
15 | 
16 | 
17 | - [Siddhi_aggregation](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_aggregation.json)
18 | - [Siddhi_ondemandquery](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_ondemandquery.json)
19 | - [Siddhi_overall](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_overall.json)
20 | - [Siddhi_query](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_query.json)
21 | - [Siddhi_server](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_server.json)
22 | - [Siddhi_sink](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_sink.json)
23 | - [Siddhi_source](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_source.json)
24 | - [Siddhi_stream](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_stream.json)
25 | - [Siddhi_table](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/Siddhi_table.json)
26 | - [StreamingIntegrator_apps](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/StreamingIntegrator_apps.json)
27 | - [StreamingIntegrator_overall](https://github.com/monitoring-mixins/website/blob/master/assets/wso2-streaming-integrator/dashboards/StreamingIntegrator_overall.json)
28 | 


--------------------------------------------------------------------------------
/site/layouts/_default/baseof.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |     {{- partial "head.html" . -}}
 4 |     <body>
 5 | 
 6 |         {{- partial "header.html" . -}}
 7 | 
 8 |         <div class="container-fluid">
 9 |             <div class="row">
10 | 
11 |                 <div class="docs-sidenav order-0 col-12 col-md-3 col-lg-2 col-xl-2 position-sticky border-right">
12 |                     {{- partial "menu.html" . -}}
13 |                 </div>
14 | 
15 |                 {{- if and (ne .Site.Params.toc false) (ne .Params.toc false) }}
16 |                 <div class="docs-toc large order-lg-2 order-md-0 order-xs-1 col-12 col-lg-3 col-xl-3 position-sticky border-left">
17 |                     {{- partial "tableofcontents.html" . -}}
18 |                 </div>
19 |                 <div class="main col-12 order-1 col-md-9 col-lg-9 col-xl-7 py-3">
20 |                 {{else}}
21 |                 <div class="main col-12 order-1 col-md-9 col-lg-10 col-xl-10 py-3">
22 |                 {{end}}
23 | 
24 |                     {{- block "main" . }}{{- end }}
25 |                     
26 |                     <div class="row">
27 |                         {{- if and (ne .Site.Params.disableReadmoreNav true) (ne .Params.disableReadmoreNav true) -}}
28 |                         <div class="position-relative mx-auto col-lg-9">
29 |                           {{ partial "next-prev-page.html" . }}
30 |                         </div>
31 |                         {{- end -}}
32 |                     </div> <!-- /end of row -->
33 | 
34 |                 </div>
35 | 
36 |             </div> <!-- /end of row -->
37 | 
38 |         </div> <!-- /end of container -->
39 | 
40 |         {{- partial "footer.html" . -}}
41 | 
42 |     </body>
43 | </html>
44 | 


--------------------------------------------------------------------------------