├── requirements.txt ├── .gitignore ├── dittybopper ├── dittybopper_screenshot.png ├── templates │ └── dittybopper_ns.yaml.template ├── README.md ├── syncer │ └── entrypoint.py ├── k8s-deploy.sh └── deploy.sh ├── templates ├── jsonnetfile.json ├── jsonnetfile.lock.json ├── General │ ├── uperf-perf.jsonnet │ ├── pgbench-dashboard.jsonnet │ ├── vegeta-wrapper.jsonnet │ ├── ycsb.jsonnet │ ├── api-performance-overview.jsonnet │ ├── k8s-perf.jsonnet │ ├── cilium-k8s-perf.jsonnet │ ├── ovn-dashboard.jsonnet │ └── etcd-on-cluster-dashboard.jsonnet └── CPT │ ├── ingress-perf.jsonnet │ ├── k8s-netperf.jsonnet │ ├── acs-perf.jsonnet │ └── kube-burner-report-mode.jsonnet ├── Dockerfile ├── .github └── workflows │ ├── release.yml │ └── ci.yml ├── assets ├── pgbench-dashboard │ ├── annotation.libsonnet │ ├── variables.libsonnet │ ├── queries.libsonnet │ └── panels.libsonnet ├── etcd-on-cluster-dashboard │ ├── variables.libsonnet │ └── panels.libsonnet ├── vegeta-wrapper │ ├── variables.libsonnet │ ├── panels.libsonnet │ └── queries.libsonnet ├── hypershift-perf-dashboard │ └── variables.libsonnet ├── ovn-monitoring │ ├── variables.libsonnet │ ├── panels.libsonnet │ └── queries.libsonnet ├── ycsb │ ├── variables.libsonnet │ ├── panels.libsonnet │ └── queries.libsonnet ├── k8s-perf │ ├── variables.libsonnet │ ├── panels.libsonnet │ └── queries.libsonnet ├── cilium-k8s-perf │ ├── variables.libsonnet │ └── panels.libsonnet ├── api-performance-overview │ ├── panels.libsonnet │ └── variables.libsonnet ├── ocp-performance │ ├── panels.libsonnet │ └── variables.libsonnet ├── uperf │ ├── variables.libsonnet │ ├── panels.libsonnet │ └── queries.libsonnet ├── kube-burner-report-ocp-wrapper │ └── variables.libsonnet ├── ingress-performance-ocp │ └── variables.libsonnet ├── k8s-netperf │ └── variables.libsonnet ├── kube-burner-report-mode │ └── variables.libsonnet └── acs-perf │ └── variables.libsonnet ├── Makefile └── README.md /requirements.txt: 
-------------------------------------------------------------------------------- 1 | requests==2.26.0 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | templates/grafonnet-lib 2 | templates/vendor 3 | rendered 4 | tmp 5 | bin 6 | -------------------------------------------------------------------------------- /dittybopper/dittybopper_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloud-bulldozer/performance-dashboards/HEAD/dittybopper/dittybopper_screenshot.png -------------------------------------------------------------------------------- /templates/jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/grafana/grafonnet.git", 8 | "subdir": "gen/grafonnet-latest" 9 | } 10 | }, 11 | "version": "main" 12 | } 13 | ], 14 | "legacyImports": true 15 | } 16 | -------------------------------------------------------------------------------- /dittybopper/templates/dittybopper_ns.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | kubernetes.io/metadata.name: dittybopper 6 | pod-security.kubernetes.io/audit: privileged 7 | pod-security.kubernetes.io/enforce: privileged 8 | pod-security.kubernetes.io/enforce-version: v1.24 9 | pod-security.kubernetes.io/warn: privileged 10 | security.openshift.io/scc.podSecurityLabelSync: "false" 11 | name: dittybopper -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.access.redhat.com/ubi8/ubi-minimal 2 | 3 | # Set 
the working directory 4 | WORKDIR /performance-dashboards 5 | 6 | # Install necessary libraries for subsequent commands 7 | RUN microdnf install -y podman python3 python3-pip && \ 8 | microdnf clean all && \ 9 | rm -rf /var/cache/yum 10 | 11 | COPY . . 12 | 13 | # Set permissions 14 | RUN chmod -R 775 /performance-dashboards 15 | 16 | # Install dependencies 17 | RUN pip3 install --no-cache-dir --upgrade pip && \ 18 | pip3 install --no-cache-dir -r requirements.txt 19 | 20 | # Start the command 21 | CMD ["python3", "dittybopper/syncer/entrypoint.py"] 22 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release syncer image 2 | defaults: 3 | run: 4 | shell: bash 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 16 | - uses: actions/checkout@v4 17 | # -y keeps apt-get from stopping at a confirmation prompt on the non-interactive runner 18 | - name: Install dependencies required for multi-arch builds 19 | run: sudo apt-get update && sudo apt-get install -y qemu-user-static podman fuse-overlayfs 20 | # The token is fed on stdin so it is not part of the podman command line 21 | - name: Login in quay 22 | run: printf '%s' "${QUAY_TOKEN}" | podman login quay.io -u "${QUAY_USER}" --password-stdin 23 | env: 24 | QUAY_USER: ${{ secrets.QUAY_USER }} 25 | QUAY_TOKEN: ${{ secrets.QUAY_TOKEN }} 26 | 27 | - name: Build & push syncer image 28 | run: make build-syncer-image push-syncer-image 29 | -------------------------------------------------------------------------------- /assets/pgbench-dashboard/annotation.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local annotation = g.dashboard.annotation; 3 | 4 | { 5 | run_start_timestamp: 6 | annotation.withName('Run Start Time') 7 | + annotation.withDatasource('$Datasource2') 8 | + 
annotation.withEnable(true) 9 | + annotation.withIconColor('#5794F2') 10 | + annotation.withHide(false) 11 | + annotation.target.withTags([]) 12 | + annotation.withType('tags'), 13 | 14 | sample_start_timestamp: 15 | annotation.withName('Sample Start Time') 16 | + annotation.withDatasource('$Datasource2') 17 | + annotation.withEnable(false) 18 | + annotation.withIconColor('#B877D9') 19 | + annotation.withHide(false) 20 | + annotation.target.withTags([]) 21 | + annotation.withType('tags'), 22 | } 23 | -------------------------------------------------------------------------------- /assets/etcd-on-cluster-dashboard/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'prometheus') 7 | + var.datasource.withRegex('') 8 | + var.query.generalOptions.withLabel('Datasource') 9 | + var.query.withRefresh(1) 10 | + var.query.selectionOptions.withMulti(false) 11 | + var.query.selectionOptions.withIncludeAll(false), 12 | 13 | etcd_pod: 14 | var.query.new('etcd_pod') 15 | + var.query.withDatasourceFromVariable(self.Datasource) 16 | + var.query.queryTypes.withLabelValues( 17 | 'pod', 18 | 'etcd_cluster_version', 19 | ) 20 | + var.query.withRefresh(2) 21 | + var.query.selectionOptions.withMulti() 22 | + var.query.selectionOptions.withIncludeAll(true), 23 | } 24 | -------------------------------------------------------------------------------- /assets/pgbench-dashboard/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource1: 6 | var.datasource.new('Datasource1', 'elasticsearch') 7 | + var.datasource.withRegex('') 8 | + 
var.query.generalOptions.withLabel('pgbench-results datasource') 9 | + var.query.withRefresh(1), 10 | 11 | Datasource2: 12 | var.datasource.new('Datasource2', 'elasticsearch') 13 | + var.datasource.withRegex('') 14 | + var.query.generalOptions.withLabel('pgbench-summary datasource') 15 | + var.query.withRefresh(1), 16 | 17 | uuid: 18 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword"}') 19 | + var.query.withDatasourceFromVariable(self.Datasource1) 20 | + var.query.selectionOptions.withMulti(false) 21 | + var.query.selectionOptions.withIncludeAll(true) 22 | + var.query.withRefresh(2), 23 | 24 | user: 25 | var.query.new('user', '{"find": "terms", "field": "user.keyword"}') 26 | + var.query.withDatasourceFromVariable(self.Datasource1) 27 | + var.query.selectionOptions.withMulti(false) 28 | + var.query.selectionOptions.withIncludeAll(true) 29 | + var.query.withRefresh(2), 30 | } 31 | -------------------------------------------------------------------------------- /templates/jsonnetfile.lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/grafana/grafonnet.git", 8 | "subdir": "gen/grafonnet-latest" 9 | } 10 | }, 11 | "version": "9e217263ac4b922ca2e00bc5cc36ada2311bb5a6", 12 | "sum": "k5S6Nf6VA8Lg9B/qyB5XEoxDmIf5PsW8HTuC/BaNnu4=" 13 | }, 14 | { 15 | "source": { 16 | "git": { 17 | "remote": "https://github.com/grafana/grafonnet.git", 18 | "subdir": "gen/grafonnet-v10.1.0" 19 | } 20 | }, 21 | "version": "9e217263ac4b922ca2e00bc5cc36ada2311bb5a6", 22 | "sum": "GpXlwBysu8dnoH9oYmsJj31CCVsL+wXnbWMxq7sl5Gg=" 23 | }, 24 | { 25 | "source": { 26 | "git": { 27 | "remote": "https://github.com/jsonnet-libs/docsonnet.git", 28 | "subdir": "doc-util" 29 | } 30 | }, 31 | "version": "503e5c8fe96d6b55775037713ac10b184709ad93", 32 | "sum": "BY4u0kLF3Qf/4IB4HnX9S5kEQIpHb4MUrppp6WLDtlU=" 33 | }, 34 | { 35 | "source": { 36 | 
"git": { 37 | "remote": "https://github.com/jsonnet-libs/xtd.git", 38 | "subdir": "" 39 | } 40 | }, 41 | "version": "c1a315a7dbead0335a5e0486acc5583395b22a24", 42 | "sum": "UVdL+uuFI8BSQgLfMJEJk2WDKsQXNT3dRHcr2Ti9rLI=" 43 | } 44 | ], 45 | "legacyImports": false 46 | } -------------------------------------------------------------------------------- /templates/General/uperf-perf.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/uperf/panels.libsonnet'; 2 | local queries = import '../../assets/uperf/queries.libsonnet'; 3 | local variables = import '../../assets/uperf/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('Public - UPerf Results dashboard') 7 | + g.dashboard.withTags(['network', 'performance']) 8 | + g.dashboard.time.withFrom('now-1h') 9 | + g.dashboard.time.withTo('now') 10 | + g.dashboard.withTimezone('utc') 11 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 12 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 13 | + g.dashboard.withRefresh('') 14 | + g.dashboard.withEditable(false) 15 | + g.dashboard.graphTooltip.withSharedCrosshair() 16 | + g.dashboard.withVariables([ 17 | variables.Datasource, 18 | variables.uuid, 19 | variables.cluster_name, 20 | variables.user, 21 | variables.iteration, 22 | variables.server, 23 | variables.test_type, 24 | variables.protocol, 25 | variables.message_size, 26 | variables.threads, 27 | ]) 28 | + g.dashboard.withPanels([ 29 | panels.timeSeries.uperfPerformance('UPerf Performance : Throughput per-second', 'bps', queries.throughput.query(), { x: 0, y: 0, w: 12, h: 9 }), 30 | panels.timeSeries.uperfPerformance('UPerf Performance : Operations per-second', 'pps', queries.operations.query(), { x: 12, y: 0, w: 12, h: 9 }), 31 | 
panels.table.base('UPerf Result Summary', queries.results.query(), { x: 0, y: 20, w: 24, h: 18 }), 32 | ]) 33 | -------------------------------------------------------------------------------- /assets/vegeta-wrapper/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'elasticsearch') 7 | + var.datasource.withRegex('/(.*vegeta.*)/') 8 | + var.query.generalOptions.withLabel('vegeta-results datasource'), 9 | 10 | uuid: 11 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword"}') 12 | + var.query.withDatasourceFromVariable(self.Datasource) 13 | + var.query.withRefresh(2) 14 | + var.query.selectionOptions.withMulti(false) 15 | + var.query.selectionOptions.withIncludeAll(true) 16 | + var.query.generalOptions.withLabel('UUID'), 17 | 18 | hostname: 19 | var.query.new('hostname', '{"find": "terms", "field": "hostname.keyword"}') 20 | + var.query.withDatasourceFromVariable(self.Datasource) 21 | + var.query.withRefresh(2) 22 | + var.query.selectionOptions.withMulti(false) 23 | + var.query.selectionOptions.withIncludeAll(true), 24 | 25 | targets: 26 | var.query.new('targets', '{"find": "terms", "field": "targets.keyword"}') 27 | + var.query.withDatasourceFromVariable(self.Datasource) 28 | + var.query.withRefresh(2) 29 | + var.query.selectionOptions.withMulti(false) 30 | + var.query.selectionOptions.withIncludeAll(true), 31 | 32 | iteration: 33 | var.query.new('iteration', '{"find": "terms", "field": "iteration"}') 34 | + var.query.withDatasourceFromVariable(self.Datasource) 35 | + var.query.withRefresh(2) 36 | + var.query.selectionOptions.withMulti(false) 37 | + var.query.selectionOptions.withIncludeAll(true), 38 | } 39 | -------------------------------------------------------------------------------- 
/templates/General/pgbench-dashboard.jsonnet: -------------------------------------------------------------------------------- 1 | local annotation = import '../../assets/pgbench-dashboard/annotation.libsonnet'; 2 | local panels = import '../../assets/pgbench-dashboard/panels.libsonnet'; 3 | local queries = import '../../assets/pgbench-dashboard/queries.libsonnet'; 4 | local variables = import '../../assets/pgbench-dashboard/variables.libsonnet'; 5 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 6 | 7 | g.dashboard.new('Pgbench') 8 | + g.dashboard.time.withFrom('now/y') 9 | + g.dashboard.time.withTo('now') 10 | + g.dashboard.withTimezone('utc') 11 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 12 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 13 | + g.dashboard.withRefresh('') 14 | + g.dashboard.withEditable(true) 15 | + g.dashboard.graphTooltip.withSharedCrosshair() 16 | + g.dashboard.withVariables([ 17 | variables.Datasource1, 18 | variables.Datasource2, 19 | variables.uuid, 20 | variables.user, 21 | ]) 22 | + g.dashboard.withAnnotations([ 23 | annotation.run_start_timestamp, 24 | annotation.sample_start_timestamp, 25 | ]) 26 | + g.dashboard.withPanels([ 27 | panels.timeSeries.tps_report('TPS Report', 'ops', queries.tps_report.query(), { x: 0, y: 0, w: 12, h: 9 }), 28 | panels.timeSeries.avg_tps('Overall Average TPS Per Run', 'ops', queries.avg_tps.query(), { x: 12, y: 0, w: 12, h: 9 }), 29 | panels.heatmap.base('Latency Report', 'ms', queries.latency_report.query(), { x: 0, y: 9, w: 12, h: 9 }), 30 | panels.table.base('Result Summary', queries.results.query(), { x: 12, y: 9, w: 12, h: 9 }), 31 | ]) 32 | -------------------------------------------------------------------------------- /templates/General/vegeta-wrapper.jsonnet: 
-------------------------------------------------------------------------------- 1 | local panels = import '../../assets/vegeta-wrapper/panels.libsonnet'; 2 | local queries = import '../../assets/vegeta-wrapper/queries.libsonnet'; 3 | local variables = import '../../assets/vegeta-wrapper/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('Vegeta Results') 7 | + g.dashboard.withDescription(||| 8 | Dashboard for Ingress Performance 9 | |||) 10 | + g.dashboard.withTags([]) // empty tag list rather than a single blank-string tag 11 | + g.dashboard.time.withFrom('now-24h') 12 | + g.dashboard.time.withTo('now') 13 | + g.dashboard.withTimezone('utc') 14 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 15 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 16 | + g.dashboard.withRefresh('') 17 | + g.dashboard.withEditable(false) 18 | + g.dashboard.graphTooltip.withSharedCrosshair() 19 | + g.dashboard.withVariables([ 20 | variables.Datasource, 21 | variables.uuid, 22 | variables.hostname, 23 | variables.targets, 24 | variables.iteration, 25 | ]) 26 | + g.dashboard.withPanels([ 27 | panels.timeSeries.legendDisplayModeTable('RPS (rate of sent requests per second)', 'reqps', queries.rps.query(), { x: 0, y: 0, w: 12, h: 9 }), 28 | panels.timeSeries.legendDisplayModeTable('Throughput (rate of successful requests per second)', 'reqps', queries.throughput.query(), { x: 12, y: 0, w: 12, h: 9 }), 29 | panels.timeSeries.legendDisplayModeTable('Request Latency (observed over given interval)', 'µs', queries.latency.query(), { x: 0, y: 12, w: 12, h: 9 }), 30 | panels.table.base('Vegeta Result Summary', queries.results.query(), { x: 0, y: 24, w: 24, h: 9 }), 31 | ]) 32 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | name: Jsonnet CI 2 | defaults: 3 | run: 4 | shell: bash 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | paths: 10 | - templates/** 11 | - .github/** 12 | - assets/** 13 | pull_request: 14 | branches: [ master ] 15 | paths: 16 | - templates/** 17 | - .github/** 18 | - assets/** 19 | 20 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 21 | jobs: 22 | lint: 23 | runs-on: ubuntu-latest 24 | 25 | steps: 26 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 27 | - uses: actions/checkout@v4 28 | 29 | - name: Get dependencies 30 | run: make deps 31 | 32 | - name: Run jsonnetfmt 33 | run: make format 34 | 35 | build: 36 | runs-on: ubuntu-latest 37 | 38 | steps: 39 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 40 | - uses: actions/checkout@v4 41 | 42 | - name: Compile dashboards 43 | run: make 44 | 45 | - name: Run grafana container 46 | run: sudo docker run -d -p 3000:3000 docker.io/grafana/grafana:9.4.3 47 | 48 | - name: Wait for grafana 49 | run: while [[ $(curl -s -o /dev/null -w '%{http_code}' http://localhost:3000/api/health) != "200" ]]; do sleep 1; done 50 | # curl -f turns an HTTP error from Grafana into a non-zero exit, so a rejected dashboard is no longer silently ignored 51 | - name: Import dashboards to grafana 52 | run: > 53 | for t in rendered/**/*.json; do 54 | echo "Importing ${t}"; 55 | dashboard=$(cat ${t}); 56 | echo "{\"dashboard\": ${dashboard}, \"overwrite\": true}" | 57 | curl -k -f -Ss -XPOST -H "Content-Type: application/json" -H "Accept: application/json" -d@- 58 | "http://admin:admin@localhost:3000/api/dashboards/db" -o /dev/null; 59 | done 60 | -------------------------------------------------------------------------------- /assets/hypershift-perf-dashboard/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = 
g.dashboard.variable; 3 | 4 | { 5 | Namespace: 6 | var.query.new('namespace', 'label_values(kube_pod_info, namespace)') 7 | + var.query.withDatasource('prometheus', 'PF55DCC5EC58ABF5A') 8 | + var.datasource.withRegex('/^ocm/') 9 | + var.query.selectionOptions.withMulti(true) 10 | + var.query.selectionOptions.withIncludeAll(true) 11 | + var.query.generalOptions.withLabel('Namespace') 12 | + var.query.withRefresh(2), 13 | 14 | Resource: 15 | var.query.new('resource', 'label_values(apiserver_request_duration_seconds_bucket, resource)') 16 | + var.query.withDatasource('prometheus', 'PF55DCC5EC58ABF5A') 17 | + var.datasource.withRegex('') 18 | + var.query.selectionOptions.withMulti(true) 19 | + var.query.selectionOptions.withIncludeAll(true) 20 | + var.query.generalOptions.withLabel('resource') 21 | + var.query.withRefresh(2), 22 | 23 | Code: 24 | var.query.new('code', 'label_values(code)') 25 | + var.query.withDatasource('prometheus', 'PF55DCC5EC58ABF5A') 26 | + var.datasource.withRegex('') 27 | + var.query.selectionOptions.withMulti(true) 28 | + var.query.selectionOptions.withIncludeAll(true) 29 | + var.query.generalOptions.withLabel('code') 30 | + var.query.withRefresh(2), 31 | 32 | Verb: 33 | var.query.new('verb', 'label_values(verb)') 34 | + var.query.withDatasource('prometheus', 'PF55DCC5EC58ABF5A') 35 | + var.datasource.withRegex('') 36 | + var.query.selectionOptions.withMulti(true) 37 | + var.query.selectionOptions.withIncludeAll(true) 38 | + var.query.generalOptions.withLabel('verb') 39 | + var.query.withRefresh(2), 40 | } 41 | -------------------------------------------------------------------------------- /templates/General/ycsb.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/ycsb/panels.libsonnet'; 2 | local queries = import '../../assets/ycsb/queries.libsonnet'; 3 | local variables = import '../../assets/ycsb/variables.libsonnet'; 4 | local g = import 
'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('YCSB') 7 | + g.dashboard.time.withFrom('now/y') 8 | + g.dashboard.time.withTo('now') 9 | + g.dashboard.withTimezone('utc') 10 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 11 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 12 | + g.dashboard.withRefresh('') 13 | + g.dashboard.withEditable(false) 14 | + g.dashboard.graphTooltip.withSharedCrosshair() 15 | + g.dashboard.withVariables([ 16 | variables.Datasource1, 17 | variables.Datasource2, 18 | variables.uuid, 19 | variables.user, 20 | variables.phase, 21 | variables.operation, 22 | ]) 23 | + g.dashboard.withPanels([ 24 | panels.timeSeries.throughputOvertimePhase('Throughput overtime - Phase = $phase : Operation = $operation', '$Datasource1', 'ops', queries.throughput_overtime.query(), { x: 0, y: 0, w: 12, h: 9 }), 25 | panels.timeSeries.latency90percReportedFromYCSB('Phase = $phase :: Latency - 90%tile Reported from YCSB', '$Datasource1', 'µs', queries.phase_average_latency.query(), { x: 12, y: 0, w: 12, h: 9 }), 26 | panels.timeSeries.LatancyofEachWorkloadPerYCSBOperation('95th% Latency of each workload per YCSB Operation', '$Datasource2', 'µs', queries.latency_95.query(), { x: 0, y: 9, w: 24, h: 6 }), 27 | panels.timeSeries.overallThroughputPerYCSB('Overall Throughput per YCSB Workload', '$Datasource2', 'ops', queries.overall_workload_throughput.query(), { x: 0, y: 15, w: 16, h: 10 }), 28 | panels.table.base('Phase = $phase :: $operation - Count', '$Datasource2', queries.aggregate_operation_sum.query(), { x: 16, y: 15, w: 8, h: 10 }), 29 | ]) 30 | -------------------------------------------------------------------------------- /assets/ovn-monitoring/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 
'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'prometheus') 7 | + var.datasource.withRegex('') 8 | + var.query.generalOptions.withLabel('Datasource') 9 | + var.query.selectionOptions.withMulti(false) 10 | + var.query.selectionOptions.withIncludeAll(false) 11 | + var.query.withRefresh(1), 12 | 13 | _master_node: 14 | var.query.new('_master_node', 'label_values(kube_node_role{role="master"}, node)') 15 | + var.datasource.withRegex('') 16 | + var.query.generalOptions.withLabel('Master') 17 | + var.query.selectionOptions.withMulti(true) 18 | + var.query.selectionOptions.withIncludeAll(false) 19 | + var.query.withRefresh(2), 20 | 21 | _worker_node: 22 | var.query.new('_worker_node', 'label_values(kube_node_role{role=~"work.*"}, node)') 23 | + var.datasource.withRegex('') 24 | + var.query.generalOptions.withLabel('Worker') 25 | + var.query.selectionOptions.withMulti(true) 26 | + var.query.selectionOptions.withIncludeAll(false) 27 | + var.query.withRefresh(2), 28 | 29 | master_pod: 30 | var.query.new('master_pod', 'label_values({pod=~"ovnkube-master.*", namespace=~"openshift-ovn-kubernetes"}, pod)') 31 | + var.datasource.withRegex('') 32 | + var.query.generalOptions.withLabel('OVNKube-Master') 33 | + var.query.selectionOptions.withMulti(true) 34 | + var.query.selectionOptions.withIncludeAll(false) 35 | + var.query.withRefresh(1), 36 | 37 | kubenode_pod: 38 | var.query.new('kubenode_pod', 'label_values({pod=~"ovnkube-node.*", namespace=~"openshift-ovn-kubernetes"}, pod)') 39 | + var.datasource.withRegex('') 40 | + var.query.generalOptions.withLabel('OVNKube-Node') 41 | + var.query.selectionOptions.withMulti(true) 42 | + var.query.selectionOptions.withIncludeAll(false) 43 | + var.query.withRefresh(1), 44 | } 45 | -------------------------------------------------------------------------------- /assets/ycsb/variables.libsonnet: 
-------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource1: 6 | var.datasource.new('Datasource1', 'elasticsearch') 7 | + var.datasource.withRegex('') 8 | + var.query.withRefresh(1) 9 | + var.query.generalOptions.withLabel('ycsb-results datasource'), 10 | 11 | Datasource2: 12 | var.datasource.new('Datasource2', 'elasticsearch') 13 | + var.datasource.withRegex('') 14 | + var.query.withRefresh(1) 15 | + var.query.generalOptions.withLabel('ycsb-summary datasource'), 16 | 17 | uuid: 18 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword"}') 19 | + var.query.withDatasourceFromVariable(self.Datasource2) 20 | + var.query.withRefresh(2) 21 | + var.datasource.withRegex('') 22 | + var.query.selectionOptions.withMulti(false) 23 | + var.query.selectionOptions.withIncludeAll(true), 24 | 25 | user: 26 | var.query.new('user', '{"find": "terms", "field": "user.keyword"}') 27 | + var.query.withDatasourceFromVariable(self.Datasource2) 28 | + var.query.withRefresh(2) 29 | + var.datasource.withRegex('') 30 | + var.query.selectionOptions.withMulti(false) 31 | + var.query.selectionOptions.withIncludeAll(true), 32 | 33 | phase: 34 | var.query.new('phase', '{"find": "terms", "field": "phase.keyword"}') 35 | + var.query.withDatasourceFromVariable(self.Datasource2) 36 | + var.query.withRefresh(2) 37 | + var.datasource.withRegex('') 38 | + var.query.generalOptions.withCurrent('run', 'run') 39 | + var.query.selectionOptions.withMulti(false) 40 | + var.query.selectionOptions.withIncludeAll(true), 41 | 42 | operation: 43 | var.query.new('operation', '{"find": "fields", "field": "data.*.Operations"}') 44 | + var.query.withDatasourceFromVariable(self.Datasource2) 45 | + var.query.withRefresh(2) 46 | + var.query.generalOptions.withCurrent('READ', 'READ') 47 | + var.datasource.withRegex('/data.(.*).Operations/') 
48 | + var.query.selectionOptions.withMulti(false) 49 | + var.query.selectionOptions.withIncludeAll(true), 50 | } 51 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ARCH := $(shell arch) 2 | OS_TYPE := $(shell uname) 3 | JB_OS_TYPE := $(shell uname | tr '[:upper:]' '[:lower:]') 4 | JSONNET := https://github.com/google/go-jsonnet/releases/download/v0.20.0/go-jsonnet_0.20.0_$(OS_TYPE)_$(ARCH).tar.gz 5 | JB := https://github.com/jsonnet-bundler/jsonnet-bundler/releases/latest/download/jb-$(JB_OS_TYPE)-$(subst x86_64,amd64,$(ARCH)) 6 | BINDIR = bin 7 | TEMPLATESDIR = templates 8 | ASSETS := $(wildcard assets/**/*.libsonnet) 9 | OUTPUTDIR = rendered 10 | ALLDIRS = $(BINDIR) $(OUTPUTDIR) 11 | SYNCER_IMG_TAG ?= quay.io/cloud-bulldozer/dittybopper-syncer:latest 12 | PLATFORM = linux/amd64,linux/arm64,linux/ppc64le,linux/s390x 13 | 14 | # Get all templates at $(TEMPLATESDIR) 15 | TEMPLATES := $(wildcard $(TEMPLATESDIR)/**/*.jsonnet) 16 | LIBRARY_PATH := $(TEMPLATESDIR)/vendor 17 | 18 | # Replace $(TEMPLATESDIR)/*.jsonnet by $(OUTPUTDIR)/*.json 19 | outputs := $(patsubst $(TEMPLATESDIR)/%.jsonnet, $(OUTPUTDIR)/%.json, $(TEMPLATES)) 20 | 21 | all: deps format build 22 | 23 | deps: $(ALLDIRS) $(BINDIR)/jsonnet $(LIBRARY_PATH) 24 | 25 | $(ALLDIRS): 26 | mkdir -p $(ALLDIRS) 27 | 28 | format: deps 29 | $(BINDIR)/jsonnetfmt -i $(TEMPLATES) $(ASSETS) 30 | 31 | build: deps $(LIBRARY_PATH) $(outputs) 32 | 33 | clean-all: 34 | @echo "Cleaning up" 35 | rm -rf $(ALLDIRS) $(TEMPLATESDIR)/vendor 36 | 37 | clean: 38 | @echo "Cleaning up" 39 | rm -rf $(OUTPUTDIR) 40 | # Fetch the jsonnet tarball and the jb binary; curl -f fails on an HTTP error instead of saving the error page as a binary 41 | $(BINDIR)/jsonnet: 42 | @echo "Downloading jsonnet binary" 43 | curl -sSfL $(JSONNET) | tar xz -C $(BINDIR) 44 | @echo "Downloading jb binary" 45 | curl -sSfL $(JB) -o $(BINDIR)/jb 46 | chmod +x $(BINDIR)/jb 47 | 48 | $(TEMPLATESDIR)/vendor: 49 | @echo "Downloading vendor files" 50 | cd 
$(TEMPLATESDIR) && ../$(BINDIR)/jb install && cd ../ 51 | 52 | # Build each template and output to $(OUTPUTDIR) 53 | $(OUTPUTDIR)/%.json: $(TEMPLATESDIR)/%.jsonnet $(ASSETS) 54 | @echo "Building template $<" 55 | mkdir -p $(dir $@) 56 | $(BINDIR)/jsonnet -J ./$(LIBRARY_PATH) $< > $@ 57 | 58 | build-syncer-image: build 59 | podman build --platform=${PLATFORM} -f Dockerfile --manifest=${SYNCER_IMG_TAG} . 60 | 61 | push-syncer-image: 62 | podman manifest push ${SYNCER_IMG_TAG} ${SYNCER_IMG_TAG} 63 | -------------------------------------------------------------------------------- /assets/k8s-perf/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | /* Template variables for the k8s-perf dashboard. Every query variable is bound to the Datasource picker so it is resolved against the datasource the panels use ($Datasource). */ 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'prometheus') 7 | + var.datasource.withRegex('') 8 | + var.query.withRefresh(1) 9 | + var.query.selectionOptions.withIncludeAll(false) 10 | + var.query.selectionOptions.withMulti(false), 11 | 12 | _worker_node: 13 | var.query.new('_worker_node', 'label_values(kube_node_labels{}, exported_node)') + var.query.withDatasourceFromVariable(self.Datasource) 14 | + var.query.generalOptions.withLabel('Worker') 15 | + var.query.withSort(0) 16 | + var.query.withRefresh(2) 17 | + var.query.selectionOptions.withIncludeAll(false) 18 | + var.query.selectionOptions.withMulti(true), 19 | 20 | namespace: 21 | var.query.new('namespace', 'label_values(kube_pod_info, exported_namespace)') + var.query.withDatasourceFromVariable(self.Datasource) 22 | + var.query.generalOptions.withLabel('Namespace') 23 | + var.query.withSort(0) 24 | + var.query.withRefresh(2) 25 | + var.query.selectionOptions.withIncludeAll(true) 26 | + var.query.selectionOptions.withMulti(false), 27 | 28 | block_device: 29 | var.query.new('block_device', 'label_values(node_disk_written_bytes_total,device)') + var.query.withDatasourceFromVariable(self.Datasource) 30 | + var.query.generalOptions.withLabel('Block device') 31 | + var.query.withSort(0) 32 | + var.query.withRegex('/^(?:(?!dm|rb).)*$/') 33 | +
var.query.withRefresh(2) 34 | + var.query.selectionOptions.withIncludeAll(true) 35 | + var.query.selectionOptions.withMulti(true), 36 | 37 | net_device: 38 | var.query.new('net_device', 'label_values(node_network_receive_bytes_total,device)') + var.query.withDatasourceFromVariable(self.Datasource) 39 | + var.query.generalOptions.withLabel('Network device') 40 | + var.query.withSort(0) 41 | + var.query.withRegex('/^((br|en|et).*)$/') 42 | + var.query.withRefresh(2) 43 | + var.query.selectionOptions.withIncludeAll(true) 44 | + var.query.selectionOptions.withMulti(true), 45 | 46 | interval: 47 | var.interval.new('interval', ['2m', '3m', '4m', '5m']) 48 | + var.query.withDatasourceFromVariable(self.Datasource) 49 | + var.interval.generalOptions.withLabel('interval') 50 | + var.interval.withAutoOption(count=30, minInterval='10s') 51 | + var.query.withRefresh(2) 52 | + var.query.selectionOptions.withMulti(false) 53 | + var.query.selectionOptions.withIncludeAll(false), 54 | } 55 | -------------------------------------------------------------------------------- /assets/cilium-k8s-perf/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | /* Template variables for the cilium-k8s-perf dashboard: a Prometheus datasource picker plus query variables resolved against it. */ 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'prometheus') 7 | + var.datasource.withRegex('') 8 | + var.query.generalOptions.withLabel('Datasource') 9 | + var.query.withRefresh(1), 10 | 11 | _worker_node: 12 | var.query.new('_worker_node', 'label_values(kube_node_labels{}, exported_node)') 13 | + var.query.withDatasourceFromVariable(self.Datasource) 14 | + var.query.selectionOptions.withMulti(true) 15 | + var.query.selectionOptions.withIncludeAll(false) 16 | + var.query.generalOptions.withLabel('Worker') 17 | + var.query.withRefresh(2), 18 | 19 | namespace: 20 | var.query.new('namespace', 'label_values(kube_pod_info, exported_namespace)') 21 | + var.query.withDatasourceFromVariable(self.Datasource)
22 | + var.query.selectionOptions.withMulti(false) 23 | + var.query.selectionOptions.withIncludeAll(true) 24 | + var.query.generalOptions.withLabel('Namespace') 25 | + var.query.withRefresh(2), 26 | 27 | block_device: 28 | var.query.new('block_device', 'label_values(node_disk_written_bytes_total,device)') 29 | + var.query.withDatasourceFromVariable(self.Datasource) 30 | + var.query.withRegex('/^(?:(?!dm|rb).)*$/') 31 | + var.query.selectionOptions.withMulti(true) 32 | + var.query.selectionOptions.withIncludeAll(true) 33 | + var.query.generalOptions.withLabel('Block device') 34 | + var.query.withRefresh(2), 35 | 36 | net_device: 37 | var.query.new('net_device', 'label_values(node_network_receive_bytes_total,device)') 38 | + var.query.withDatasourceFromVariable(self.Datasource) 39 | + var.query.withRegex('/^((br|en|et).*)$/') 40 | + var.query.selectionOptions.withMulti(true) 41 | + var.query.selectionOptions.withIncludeAll(true) 42 | + var.query.generalOptions.withLabel('Network device') 43 | + var.query.withRefresh(2), 44 | 45 | interval: 46 | var.interval.new('interval', ['2m', '3m', '4m', '5m']) 47 | + var.query.withDatasourceFromVariable(self.Datasource) 48 | + var.interval.generalOptions.withLabel('interval') 49 | + var.query.withRefresh(2) 50 | + var.query.selectionOptions.withMulti(false) 51 | + var.query.selectionOptions.withIncludeAll(true), 52 | 53 | } 54 | -------------------------------------------------------------------------------- /assets/api-performance-overview/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | /* Time-series panel builders for the api-performance-overview dashboard. Each builder takes (title, unit, targets, gridPos), where gridPos is an object with x, y, h and w fields. */ 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | /* base: Prometheus panel on $Datasource with shared line styling and a legend sorted by Max, descending. */ 9 | base(title, unit, targets, gridPos): 10 | timeSeries.new(title) 11 | + timeSeries.queryOptions.withTargets(targets) 12 |
+ timeSeries.datasource.withType('prometheus') 13 | + timeSeries.datasource.withUid('$Datasource') 14 | + timeSeries.standardOptions.withUnit(unit) 15 | + timeSeries.gridPos.withX(gridPos.x) 16 | + timeSeries.gridPos.withY(gridPos.y) 17 | + timeSeries.gridPos.withH(gridPos.h) 18 | + timeSeries.gridPos.withW(gridPos.w) 19 | + custom.withDrawStyle('line') 20 | + custom.withLineInterpolation('linear') 21 | + custom.withBarAlignment(0) 22 | + custom.withLineWidth(1) 23 | + custom.withFillOpacity(10) 24 | + custom.withGradientMode('none') 25 | + custom.withSpanNulls(false) 26 | + custom.withPointSize(5) 27 | + custom.stacking.withGroup('A') 28 | + custom.stacking.withMode('none') 29 | + custom.withShowPoints('never') 30 | + options.tooltip.withSort('desc') 31 | + timeSeries.queryOptions.withTimeFrom(null) 32 | + timeSeries.queryOptions.withTimeShift(null) 33 | + options.legend.withSortBy('Max') 34 | + options.legend.withSortDesc(true), 35 | /* legendRightPlacement: base panel with a table legend on the right showing 'max' and a multi-series tooltip. */ 36 | legendRightPlacement(title, unit, targets, gridPos): 37 | self.base(title, unit, targets, gridPos) 38 | + options.legend.withCalcs([ 39 | 'max', 40 | ]) 41 | + options.legend.withShowLegend(true) 42 | + options.legend.withDisplayMode('table') 43 | + options.legend.withPlacement('right') 44 | + options.legend.withAsTable(true) 45 | + options.tooltip.withMode('multi'), 46 | /* legendBottomPlacement: base panel with a list legend at the bottom and a multi-series tooltip. */ 47 | legendBottomPlacement(title, unit, targets, gridPos): 48 | self.base(title, unit, targets, gridPos) 49 | + options.legend.withShowLegend(true) 50 | + options.legend.withDisplayMode('list') 51 | + options.legend.withPlacement('bottom') 52 | + options.tooltip.withMode('multi'), 53 | /* withRequestWaitDurationAggregations: right-hand legend variant whose calcs are replaced by mean, max and lastNotNull. */ 54 | withRequestWaitDurationAggregations(title, unit, targets, gridPos): 55 | self.legendRightPlacement(title, unit, targets, gridPos) 56 | + options.legend.withCalcs([ 57 | 'mean', 58 | 'max', 59 | 'lastNotNull', 60 | ]), 61 | }, 62 | } 63 |
-------------------------------------------------------------------------------- /dittybopper/README.md: -------------------------------------------------------------------------------- 1 | # Dittybopper 2 | 3 | ![Dittybopper screenshot](dittybopper_screenshot.png) 4 | 5 | ## whatis 6 | 7 | Dittybopper is a quick-and-dirty way to deploy system-level submetric monitoring with Grafana 8 | in an OpenShift 4 environment. It deploys a stand-alone mutable Grafana pod with default 9 | dashboards to monitor things like CPU, memory, network, and disk activity. 10 | The Grafana charts expect to receive metrics from an existing Prometheus 11 | deployment and node exporters. 12 | 13 | ## Getting Started / Prerequisites 14 | 15 | Right now Dittybopper has a number of FIXMEs that need to be addressed before it will be more portable across 16 | k8s/OpenShift environments. It should generally deploy out-of-the-box with OpenShift 4. Other environments 17 | will likely have a prerequisite to first stand up a Prometheus pod, and the Dittybopper scripts and 18 | templates will need adjustment accordingly. 19 | 20 | ## Syncer Image and Deploying Forked Changes 21 | 22 | To support disconnected environments, the syncer image bundles all of the dashboards. To deploy dittybopper with changed 23 | dashboards, you need to build the image yourself from the root of the repository and update the SYNCER_IMAGE environment 24 | variable to match your own image repository. 25 | 26 | If you are deploying in a disconnected environment, you need to sync the cloud-bulldozer grafana image (shown in the 27 | dittybopper/templates/dittybopper.yaml.template file) and your chosen syncer image 28 | (defaults to quay.io/cloud-bulldozer/dittybopper-syncer:latest). 29 | 30 | The syncer image is built with the build context at the root of the repository, using the Dockerfile in the root directory.
31 | You can build it with `make build-syncer-image SYNCER_IMG_TAG=container.registry.org/organization/syncer:latest`. 32 | Alternatively, you can run the following command from the root folder of this repository: `podman build -f Dockerfile -t=container.registry.org/organization/syncer:latest .` 33 | 34 | ## Contribute 35 | 36 | Pull requests are encouraged. If you find this tool useful, please help extend it for more use cases. 37 | 38 | ## Deploy 39 | 40 | ### Deploy Grafana on OpenShift Cluster with Dashboards 41 | 42 | ``` 43 | $ git clone https://github.com/cloud-bulldozer/performance-dashboards.git 44 | $ cd performance-dashboards/dittybopper 45 | $ ./deploy.sh [-c <kubectl_cmd>] [-n <namespace>] [-p <grafana_password>] 46 | ``` 47 | 48 | See `./deploy.sh -h` for help. 49 | 50 | Simply running `./deploy.sh` with no flags will assume OpenShift, the _dittybopper_ namespace, and _admin_ for the password. 51 | 52 | ### Import Dashboard 53 | 54 | This will import a dashboard (json) into an existing Dittybopper Grafana deployment. The dashboard path 55 | can be either a local file or a URL beginning with http.
56 | 57 | ``` 58 | $ ./deploy.sh -i <dashboard_path_or_url> 59 | ``` 60 | 61 | ### Delete Grafana Deployment 62 | 63 | ``` 64 | $ ./deploy.sh -d 65 | ``` 66 | -------------------------------------------------------------------------------- /assets/k8s-perf/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | /* Stat and time-series panel builders for the k8s-perf dashboard. Every panel uses the Prometheus datasource selected through $Datasource; gridPos is an object with x, y, h and w fields. */ 3 | { 4 | stat: { 5 | local stat = g.panel.stat, 6 | local options = stat.options, 7 | /* base: stat panel with no sparkline (graph mode 'none') and no value colouring (color mode 'none'). */ 8 | base(title, unit, targets, gridPos): 9 | stat.new(title) 10 | + stat.datasource.withType('prometheus') 11 | + stat.datasource.withUid('$Datasource') 12 | + stat.standardOptions.withUnit(unit) 13 | + stat.queryOptions.withTargets(targets) 14 | + stat.gridPos.withX(gridPos.x) 15 | + stat.gridPos.withY(gridPos.y) 16 | + stat.gridPos.withH(gridPos.h) 17 | + stat.gridPos.withW(gridPos.w) 18 | + options.withJustifyMode('auto') 19 | + options.withGraphMode('none') 20 | + options.text.withTitleSize(12) 21 | + stat.standardOptions.color.withMode('thresholds') 22 | + options.withColorMode('none'), 23 | /* genericStatLegendPanel: base stat panel reduced with the 'last' calculation. */ 24 | genericStatLegendPanel(title, unit, targets, gridPos): 25 | self.base(title, unit, targets, gridPos) 26 | + stat.options.reduceOptions.withCalcs([ 27 | 'last', 28 | ]), 29 | }, 30 | 31 | timeSeries: { 32 | local timeSeries = g.panel.timeSeries, 33 | local custom = timeSeries.fieldConfig.defaults.custom, 34 | local options = timeSeries.options, 35 | /* base: line chart with a bottom legend and a multi-series tooltip sorted descending. */ 36 | base(title, unit, targets, gridPos): 37 | timeSeries.new(title) 38 | + timeSeries.queryOptions.withTargets(targets) 39 | + timeSeries.datasource.withType('prometheus') 40 | + timeSeries.datasource.withUid('$Datasource') 41 | + timeSeries.standardOptions.withUnit(unit) 42 | + timeSeries.gridPos.withX(gridPos.x) 43 | + timeSeries.gridPos.withY(gridPos.y) 44 | + timeSeries.gridPos.withH(gridPos.h) 45 | + timeSeries.gridPos.withW(gridPos.w) 46 | + custom.withDrawStyle('line') 47 | + custom.withLineInterpolation('linear')
48 | + custom.withBarAlignment(0) 49 | + custom.withLineWidth(1) 50 | + custom.withFillOpacity(10) 51 | + custom.withGradientMode('none') 52 | + custom.withSpanNulls(false) 53 | + custom.withPointSize(5) 54 | + custom.stacking.withMode('none') 55 | + custom.withShowPoints('never') 56 | + options.tooltip.withMode('multi') 57 | + options.tooltip.withSort('desc') 58 | + options.legend.withShowLegend(true) 59 | + options.legend.withPlacement('bottom'), 60 | /* genericTimeSeriesPanel: base panel with a table legend and no legend calculations. */ 61 | genericTimeSeriesPanel(title, unit, targets, gridPos): 62 | self.base(title, unit, targets, gridPos) 63 | + options.legend.withCalcs([]) 64 | + options.legend.withDisplayMode('table'), 65 | /* genericTimeSeriesLegendPanel: base panel with a table legend showing mean and max. */ 66 | genericTimeSeriesLegendPanel(title, unit, targets, gridPos): 67 | self.base(title, unit, targets, gridPos) 68 | + options.legend.withCalcs([ 69 | 'mean', 70 | 'max', 71 | ]) 72 | + options.legend.withDisplayMode('table'), 73 | 74 | }, 75 | } 76 | -------------------------------------------------------------------------------- /assets/ocp-performance/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | /* Time-series and stat panel builders for the ocp-performance dashboard; every panel points at the $Datasource variable. */ 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local fieldOverride = g.panel.timeSeries.fieldOverride, 7 | local custom = timeSeries.fieldConfig.defaults.custom, 8 | local options = timeSeries.options, 9 | local standardOptions = timeSeries.standardOptions, 10 | local byRegexp = timeSeries.standardOptions.override.byRegexp, 11 | /* generic: shared base with a table legend and a multi-series tooltip sorted descending. */ 12 | generic(title, unit, targets, gridPos): 13 | timeSeries.new(title) 14 | + timeSeries.queryOptions.withTargets(targets) 15 | + timeSeries.datasource.withUid('$Datasource') 16 | + timeSeries.standardOptions.withUnit(unit) 17 | + timeSeries.gridPos.withX(gridPos.x) 18 | + timeSeries.gridPos.withY(gridPos.y) 19 | + timeSeries.gridPos.withH(gridPos.h) 20 | + timeSeries.gridPos.withW(gridPos.w) 21 | +
custom.withSpanNulls(false) 22 | + options.tooltip.withMode('multi') 23 | + options.tooltip.withSort('desc') 24 | + options.legend.withDisplayMode('table'), 25 | /* genericLegend: bottom legend with mean/min/max, sorted by Max descending. */ 26 | genericLegend(title, unit, targets, gridPos): 27 | self.generic(title, unit, targets, gridPos) 28 | + options.legend.withShowLegend(true) 29 | + options.legend.withCalcs([ 30 | 'mean', 31 | 'min', 32 | 'max', 33 | ]) 34 | + options.legend.withSortBy('Max') 35 | + options.legend.withSortDesc(true) 36 | + options.legend.withPlacement('bottom'), 37 | /* genericLegendCounter: bottom legend with first/min/max/last, sorted by Max descending. */ 38 | genericLegendCounter(title, unit, targets, gridPos): 39 | self.generic(title, unit, targets, gridPos) 40 | + options.legend.withShowLegend(true) 41 | + options.legend.withCalcs([ 42 | 'first', 43 | 'min', 44 | 'max', 45 | 'last', 46 | ]) 47 | + options.legend.withSortBy('Max') 48 | + options.legend.withSortDesc(true) 49 | + options.legend.withPlacement('bottom'), 50 | /* genericLegendCounterSumRightHand: counter legend plus an override that puts series matching the regexp 'sum' on a right-hand axis labelled 'sum'. */ 51 | genericLegendCounterSumRightHand(title, unit, targets, gridPos): 52 | self.genericLegendCounter(title, unit, targets, gridPos) 53 | + options.legend.withDisplayMode('table') 54 | + options.legend.withSortBy('Max') 55 | + standardOptions.withOverrides([ 56 | byRegexp.new('sum') 57 | + byRegexp.withProperty('custom.axisPlacement', 'right') 58 | + byRegexp.withProperty('custom.axisLabel', 'sum'), 59 | ]), 60 | }, 61 | stat: { 62 | local stat = g.panel.stat, 63 | local options = stat.options, 64 | /* base: stat panel reduced with 'last'; unlike the time-series builders it takes no unit argument. */ 65 | base(title, targets, gridPos): 66 | stat.new(title) 67 | + stat.datasource.withUid('$Datasource') 68 | + stat.queryOptions.withTargets(targets) 69 | + stat.gridPos.withX(gridPos.x) 70 | + stat.gridPos.withY(gridPos.y) 71 | + stat.gridPos.withH(gridPos.h) 72 | + stat.gridPos.withW(gridPos.w) 73 | + options.reduceOptions.withCalcs([ 74 | 'last', 75 | ]), 76 | }, 77 | } 78 | -------------------------------------------------------------------------------- /assets/cilium-k8s-perf/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g =
import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | /* Time-series and stat panel builders for the cilium-k8s-perf dashboard. NOTE(review): timeSeries.withClusterAgg and stat.withclusterAgg differ only in letter case; both names are kept as-is because callers outside this file may use them. */ 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | /* base: Prometheus line chart on $Datasource with the shared line styling; legend and tooltip are set by the variants below. */ 9 | base(title, unit, targets, gridPos): 10 | timeSeries.new(title) 11 | + timeSeries.queryOptions.withTargets(targets) 12 | + timeSeries.datasource.withType('prometheus') 13 | + timeSeries.datasource.withUid('$Datasource') 14 | + timeSeries.standardOptions.withUnit(unit) 15 | + timeSeries.gridPos.withX(gridPos.x) 16 | + timeSeries.gridPos.withY(gridPos.y) 17 | + timeSeries.gridPos.withH(gridPos.h) 18 | + timeSeries.gridPos.withW(gridPos.w) 19 | + custom.withDrawStyle('line') 20 | + custom.withLineInterpolation('linear') 21 | + custom.withBarAlignment(0) 22 | + custom.withLineWidth(1) 23 | + custom.withFillOpacity(10) 24 | + custom.withGradientMode('none') 25 | + custom.withSpanNulls(false) 26 | + custom.withPointSize(5) 27 | + custom.stacking.withMode('none') 28 | + custom.withShowPoints('never'), 29 | /* withCiliumAgg: bottom table legend showing mean and max. */ 30 | withCiliumAgg(title, unit, targets, gridPos): 31 | self.base(title, unit, targets, gridPos) 32 | + options.tooltip.withMode('multi') 33 | + options.tooltip.withSort('desc') 34 | + options.legend.withShowLegend(true) 35 | + options.legend.withPlacement('bottom') 36 | + options.legend.withDisplayMode('table') 37 | + options.legend.withCalcs([ 38 | 'mean', 39 | 'max', 40 | ]), 41 | /* withClusterAgg: same legend layout as withCiliumAgg but with no legend calculations. */ 42 | withClusterAgg(title, unit, targets, gridPos): 43 | self.base(title, unit, targets, gridPos) 44 | + options.tooltip.withMode('multi') 45 | + options.tooltip.withSort('desc') 46 | + options.legend.withShowLegend(true) 47 | + options.legend.withPlacement('bottom') 48 | + options.legend.withDisplayMode('table') 49 | + options.legend.withCalcs([]), 50 | }, 51 | 52 | stat: { 53 | local stat = g.panel.stat, 54 | local options = stat.options, 55 | /* base: Prometheus stat panel on $Datasource with an area sparkline. */ 56 | base(title, unit, targets, gridPos): 57 | stat.new(title)
58 | + stat.datasource.withType('prometheus') 59 | + stat.datasource.withUid('$Datasource') 60 | + stat.standardOptions.withUnit(unit) 61 | + stat.queryOptions.withTargets(targets) 62 | + stat.gridPos.withX(gridPos.x) 63 | + stat.gridPos.withY(gridPos.y) 64 | + stat.gridPos.withH(gridPos.h) 65 | + stat.gridPos.withW(gridPos.w) 66 | + options.withJustifyMode('auto') 67 | + options.withGraphMode('area') 68 | + options.text.withTitleSize(12), 69 | /* withclusterAgg: base stat reduced with 'last' and an empty threshold step list. */ 70 | withclusterAgg(title, unit, targets, gridPos): 71 | self.base(title, unit, targets, gridPos) 72 | + options.reduceOptions.withCalcs([ 73 | 'last', 74 | ]) 75 | + stat.standardOptions.thresholds.withSteps([]), 76 | }, 77 | } 78 | -------------------------------------------------------------------------------- /assets/uperf/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | /* Template variables for the uperf dashboard: an Elasticsearch datasource picker restricted to names matching /(.*uperf.*)/, plus one single-select terms-query variable (with an All option) per filter field. */ 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'elasticsearch') 7 | + var.datasource.withRegex('/(.*uperf.*)/') 8 | + var.query.generalOptions.withLabel('uperf-results datasource') 9 | + var.query.withRefresh(1), 10 | 11 | uuid: 12 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword"}') 13 | + var.query.withDatasourceFromVariable(self.Datasource) 14 | + var.query.selectionOptions.withMulti(false) 15 | + var.query.selectionOptions.withIncludeAll(true) 16 | + var.query.withRefresh(2), 17 | 18 | cluster_name: 19 | var.query.new('cluster_name', '{"find": "terms", "field": "cluster_name.keyword"}') 20 | + var.query.withDatasourceFromVariable(self.Datasource) 21 | + var.query.selectionOptions.withMulti(false) 22 | + var.query.selectionOptions.withIncludeAll(true) 23 | + var.query.withRefresh(2), 24 | 25 | user: 26 | var.query.new('user', '{"find": "terms", "field": "user.keyword"}') 27 | + var.query.withDatasourceFromVariable(self.Datasource) 28 | +
var.query.selectionOptions.withMulti(false) 29 | + var.query.selectionOptions.withIncludeAll(true) 30 | + var.query.withRefresh(2), 31 | 32 | iteration: 33 | var.query.new('iteration', '{"find": "terms", "field": "iteration"}') 34 | + var.query.withDatasourceFromVariable(self.Datasource) 35 | + var.query.selectionOptions.withMulti(false) 36 | + var.query.selectionOptions.withIncludeAll(true) 37 | + var.query.withRefresh(2), 38 | 39 | server: 40 | var.query.new('server', '{"find": "terms", "field": "remote_ip.keyword"}') 41 | + var.query.withDatasourceFromVariable(self.Datasource) 42 | + var.query.selectionOptions.withMulti(false) 43 | + var.query.selectionOptions.withIncludeAll(true) 44 | + var.query.withRefresh(2), 45 | 46 | test_type: 47 | var.query.new('test_type', '{"find": "terms", "field": "test_type.keyword"}') 48 | + var.query.withDatasourceFromVariable(self.Datasource) 49 | + var.query.selectionOptions.withMulti(false) 50 | + var.query.selectionOptions.withIncludeAll(true) 51 | + var.query.withRefresh(2), 52 | 53 | protocol: 54 | var.query.new('protocol', '{"find": "terms", "field": "protocol.keyword"}') 55 | + var.query.withDatasourceFromVariable(self.Datasource) 56 | + var.query.selectionOptions.withMulti(false) 57 | + var.query.selectionOptions.withIncludeAll(true) 58 | + var.query.withRefresh(2), 59 | 60 | message_size: 61 | var.query.new('message_size', '{"find": "terms", "field": "message_size"}') 62 | + var.query.withDatasourceFromVariable(self.Datasource) 63 | + var.query.selectionOptions.withMulti(false) 64 | + var.query.selectionOptions.withIncludeAll(true) 65 | + var.query.withRefresh(2), 66 | 67 | threads: 68 | var.query.new('threads', '{"find": "terms", "field": "num_threads"}') 69 | + var.query.withDatasourceFromVariable(self.Datasource) 70 | + var.query.selectionOptions.withMulti(false) 71 | + var.query.selectionOptions.withIncludeAll(true) 72 | + var.query.withRefresh(2), 73 | } 74 |
-------------------------------------------------------------------------------- /assets/ocp-performance/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | /* Template variables for the ocp-performance dashboard: a Prometheus datasource picker, node pickers per role, namespace and device pickers, and a rate interval. */ 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'prometheus') 7 | + var.query.generalOptions.withLabel('Datasource'), 8 | 9 | master_node: 10 | var.query.new('_master_node') 11 | + var.query.withDatasourceFromVariable(self.Datasource) 12 | + var.query.queryTypes.withLabelValues( 13 | 'node', 14 | 'kube_node_role{role="master"}', 15 | ) 16 | + var.query.withRefresh(2) 17 | + var.query.selectionOptions.withMulti() 18 | + var.query.selectionOptions.withIncludeAll(false) 19 | + var.query.generalOptions.withLabel('Master'), 20 | 21 | worker_node: 22 | var.query.new('_worker_node') 23 | + var.query.withDatasourceFromVariable(self.Datasource) 24 | + var.query.queryTypes.withLabelValues( 25 | 'node', 26 | 'kube_node_role{role=~"worker"}', 27 | ) 28 | + var.query.withRefresh(2) 29 | + var.query.selectionOptions.withMulti() 30 | + var.query.selectionOptions.withIncludeAll(false) 31 | + var.query.generalOptions.withLabel('Worker'), 32 | 33 | infra_node: 34 | var.query.new('_infra_node') 35 | + var.query.withDatasourceFromVariable(self.Datasource) 36 | + var.query.queryTypes.withLabelValues( 37 | 'node', 38 | 'kube_node_role{role="infra"}', 39 | ) 40 | + var.query.withRefresh(2) 41 | + var.query.selectionOptions.withMulti() 42 | + var.query.selectionOptions.withIncludeAll(false) 43 | + var.query.generalOptions.withLabel('Infra'), 44 | /* The selector uses the negative regex matcher (!~): with != the pattern would be compared as a literal string and no namespace would be filtered out. */ 45 | namespace: 46 | var.query.new('namespace') 47 | + var.query.withDatasourceFromVariable(self.Datasource) 48 | + var.query.queryTypes.withLabelValues( 49 | 'namespace', 50 | 'kube_pod_info{namespace!~"(cluster-density.*|node-density-.*)"}', 51 | ) 52 | + var.query.withRefresh(2) 53 | +
var.query.withRegex('') 54 | + var.query.selectionOptions.withMulti(false) 55 | + var.query.selectionOptions.withIncludeAll(true) 56 | + var.query.generalOptions.withLabel('Namespace'), 57 | 58 | block_device: 59 | var.query.new('block_device') 60 | + var.query.withDatasourceFromVariable(self.Datasource) 61 | + var.query.queryTypes.withLabelValues( 62 | 'device', 63 | 'node_disk_written_bytes_total', 64 | ) 65 | + var.query.withRefresh(2) 66 | + var.query.withRegex('/^(?:(?!dm|rb).)*$/') 67 | + var.query.selectionOptions.withMulti(true) 68 | + var.query.selectionOptions.withIncludeAll(true) 69 | + var.query.generalOptions.withLabel('Block device'), 70 | 71 | net_device: 72 | var.query.new('net_device') 73 | + var.query.withDatasourceFromVariable(self.Datasource) 74 | + var.query.queryTypes.withLabelValues( 75 | 'device', 76 | 'node_network_receive_bytes_total', 77 | ) 78 | + var.query.withRefresh(2) 79 | + var.query.withRegex('/^((br|en|et).*)$/') 80 | + var.query.selectionOptions.withMulti(true) 81 | + var.query.selectionOptions.withIncludeAll(true) 82 | + var.query.generalOptions.withLabel('Network device'), 83 | 84 | interval: 85 | var.interval.new('interval', ['2m', '3m', '4m', '5m'],) 86 | + var.interval.generalOptions.withLabel('interval'), 87 | } 88 | -------------------------------------------------------------------------------- /assets/api-performance-overview/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | /* Template variables for the api-performance-overview dashboard: a Prometheus datasource picker, single-select label_values filters (each with an All option), and a rate interval. */ 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'prometheus') 7 | + var.datasource.withRegex('') 8 | + var.query.generalOptions.withLabel('Datasource') 9 | + var.query.withRefresh(1), 10 | apiserver: 11 | var.query.new('apiserver', 'label_values(apiserver_request_duration_seconds_bucket, apiserver)') 12 | + var.query.withDatasourceFromVariable(self.Datasource)
13 | + var.query.selectionOptions.withMulti(false) 14 | + var.query.selectionOptions.withIncludeAll(true) 15 | + var.query.generalOptions.withLabel('apiserver') 16 | + var.query.withRefresh(2), 17 | 18 | instance: 19 | var.query.new('instance', 'label_values(apiserver_request_total, instance)') 20 | + var.query.withDatasourceFromVariable(self.Datasource) 21 | + var.query.selectionOptions.withMulti(false) 22 | + var.query.selectionOptions.withIncludeAll(true) 23 | + var.query.generalOptions.withLabel('instance') 24 | + var.query.withRefresh(2), 25 | 26 | resource: 27 | var.query.new('resource', 'label_values(apiserver_request_duration_seconds_bucket, resource)') 28 | + var.query.withDatasourceFromVariable(self.Datasource) 29 | + var.query.selectionOptions.withMulti(false) 30 | + var.query.selectionOptions.withIncludeAll(true) 31 | + var.query.generalOptions.withLabel('resource') 32 | + var.query.withRefresh(2), 33 | 34 | code: 35 | var.query.new('code', 'label_values(code)') 36 | + var.query.withDatasourceFromVariable(self.Datasource) 37 | + var.query.selectionOptions.withMulti(false) 38 | + var.query.selectionOptions.withIncludeAll(true) 39 | + var.query.generalOptions.withLabel('code') 40 | + var.query.withRefresh(2), 41 | 42 | verb: 43 | var.query.new('verb', 'label_values(verb)') 44 | + var.query.withDatasourceFromVariable(self.Datasource) 45 | + var.query.selectionOptions.withMulti(false) 46 | + var.query.selectionOptions.withIncludeAll(true) 47 | + var.query.generalOptions.withLabel('verb') 48 | + var.query.withRefresh(2), 49 | 50 | flow_schema: 51 | var.query.new('flow_schema', 'label_values(flow_schema)') 52 | + var.query.withDatasourceFromVariable(self.Datasource) 53 | + var.query.selectionOptions.withMulti(false) 54 | + var.query.selectionOptions.withIncludeAll(true) 55 | + var.query.generalOptions.withLabel('flow-schema') 56 | + var.query.withRefresh(2), 57 | 58 | priority_level: 59 | var.query.new('priority_level', 'label_values(priority_level)') 60 | +
var.query.withDatasourceFromVariable(self.Datasource) 61 | + var.query.selectionOptions.withMulti(false) 62 | + var.query.selectionOptions.withIncludeAll(true) 63 | + var.query.generalOptions.withLabel('priority-level') 64 | + var.query.withRefresh(2), 65 | 66 | interval: 67 | var.interval.new('interval', ['1m', '5m']) 68 | + var.query.withDatasourceFromVariable(self.Datasource) 69 | + var.interval.generalOptions.withLabel('interval') 70 | + var.interval.withAutoOption(count=30, minInterval='10s') 71 | + var.query.withRefresh(2) 72 | + var.query.selectionOptions.withMulti(false) 73 | + var.query.selectionOptions.withIncludeAll(true), 74 | } 75 | -------------------------------------------------------------------------------- /assets/ycsb/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | /* Time-series and table panel builders for the ycsb dashboard. Unlike the Prometheus dashboards, each builder takes the Elasticsearch datasource uid as an explicit argument. */ 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | /* base: shared defaults (fill opacity 25, line width 2, visible legend, multi-series tooltip); the variants below override draw style, points and legend. */ 9 | base(title, datasource, unit, targets, gridPos): 10 | timeSeries.new(title) 11 | + timeSeries.queryOptions.withTargets(targets) 12 | + timeSeries.datasource.withType('elasticsearch') 13 | + timeSeries.datasource.withUid(datasource) 14 | + timeSeries.standardOptions.withUnit(unit) 15 | + timeSeries.gridPos.withX(gridPos.x) 16 | + timeSeries.gridPos.withY(gridPos.y) 17 | + timeSeries.gridPos.withH(gridPos.h) 18 | + timeSeries.gridPos.withW(gridPos.w) 19 | + custom.withSpanNulls(false) 20 | + custom.withFillOpacity(25) 21 | + options.tooltip.withMode('multi') 22 | + options.tooltip.withSort('none') 23 | + options.legend.withShowLegend(true) 24 | + custom.withLineWidth(2), 25 | /* overallThroughputPerYCSB: solid bars with a right-hand table legend showing the sum. */ 26 | overallThroughputPerYCSB(title, datasource, unit, targets, gridPos): 27 | self.base(title, datasource, unit, targets, gridPos) 28 | + options.legend.withDisplayMode('table') 29 | +
options.legend.withPlacement('right') 30 | + custom.withDrawStyle('bars') 31 | + custom.withFillOpacity(100) 32 | + custom.withPointSize(4) 33 | + custom.withShowPoints('never') 34 | + options.legend.withCalcs([ 35 | 'sum', 36 | ]), 37 | /* NOTE(review): 'Latancy' is misspelled, but the name is part of this library's interface and is left unchanged for callers. Solid bars with a bottom list legend. */ 38 | LatancyofEachWorkloadPerYCSBOperation(title, datasource, unit, targets, gridPos): 39 | self.base(title, datasource, unit, targets, gridPos) 40 | + options.legend.withDisplayMode('list') 41 | + options.legend.withPlacement('bottom') 42 | + custom.withFillOpacity(100) 43 | + custom.withPointSize(4) 44 | + custom.withDrawStyle('bars') 45 | + custom.withShowPoints('never'), 46 | /* latency90percReportedFromYCSB: points-only rendering with a bottom list legend and no calculations. */ 47 | latency90percReportedFromYCSB(title, datasource, unit, targets, gridPos): 48 | self.base(title, datasource, unit, targets, gridPos) 49 | + custom.withDrawStyle('points') 50 | + custom.withFillOpacity(20) 51 | + options.legend.withDisplayMode('list') 52 | + options.legend.withPlacement('bottom') 53 | + options.legend.withCalcs([]) 54 | + custom.withPointSize(4) 55 | + custom.withShowPoints('always'), 56 | /* throughputOvertimePhase: line chart with a bottom list legend and no calculations. */ 57 | throughputOvertimePhase(title, datasource, unit, targets, gridPos): 58 | self.base(title, datasource, unit, targets, gridPos) 59 | + custom.withDrawStyle('line') 60 | + custom.withFillOpacity(20) 61 | + custom.withPointSize(5) 62 | + options.legend.withDisplayMode('list') 63 | + options.legend.withPlacement('bottom') 64 | + options.legend.withCalcs([]) 65 | + custom.withShowPoints('never'), 66 | }, 67 | 68 | table: { 69 | local table = g.panel.table, 70 | local custom = table.fieldConfig.defaults.custom, 71 | local options = table.options, 72 | /* base: Elasticsearch table with a visible header and descending sort; takes no unit argument. */ 73 | base(title, datasource, targets, gridPos): 74 | table.new(title) 75 | + table.queryOptions.withTargets(targets) 76 | + table.datasource.withType('elasticsearch') 77 | + table.datasource.withUid(datasource) 78 | + table.gridPos.withX(gridPos.x) 79 | + table.gridPos.withY(gridPos.y) 80 | + table.gridPos.withH(gridPos.h) 81 | + table.gridPos.withW(gridPos.w) 82 | +
custom.cellOptions.TableSparklineCellOptions.withTransform('timeseries_to_columns') 83 | + options.withShowHeader(true) 84 | + options.sortBy.withDesc(true), 85 | }, 86 | } 87 | -------------------------------------------------------------------------------- /templates/CPT/ingress-perf.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/ingress-performance-ocp/panels.libsonnet'; 2 | local queries = import '../../assets/ingress-performance-ocp/queries.libsonnet'; 3 | local variables = import '../../assets/ingress-performance-ocp/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | /* Ingress-perf dashboard: three collapsed rows (SLIs by version, workloads summary, and a per-termination row repeated over the termination variable). Panel positions use Grafana's 24-column grid, so x + w must not exceed 24. */ 6 | g.dashboard.new('Ingress-perf') 7 | + g.dashboard.withDescription(||| 8 | Dashboard for Ingress Performance 9 | |||) 10 | + g.dashboard.withTags('ingress-perf') 11 | + g.dashboard.time.withFrom('now-12h') 12 | + g.dashboard.time.withTo('now') 13 | + g.dashboard.withTimezone('utc') 14 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 15 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 16 | + g.dashboard.withRefresh('') 17 | + g.dashboard.withEditable(false) 18 | + g.dashboard.graphTooltip.withSharedCrosshair() 19 | + g.dashboard.withVariables([ 20 | variables.Datasource, 21 | variables.platform, 22 | variables.clusterType, 23 | variables.workerNodesCount, 24 | variables.infraNodesType, 25 | variables.ocpMajorVersion, 26 | variables.uuid, 27 | variables.termination, 28 | variables.latency_metric, 29 | variables.compare_by, 30 | variables.all_uuids, 31 | ]) 32 | + g.dashboard.withPanels([ 33 | g.panel.row.new('SLIs - by Version') 34 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 35 | + g.panel.row.withCollapsed(true) 36 | + g.panel.row.withPanels([ 37 | panels.stat.withAvgThresholds('Average RPS - $termination', 'reqps',
queries.avgRPSAll.query(), { x: 0, y: 1, w: 6, h: 3 }), 38 | panels.stat.withAvgTimeThresholds('$latency_metric - $termination', 'µs', queries.avgTime.query(), { x: 0, y: 1, w: 6, h: 3 }), 39 | ]), 40 | g.panel.row.new('Workloads summary') 41 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 42 | + g.panel.row.withCollapsed(true) 43 | + g.panel.row.withPanels([ 44 | panels.table.withWorkloadSummary('', '', queries.workloadSummary.query(), { x: 0, y: 2, w: 24, h: 6 }), 45 | ]), 46 | g.panel.row.new('$termination') 47 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 48 | + g.panel.row.withCollapsed(true) 49 | + g.panel.row.withRepeat('termination') 50 | + g.panel.row.withPanels([ 51 | panels.timeSeries.withMeanReq('RPS $termination trend', 'reqps', queries.trendRPS.query(), { x: 0, y: 15, w: 12, h: 8 }), 52 | panels.timeSeries.withMeanReq('$latency_metric trend', 'µs', queries.latencyTrend.query(), { x: 12, y: 15, w: 12, h: 8 }), 53 | panels.bargauge.withAvgTimeThresholds('RPS $termination', 'reqps', queries.terminationRPS.query(), { x: 0, y: 23, w: 12, h: 7 }), 54 | panels.bargauge.withAvgTimeThresholds('$latency_metric $termination', 'µs', queries.latencyTermination.query(), { x: 12, y: 23, w: 12, h: 7 }), 55 | panels.bargauge.withAvgTimeThresholds('HAProxy avg CPU usage $termination', 'percent', queries.HAProxyAvgCPUUsage.query(), { x: 0, y: 30, w: 12, h: 7 }), 56 | panels.bargauge.withAvgTimeThresholds('Infra nodes CPU usage $termination', 'percent', queries.InfraNodesCPUUsageEdge.query(), { x: 12, y: 30, w: 12, h: 7 }), 57 | panels.gauge.withAvgTimeThresholds('RPS data quality', 'none', queries.qualityRPS.query(), { x: 0, y: 30, w: 12, h: 4 }), 58 | panels.gauge.withAvgTimeThresholds('Data quality: $latency_metric', 'none', queries.dataQuality.query(), { x: 12, y: 30, w: 12, h: 4 }), 59 | panels.table.withTerminationRawData('$termination raw data', 'short', queries.rawData.query(), { x: 0, y: 118, w: 24, h: 8 }), 60 | ]), 61 | ]) 62 |
-------------------------------------------------------------------------------- /templates/CPT/k8s-netperf.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/k8s-netperf/panels.libsonnet'; 2 | local queries = import '../../assets/k8s-netperf/queries.libsonnet'; 3 | local variables = import '../../assets/k8s-netperf/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | /* k8s-netperf dashboard: default range now-6h to now, UTC timezone, editable, shared-crosshair tooltip; variables come from variables.libsonnet and panels from panels.libsonnet. */ 6 | g.dashboard.new('k8s-netperf') 7 | + g.dashboard.time.withFrom('now-6h') 8 | + g.dashboard.time.withTo('now') 9 | + g.dashboard.withTimezone('utc') 10 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 11 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 12 | + g.dashboard.withRefresh('') 13 | + g.dashboard.withEditable(true) 14 | + g.dashboard.graphTooltip.withSharedCrosshair() 15 | + g.dashboard.withVariables([ 16 | variables.Datasource, 17 | variables.platform, 18 | variables.workers, 19 | variables.uuid, 20 | variables.hostNetwork, 21 | variables.service, 22 | variables.streams, 23 | variables.throughput_profile, 24 | variables.latency_profile, 25 | variables.messageSize, 26 | variables.driver, 27 | variables.compare_by, 28 | ]) 29 | + g.dashboard.withPanels([ 30 | panels.row.base('Workload Summary', '', { x: 0, y: 0, w: 24, h: 0 }), 31 | panels.table.workloadSummary('', queries.summary.query('$throughput_profile', 'throughput'), { x: 0, y: 0, w: 24, h: 11 }), 32 | panels.row.base('$latency_profile', 'latency_profile', { x: 0, y: 0, w: 24, h: 1 }), 33 | panels.timeSeries.base('$latency_profile - $driver - $messageSize', queries.all.query('$latency_profile', 'latency'), { x: 0, y: 0, w: 24, h: 8 }), 34 | panels.row.base('$throughput_profile', 'throughput_profile', { x: 0, y: 9, w: 24, h: 1 }), 35 | 
panels.timeSeries.withThroughputOverrides('$throughput_profile - $driver - $messageSize', queries.all.query('$throughput_profile', 'throughput'), { x: 0, y: 10, w: 24, h: 8 }), 36 | panels.row.base('Parallelism $parallelism', 'parallelism', { x: 0, y: 18, w: 24, h: 1 }), 37 | panels.table.base('Throughput - Parallelism: $parallelism', queries.parallelismAll.query('$throughput_profile', 'throughput'), { x: 0, y: 19, w: 24, h: 11 }), 38 | panels.table.withLatencyOverrides('Latency - Parallelism: $parallelism', queries.parallelismAll.query('$latency_profile', 'latency'), { x: 0, y: 19, w: 24, h: 11 }), 39 | /* Node to Node: metricCompare is called with (true, false) as its last two arguments — presumably hostNetwork=true, service=false; confirm in queries.libsonnet. NOTE(review): in each section below both gauges use x: 0, w: 11, so their grid positions overlap — confirm whether side-by-side placement was intended. */ 40 | panels.row.base('Node to Node', '', { x: 0, y: 20, w: 24, h: 1 }), 41 | panels.barGauge.withThroughput('$throughput_profile - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', true, false), { x: 0, y: 21, w: 11, h: 11 }), 42 | panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', true, false), { x: 0, y: 21, w: 11, h: 11 }), 43 | /* Pod to Pod: metricCompare is called with (false, false). */ 44 | panels.row.base('Pod to Pod', '', { x: 0, y: 22, w: 24, h: 1 }), 45 | panels.barGauge.withThroughput('$throughput_profile - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', false, false), { x: 0, y: 23, w: 11, h: 11 }), 46 | panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', false, false), { x: 0, y: 23, w: 11, h: 11 }), 47 | /* Pod to Pod via Service: metricCompare is called with (false, true). */ 48 | panels.row.base('Pod to Pod via Service', '', { x: 0, y: 24, w: 24, h: 1 }), 49 | panels.barGauge.withThroughput('$throughput_profile - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', false, true), { x: 0, y: 25, w: 11, h: 11 }), 50 | panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', false, true), { x: 0, y: 25, w: 11, h: 11 }), 51 | ]) 52 | --------------------------------------------------------------------------------
/assets/ovn-monitoring/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | { 3 | stat: { 4 | local stat = g.panel.stat, 5 | local options = stat.options, 6 | /* base: stat panel bound to the '$Datasource' Prometheus datasource, built from the given title, unit, targets and gridPos ({x, y, w, h}). */ 7 | base(title, unit, targets, gridPos): 8 | stat.new(title) 9 | + stat.datasource.withType('prometheus') 10 | + stat.datasource.withUid('$Datasource') 11 | + stat.standardOptions.withUnit(unit) 12 | + stat.queryOptions.withTargets(targets) 13 | + stat.gridPos.withX(gridPos.x) 14 | + stat.gridPos.withY(gridPos.y) 15 | + stat.gridPos.withH(gridPos.h) 16 | + stat.gridPos.withW(gridPos.w) 17 | + options.withJustifyMode('auto') 18 | + options.withGraphMode('area') 19 | + options.text.withTitleSize(12) 20 | + stat.standardOptions.color.withMode('thresholds') 21 | /* colorMode is set exactly once; 'value' is the mode that takes effect for every panel derived from base */ 22 | + options.withColorMode('value'), 23 | /* genericstatLegendPanel: base panel reduced with the 'last' calculation. */ 24 | genericstatLegendPanel(title, unit, targets, gridPos): 25 | self.base(title, unit, targets, gridPos) 26 | + stat.options.reduceOptions.withCalcs([ 27 | 'last', 28 | ]), 29 | /* genericstatThresoldPanel: legend panel with green / orange (from 0) / green (from 1) threshold steps and text mode 'name'. */ 30 | genericstatThresoldPanel(title, unit, targets, gridPos): 31 | self.genericstatLegendPanel(title, unit, targets, gridPos) 32 | + stat.standardOptions.thresholds.withSteps([ 33 | { 34 | color: 'green', 35 | value: null, 36 | }, 37 | { 38 | color: 'orange', 39 | value: 0, 40 | }, 41 | { 42 | color: 'green', 43 | value: 1, 44 | }, 45 | ]) 46 | + options.withTextMode('name'), 47 | /* genericstatThresoldOVNControllerPanel: legend panel with a single green base threshold and text mode 'auto'. */ 48 | genericstatThresoldOVNControllerPanel(title, unit, targets, gridPos): 49 | self.genericstatLegendPanel(title, unit, targets, gridPos) 50 | + stat.standardOptions.thresholds.withSteps([ 51 | { 52 | color: 'green', 53 | value: null, 54 | }, 55 | ]) 56 | + options.withTextMode('auto'), 57 | }, 58 | /* timeSeries: panel builders based on g.panel.timeSeries. */ 59 | timeSeries: { 60 | local timeSeries = g.panel.timeSeries, 61 | local custom = timeSeries.fieldConfig.defaults.custom, 62 | local options = timeSeries.options, 63 | 64 | base(title, unit, 
targets, gridPos): 65 | timeSeries.new(title) 66 | + timeSeries.queryOptions.withTargets(targets) 67 | + timeSeries.datasource.withType('prometheus') 68 | + timeSeries.datasource.withUid('$Datasource') 69 | + timeSeries.standardOptions.withUnit(unit) 70 | + timeSeries.gridPos.withX(gridPos.x) 71 | + timeSeries.gridPos.withY(gridPos.y) 72 | + timeSeries.gridPos.withH(gridPos.h) 73 | + timeSeries.gridPos.withW(gridPos.w) 74 | + custom.withDrawStyle('line') 75 | + custom.withLineInterpolation('linear') 76 | + custom.withBarAlignment(0) 77 | + custom.withLineWidth(1) 78 | + custom.withFillOpacity(10) 79 | + custom.withGradientMode('none') 80 | + custom.withSpanNulls(false) 81 | + custom.withPointSize(5) 82 | + custom.withSpanNulls(false) 83 | + custom.stacking.withMode('none') 84 | + custom.withShowPoints('never') 85 | + options.tooltip.withMode('multi') 86 | + options.tooltip.withSort('desc') 87 | + options.legend.withShowLegend(true) 88 | + options.legend.withPlacement('bottom'), 89 | 90 | genericTimeSeriesLegendPanel(title, unit, targets, gridPos): 91 | self.base(title, unit, targets, gridPos) 92 | + options.legend.withCalcs([ 93 | 'mean', 94 | 'max', 95 | ]) 96 | + options.legend.withDisplayMode('table') 97 | + { 98 | options+: { 99 | legend+: { 100 | sortBy: 'Max', 101 | sortDesc: true, 102 | }, 103 | }, 104 | }, 105 | 106 | 107 | }, 108 | } 109 | -------------------------------------------------------------------------------- /assets/kube-burner-report-ocp-wrapper/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | /* Template variables; each query variable below reads from the Elasticsearch datasource selected by Datasource and filters on the variables defined before it. */ 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'elasticsearch') 7 | + var.datasource.withRegex('/.*kube-burner.*/') 8 | + var.query.generalOptions.withLabel('Datasource'), 9 | 10 | platform: 11 | var.query.new('platform', '{"find": "terms", "field": 
"platform.keyword"}') 12 | + var.query.withDatasourceFromVariable(self.Datasource) 13 | + var.query.withRefresh(2) 14 | + var.query.selectionOptions.withMulti() 15 | + var.query.selectionOptions.withIncludeAll(false) 16 | + var.query.generalOptions.withLabel('Platform'), 17 | 18 | sdn: 19 | var.query.new('sdn', '{"find": "terms", "field": "sdnType.keyword", "query": "platform.keyword: $platform"}') 20 | + var.query.withDatasourceFromVariable(self.Datasource) 21 | + var.query.withRefresh(1) 22 | + var.query.selectionOptions.withMulti() 23 | + var.query.selectionOptions.withIncludeAll(false) 24 | + var.query.generalOptions.withLabel('SDN type'), 25 | /* job: the whole Lucene filter, including the garbage-collection exclusion, must sit inside the JSON "query" string so the variable query stays valid JSON. */ 26 | job: 27 | var.query.new('job', '{"find": "terms", "field": "jobConfig.name.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND NOT jobConfig.name.keyword: garbage-collection"}') 28 | + var.query.withDatasourceFromVariable(self.Datasource) 29 | + var.query.withRefresh(1) 30 | + var.query.selectionOptions.withMulti(false) 31 | + var.query.selectionOptions.withIncludeAll(false) 32 | + var.query.generalOptions.withLabel('Job'), 33 | 34 | nodes: 35 | var.query.new('nodes', '{"find": "terms", "field": "totalNodes", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job"}') 36 | + var.query.withDatasourceFromVariable(self.Datasource) 37 | + var.query.withRefresh(1) 38 | + var.query.selectionOptions.withMulti(false) 39 | + var.query.selectionOptions.withIncludeAll(false) 40 | + var.query.generalOptions.withLabel('nodes'), 41 | 42 | uuid: 43 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND totalNodes: $nodes"}') 44 | + var.query.withDatasourceFromVariable(self.Datasource) 45 | + var.query.withRefresh(2) 46 | + var.query.selectionOptions.withMulti(false) 47 | + var.query.selectionOptions.withIncludeAll(false) 48 | + 
var.query.generalOptions.withLabel('UUID'), 49 | 50 | master: 51 | var.query.new('master', '{ "find" : "terms", "field": "labels.node.keyword", "query": "metricName.keyword: nodeRoles AND labels.role.keyword: master AND uuid.keyword: $uuid"}') 52 | + var.query.withDatasourceFromVariable(self.Datasource) 53 | + var.query.withRefresh(2) 54 | + var.query.selectionOptions.withMulti(true) 55 | + var.query.selectionOptions.withIncludeAll(false) 56 | + var.query.generalOptions.withLabel('Master nodes'), 57 | 58 | worker: 59 | var.query.new('worker', '{ "find" : "terms", "field": "labels.node.keyword", "query": "metricName.keyword: nodeRoles AND labels.role.keyword: worker AND uuid.keyword: $uuid"}') 60 | + var.query.withDatasourceFromVariable(self.Datasource) 61 | + var.query.withRefresh(2) 62 | + var.query.selectionOptions.withMulti(true) 63 | + var.query.selectionOptions.withIncludeAll(false) 64 | + var.query.generalOptions.withLabel('Worker nodes'), 65 | 66 | infra: 67 | var.query.new('infra', '{ "find" : "terms", "field": "labels.node.keyword", "query": "metricName.keyword: nodeRoles AND labels.role.keyword: infra AND uuid.keyword: $uuid"}') 68 | + var.query.withDatasourceFromVariable(self.Datasource) 69 | + var.query.withRefresh(2) 70 | + var.query.selectionOptions.withMulti(true) 71 | + var.query.selectionOptions.withIncludeAll(false) 72 | + var.query.generalOptions.withLabel('Infra nodes'), 73 | 74 | latencyPercentile: 75 | var.custom.new('latencyPercentile', ['P99', 'P95', 'P50'],) 76 | + var.custom.generalOptions.withLabel('Latency percentile'), 77 | } 78 | -------------------------------------------------------------------------------- /templates/General/api-performance-overview.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/api-performance-overview/panels.libsonnet'; 2 | local queries = import '../../assets/api-performance-overview/queries.libsonnet'; 3 | local variables = import 
'../../assets/api-performance-overview/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('API Performance Dashboard') 7 | + g.dashboard.withDescription(||| 8 | Dashboard for Api-performance-overview 9 | |||) 10 | + g.dashboard.withTags('Api-performance') 11 | + g.dashboard.time.withFrom('now-1h') 12 | + g.dashboard.time.withTo('now') 13 | + g.dashboard.withTimezone('utc') 14 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 15 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 16 | + g.dashboard.withRefresh('30s') 17 | + g.dashboard.withEditable(false) 18 | + g.dashboard.graphTooltip.withSharedCrosshair() 19 | + g.dashboard.withVariables([ 20 | variables.Datasource, 21 | variables.apiserver, 22 | variables.instance, 23 | variables.resource, 24 | variables.code, 25 | variables.verb, 26 | variables.flow_schema, 27 | variables.priority_level, 28 | variables.interval, 29 | ]) 30 | + g.dashboard.withPanels([ 31 | panels.timeSeries.legendRightPlacement('request duration - 99th quantile', 's', queries.request_duration_99th_quantile.query(), { x: 0, y: 0, w: 12, h: 8 }), 32 | panels.timeSeries.legendRightPlacement('request rate - by instance', 'short', queries.requestRateByInstance.query(), { x: 12, y: 0, w: 12, h: 8 }), 33 | panels.timeSeries.legendRightPlacement('request duration - 99th quantile - by resource', 's', queries.requestDuarationByResource.query(), { x: 0, y: 8, w: 12, h: 8 }), 34 | panels.timeSeries.legendRightPlacement('request rate - by resource', 'short', queries.requestRateByResource.query(), { x: 12, y: 8, w: 12, h: 8 }), 35 | panels.timeSeries.legendBottomPlacement('request duration - read vs write', 's', queries.requestDurationReadWrite.query(), { x: 0, y: 16, w: 12, h: 8 }), 36 | panels.timeSeries.legendBottomPlacement('request rate - read vs write', 
'short', queries.requestRateReadWrite.query(), { x: 12, y: 16, w: 12, h: 8 }), 37 | panels.timeSeries.legendBottomPlacement('requests dropped rate', 'short', queries.requestRateDropped.query(), { x: 0, y: 24, w: 12, h: 8 }), 38 | panels.timeSeries.legendBottomPlacement('requests terminated rate', 'short', queries.requestRateTerminated.query(), { x: 12, y: 24, w: 12, h: 8 }), 39 | panels.timeSeries.legendRightPlacement('requests status rate', 'short', queries.requestRateStatus.query(), { x: 0, y: 32, w: 12, h: 8 }), 40 | panels.timeSeries.legendRightPlacement('long running requests', 'short', queries.requestsLongRunning.query(), { x: 12, y: 32, w: 12, h: 8 }), 41 | panels.timeSeries.legendRightPlacement('request in flight', 'short', queries.requestInFlight.query(), { x: 0, y: 40, w: 12, h: 8 }), 42 | panels.timeSeries.legendRightPlacement('response size - 99th quantile', 'bytes', queries.responseSize99Quatile.query(), { x: 12, y: 40, w: 12, h: 8 }), 43 | panels.timeSeries.legendRightPlacement('p&f - request queue length', 'short', queries.requestQueueLengthPandF.query(), { x: 0, y: 48, w: 12, h: 8 }), 44 | panels.timeSeries.withRequestWaitDurationAggregations('p&f - request wait duration - 99th quantile', 's', queries.requestWaitDuration99QuatilePandF.query(), { x: 12, y: 48, w: 12, h: 8 }), 45 | panels.timeSeries.legendRightPlacement('p&f - request dispatch rate', 'short', queries.requestDispatchRatePandF.query(), { x: 0, y: 64, w: 12, h: 8 }), 46 | panels.timeSeries.legendRightPlacement('p&f - request execution duration', 's', queries.requestExecutionDurationPandF.query(), { x: 12, y: 64, w: 12, h: 8 }), 47 | panels.timeSeries.legendRightPlacement('p&f - pending in queue', 'short', queries.pendingInQueuePandF.query(), { x: 0, y: 72, w: 12, h: 8 }), 48 | panels.timeSeries.legendRightPlacement('p&f - concurrency limit by kube-apiserver', 'short', queries.concurrencyLimitByKubeapiserverPandF.query(), { x: 12, y: 72, w: 12, h: 8 }), 49 | ]) 50 | 
-------------------------------------------------------------------------------- /assets/ingress-performance-ocp/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'elasticsearch') 7 | + var.datasource.withRegex('/.*Ingress.*/') 8 | + var.query.generalOptions.withLabel('Datasource'), 9 | 10 | platform: 11 | var.query.new('platform', '{"find": "terms", "field": "platform.keyword"}') 12 | + var.query.withDatasourceFromVariable(self.Datasource) 13 | + var.query.withRefresh(2) 14 | + var.query.selectionOptions.withMulti() 15 | + var.query.selectionOptions.withIncludeAll(false) 16 | + var.query.generalOptions.withLabel('Platform'), 17 | clusterType: 18 | var.query.new('clusterType', '{"find": "terms", "field": "clusterType.keyword", "query": "platform.keyword: $platform"}') 19 | + var.query.withDatasourceFromVariable(self.Datasource) 20 | + var.query.withRefresh(2) 21 | + var.query.selectionOptions.withMulti(true) 22 | + var.query.selectionOptions.withIncludeAll(false) 23 | + var.query.generalOptions.withLabel('Cluster Type'), 24 | 25 | 26 | workerNodesCount: 27 | var.query.new('workerNodesCount', '{"find": "terms", "field": "workerNodesCount", "query": "platform.keyword: $platform AND clusterType.keyword: $clusterType"}') 28 | + var.query.withDatasourceFromVariable(self.Datasource) 29 | + var.query.withRefresh(2) 30 | + var.query.selectionOptions.withMulti(true) 31 | + var.query.selectionOptions.withIncludeAll(false) 32 | + var.query.generalOptions.withLabel('Worker Nodes Count'), 33 | 34 | infraNodesType: 35 | var.query.new('infraNodesType', '{"find": "terms", "field": "infraNodesType.keyword", "query": "platform.keyword: $platform AND workerNodesCount: $workerNodesCount AND clusterType.keyword: $clusterType"}') 36 | + 
var.query.withDatasourceFromVariable(self.Datasource) 37 | + var.query.withRefresh(2) 38 | + var.query.selectionOptions.withMulti(true) 39 | + var.query.selectionOptions.withIncludeAll(false) 40 | + var.query.generalOptions.withLabel('Infra Nodes Type'), 41 | 42 | ocpMajorVersion: 43 | var.query.new('ocpMajorVersion', '{"find": "terms", "field": "ocpMajorVersion.keyword", "query": "platform.keyword: $platform AND infraNodesType.keyword: $infraNodesType AND workerNodesCount: $workerNodesCount AND clusterType.keyword: $clusterType"}') 44 | + var.query.withDatasourceFromVariable(self.Datasource) 45 | + var.query.withRefresh(2) 46 | + var.query.selectionOptions.withMulti(true) 47 | + var.query.selectionOptions.withIncludeAll(false) 48 | + var.query.generalOptions.withLabel('Major Version'), 49 | 50 | uuid: 51 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND infraNodesType.keyword: $infraNodesType AND workerNodesCount: $workerNodesCount AND clusterType.keyword: $clusterType AND ocpMajorVersion.keyword: $ocpMajorVersion"}') 52 | + var.query.withDatasourceFromVariable(self.Datasource) 53 | + var.query.withRefresh(2) 54 | + var.query.selectionOptions.withMulti(true) 55 | + var.query.selectionOptions.withIncludeAll(false) 56 | + var.query.generalOptions.withLabel('UUID'), 57 | 58 | termination: 59 | var.query.new('termination', '{"find": "terms", "field": "config.termination.keyword"}') 60 | + var.query.withDatasourceFromVariable(self.Datasource) 61 | + var.query.withRefresh(2) 62 | + var.query.selectionOptions.withMulti(true) 63 | + var.query.selectionOptions.withIncludeAll(true) 64 | + var.query.generalOptions.withLabel('Termination'), 65 | 66 | latency_metric: 67 | var.custom.new('latency_metric', ['avg_lat_us', 'max_lat_us', 'p99_lat_us', 'p95_lat_us', 'p90_lat_us'],) 68 | + var.custom.generalOptions.withLabel('Latency Metric'), 69 | 70 | compare_by: 71 | var.custom.new('compare_by', ['uuid.keyword', 
'ocpVersion.keyword', 'ocpMajorVersion.keyword', 'clusterName.keyword', 'haproxyVersion.keyword'],) 72 | + var.custom.generalOptions.withLabel('Compare By'), 73 | 74 | all_uuids: 75 | var.query.new('all_uuids', '{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND infraNodesType.keyword: $infraNodesType AND workerNodesCount: $workerNodesCount AND clusterType.keyword: $clusterType AND ocpMajorVersion.keyword: $ocpMajorVersion"}') 76 | + var.query.withDatasourceFromVariable(self.Datasource) 77 | + var.query.withRefresh(2) 78 | + var.query.selectionOptions.withMulti(false) 79 | + var.query.selectionOptions.withIncludeAll(true), 80 | } 81 | -------------------------------------------------------------------------------- /assets/k8s-netperf/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'elasticsearch') 7 | + var.datasource.withRegex('/(.*netperf.*)/') 8 | + var.query.generalOptions.withLabel('Datasource') 9 | + var.query.selectionOptions.withMulti(false) 10 | + var.query.withRefresh(1) 11 | + var.query.selectionOptions.withIncludeAll(false), 12 | 13 | platform: 14 | var.query.new('platform', '{"find": "terms", "field": "metadata.platform.keyword"}') 15 | + var.query.withDatasourceFromVariable(self.Datasource) 16 | + var.query.withRefresh(2) 17 | + var.query.selectionOptions.withMulti(true) 18 | + var.query.selectionOptions.withIncludeAll(true) 19 | + var.query.generalOptions.withLabel('Platform'), 20 | 21 | workers: 22 | var.query.new('workerNodesType', '{"find": "terms", "field": "metadata.workerNodesType.keyword", "query": "metadata.platform.keyword: $platform"}') 23 | + var.query.withDatasourceFromVariable(self.Datasource) 24 | + var.query.withRefresh(2) 25 | + 
var.query.selectionOptions.withMulti(false) 26 | + var.query.selectionOptions.withIncludeAll(true) 27 | + var.query.generalOptions.withLabel('workers'), 28 | 29 | uuid: 30 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword", "query":"metadata.platform.keyword: $platform AND metadata.workerNodesType.keyword: $workerNodesType"}') 31 | + var.query.withDatasourceFromVariable(self.Datasource) 32 | + var.query.withRefresh(2) 33 | + var.query.selectionOptions.withMulti(true) 34 | + var.query.selectionOptions.withIncludeAll(true) 35 | + var.query.generalOptions.withLabel('uuid'), 36 | 37 | hostNetwork: 38 | var.custom.new('hostNetwork', ['true', 'false'],) 39 | + var.custom.selectionOptions.withMulti(true) 40 | + var.custom.selectionOptions.withIncludeAll(false) 41 | + var.custom.generalOptions.withLabel('hostNetwork'), 42 | 43 | service: 44 | var.custom.new('service', ['true', 'false'],) 45 | + var.custom.selectionOptions.withMulti(true) 46 | + var.custom.selectionOptions.withIncludeAll(true) 47 | + var.custom.generalOptions.withLabel('service'), 48 | 49 | streams: 50 | var.query.new('parallelism', '{"find": "terms", "field": "parallelism", "query":"uuid: $uuid"}') 51 | + var.query.withDatasourceFromVariable(self.Datasource) 52 | + var.query.withRefresh(2) 53 | + var.query.selectionOptions.withMulti(true) 54 | + var.query.selectionOptions.withIncludeAll(true) 55 | + var.query.generalOptions.withLabel('streams'), 56 | 57 | throughput_profile: 58 | var.query.new('throughput_profile', '{"find": "terms", "field": "profile.keyword", "query":"uuid:$uuid"}') 59 | + var.query.withDatasourceFromVariable(self.Datasource) 60 | + var.query.withRegex('.*STREAM.*') 61 | + var.query.withRefresh(2) 62 | + var.query.selectionOptions.withMulti(true) 63 | + var.query.selectionOptions.withIncludeAll(true) 64 | + var.query.generalOptions.withLabel('Throughput profile'), 65 | 66 | latency_profile: 67 | var.query.new('latency_profile', '{"find": "terms", "field": 
"profile.keyword", "query":"uuid:$uuid"}') 68 | + var.query.withDatasourceFromVariable(self.Datasource) 69 | + var.query.withRegex('.*RR.*') 70 | + var.query.withRefresh(2) 71 | + var.query.selectionOptions.withMulti(true) 72 | + var.query.selectionOptions.withIncludeAll(true) 73 | + var.query.generalOptions.withLabel('Latency profile'), 74 | 75 | messageSize: 76 | var.query.new('messageSize', '{"find": "terms", "field": "messageSize","query":"uuid:$uuid"}') 77 | + var.query.withDatasourceFromVariable(self.Datasource) 78 | + var.query.withRefresh(2) 79 | + var.query.selectionOptions.withMulti(true) 80 | + var.query.selectionOptions.withIncludeAll(true) 81 | + var.query.generalOptions.withLabel('messageSize'), 82 | 83 | driver: 84 | var.query.new('driver', '{"find": "terms", "field": "driver.keyword","query":"uuid:$uuid"}') 85 | + var.query.withDatasourceFromVariable(self.Datasource) 86 | + var.query.withRefresh(1) 87 | + var.query.selectionOptions.withMulti(false) 88 | + var.query.selectionOptions.withIncludeAll(false) 89 | + var.query.generalOptions.withLabel('Driver'), 90 | 91 | compare_by: 92 | var.custom.new('compare_by', ['uuid.keyword', 'metadata.ocpVersion.keyword', 'metadata.clusterName.keyword', 'metadata.ocpShortVersion.keyword', 'metadata.platform.keyword'],) 93 | + var.custom.selectionOptions.withMulti(false) 94 | + var.custom.selectionOptions.withIncludeAll(false) 95 | + var.custom.generalOptions.withLabel('Compare By'), 96 | } 97 | -------------------------------------------------------------------------------- /assets/kube-burner-report-mode/variables.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local var = g.dashboard.variable; 3 | 4 | { 5 | Datasource: 6 | var.datasource.new('Datasource', 'elasticsearch') 7 | + var.datasource.withRegex('/.*kube-burner.*/') 8 | + var.query.withRefresh(1) 9 | + 
var.query.selectionOptions.withIncludeAll(false) 10 | + var.query.selectionOptions.withMulti(false), 11 | 12 | platform: 13 | var.query.new('platform', '{"find": "terms", "field": "platform.keyword"}') 14 | + var.query.withDatasourceFromVariable(self.Datasource) 15 | + var.query.withRefresh(2) 16 | + var.query.selectionOptions.withMulti(true) 17 | + var.query.selectionOptions.withIncludeAll(false) 18 | + var.query.generalOptions.withLabel('Platform'), 19 | 20 | sdn: 21 | var.query.new('sdn', '{"find": "terms", "field": "sdnType.keyword", "query": "platform.keyword: $platform"}') 22 | + var.query.withDatasourceFromVariable(self.Datasource) 23 | + var.query.withRefresh(1) 24 | + var.query.selectionOptions.withMulti(true) 25 | + var.query.selectionOptions.withIncludeAll(false) 26 | + var.query.generalOptions.withLabel('SDN type'), 27 | 28 | clusterType: 29 | var.query.new('clusterType', '{"find": "terms", "field": "clusterType.keyword", "query": "platform.keyword: $platform"}') 30 | + var.query.withDatasourceFromVariable(self.Datasource) 31 | + var.query.withRefresh(1) 32 | + var.query.selectionOptions.withMulti(true) 33 | + var.query.selectionOptions.withIncludeAll(true) 34 | + var.query.generalOptions.withLabel('Cluster Type'), 35 | 36 | job: 37 | var.query.new('job', '{"find": "terms", "field": "jobConfig.name.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND clusterType.keyword: $clusterType AND NOT jobConfig.name.keyword: garbage-collection"}') 38 | + var.query.withDatasourceFromVariable(self.Datasource) 39 | + var.query.withRefresh(1) 40 | + var.query.selectionOptions.withMulti(false) 41 | + var.query.selectionOptions.withIncludeAll(false) 42 | + var.query.generalOptions.withLabel('Job'), 43 | 44 | workerNodesCount: 45 | var.query.new('workerNodesCount', '{"find": "terms", "field": "workerNodesCount", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND clusterType.keyword: 
$clusterType"}') 46 | + var.query.withDatasourceFromVariable(self.Datasource) 47 | + var.query.withRefresh(1) 48 | + var.query.selectionOptions.withMulti(false) 49 | + var.query.selectionOptions.withIncludeAll(false) 50 | + var.query.generalOptions.withLabel('Workers'), 51 | 52 | ocpMajorVersion: 53 | var.query.new('ocpMajorVersion', '{"find": "terms", "field": "ocpMajorVersion.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND workerNodesCount: $workerNodesCount AND clusterType.keyword: $clusterType"}') 54 | + var.query.withDatasourceFromVariable(self.Datasource) 55 | + var.query.withRefresh(1) 56 | + var.query.selectionOptions.withMulti(true) 57 | + var.query.selectionOptions.withIncludeAll(false) 58 | + var.query.generalOptions.withLabel('OCP Major'), 59 | 60 | uuid: 61 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND workerNodesCount: $workerNodesCount AND ocpMajorVersion.keyword: $ocpMajorVersion AND clusterType.keyword: $clusterType"}') 62 | + var.query.withDatasourceFromVariable(self.Datasource) 63 | + var.query.withRefresh(1) 64 | + var.query.selectionOptions.withMulti(true) 65 | + var.query.selectionOptions.withIncludeAll(false) 66 | + var.query.generalOptions.withLabel('UUID'), 67 | /* compare_by: single-select list of bare field names (no leading dot). NOTE(review): presumably interpolated into queries as an Elasticsearch field name — verify against queries.libsonnet. */ 68 | compare_by: 69 | var.custom.new('compare_by', ['uuid', 'ocpVersion', 'ocpMajorVersion']) 70 | + var.custom.generalOptions.withLabel('Compare by') 71 | + var.custom.selectionOptions.withIncludeAll(false) 72 | + var.custom.selectionOptions.withMulti(false), 73 | 74 | component: 75 | var.custom.new('component', ['crio', 'kube-apiserver', 'kube-controller-manager', 'kubelet', 'multus', 'openshift-apiserver', 'openshift-controller-manager', 'ovn-control-plane', 'ovnkube-node', 'prometheus', 'router']) 76 | + var.custom.generalOptions.withLabel('Component') 77 | + 
var.custom.selectionOptions.withIncludeAll(true) 78 | + var.custom.selectionOptions.withMulti(true), 79 | 80 | node_roles: 81 | var.custom.new('node_roles', ['masters', 'workers', 'infra']) 82 | + var.custom.generalOptions.withLabel('Node roles') 83 | + var.custom.selectionOptions.withIncludeAll(false) 84 | + var.custom.selectionOptions.withMulti(true), 85 | } 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Jsonnet CI](https://github.com/cloud-bulldozer/performance-dashboards/workflows/Jsonnet%20CI/badge.svg?branch=master) 2 | # Performance dashboards 3 | 4 | ## Jsonnet grafana dashboards 5 | 6 | Managing grafana dashboards in a version control system (VCS) is not an easy task, since the dashboards exported by Grafana do not always have the same JSON layout due to the nature of the JSON format itself. 7 | When exporting a Grafana dashboard, JSON keys may be exported in a different order. In addition, dealing with such complex JSON files is not an easy task since it's usually required to import & export the full dashboard to perform a minimal modification or update. 8 | 9 | Jsonnet based dashboards is an effort to improve the manageability of grafana JSON dashboards by leveraging the libraries included in the project [grafonnet-lib](https://github.com/grafana/grafonnet-lib). However, grafonnet-lib had a hard time keeping up with Grafana development, which resulted in it being under-maintained. The [Grafonnet](https://github.com/grafana/grafonnet) jsonnet library is instead generated from JSON Schemas produced by [Grok](https://github.com/grafana/grok); these schemas are generated directly from the Grafana repository to ensure Grafonnet can keep up with Grafana development. Using this mechanism of dashboards as code will improve versioning and simplify collaboration. 
10 | 11 | ## How to 12 | 13 | To make a change, simply update the desired .jsonnet dashboard file and push your changes. The project is configured to automatically render updated .jsonnet files into .json format. 14 | 15 | Alternatively, you can render the jsonnet files manually by doing the following: 16 | 17 | Rendering a jsonnet file is as simple as executing `jsonnet path/to/dashboard.jsonnet`. The jsonnet binary is not included in this repo, though binary builds can be found in its official [repository](https://github.com/google/jsonnet/releases). 18 | A makefile has been included to automate jsonnet formatting and rendering tasks. Executing `make` downloads the jsonnet binary and renders the templates into the *rendered* directory. 19 | 20 | ``` 21 | $ make 22 | mkdir -p bin rendered 23 | Downloading jsonnet binary 24 | curl -s -L https://github.com/google/go-jsonnet/releases/download/v0.20.0/go-jsonnet_0.20.0_Linux_x86_64.tar.gz | tar xz -C bin 25 | Downloading jb binary 26 | curl -s -L https://github.com/jsonnet-bundler/jsonnet-bundler/releases/latest/download/jb-linux-amd64 -o bin/jb 27 | chmod +x bin/jb 28 | Downloading vendor files 29 | cd templates && ../bin/jb install && cd ../ 30 | GET https://github.com/grafana/grafonnet/archive/f40876da40d787e9c288de0b547ac85597c781d9.tar.gz 200 31 | GET https://github.com/grafana/grafonnet/archive/f40876da40d787e9c288de0b547ac85597c781d9.tar.gz 200 32 | GET https://github.com/jsonnet-libs/docsonnet/archive/cc9df63eaca56f39e8e4e1ce192141333257b08d.tar.gz 200 33 | GET https://github.com/jsonnet-libs/xtd/archive/0256a910ac71f0f842696d7bca0bf01ea77eb654.tar.gz 200 34 | bin/jsonnetfmt -i templates/General/ocp-performance-v2.jsonnet 35 | Building template templates/General/ocp-performance-v2.jsonnet 36 | mkdir -p rendered/General/ 37 | bin/jsonnet -J ./templates/vendor templates/General/ocp-performance-v2.jsonnet > rendered/General/ocp-performance-v2.json 38 | ``` 39 | In order to clean up the environment, execute `make clean`. 
40 | 41 | In order to lint the templates using `jsonnetfmt`, execute `make format`. 42 | 43 | ``` 44 | $ make clean 45 | Cleaning up 46 | rm -rf bin rendered tmp templates/grafonnet-lib 47 | ``` 48 | 49 | ## Templates available 50 | 51 | Dashboards available after migration to Grafonnet v10.1.0 (latest): 52 | - CPT 53 | - [x] Ingress Perf Dashboard. 54 | - [x] K8s Netperf Dashboard. 55 | - [x] Kube-burner Report Mode Dashboard. 56 | - [x] Kube Burner Report OCP Wrapper dashboard. 57 | - General 58 | - [x] API Performance Dashboard. 59 | - [x] Cilium K8s Performance Dashboard. 60 | - [x] Etcd Dashboard. 61 | - [x] Hypershift Performance Dashboard. 62 | - [x] K8s Performance Dashboard. 63 | - [x] OpenShift Performance Dashboard. 64 | - [x] OVN Dashboard. 65 | - [x] Pgbench Dashboard. 66 | - [x] UPerf Results Dashboard. 67 | - [x] Vegeta Dashboard. 68 | - [x] YCSB Dashboard. 69 | 70 | ## Dittybopper 71 | 72 | Dittybopper is a tool meant to deploy a grafana instance with certain dashboards on top of a running OpenShift 4.X cluster. Find more info [here](./dittybopper/README.md) 73 | 74 | ## Contributing 75 | 76 | To contribute to this repository, submit a PR with your changes. If you're adding or modifying a panel, a screenshot with the changes you've made will ease 77 | reviewers' work. 78 | 79 | In addition, make sure to lint your modifications to jsonnet files if you don't want our CI to complain. You can do that by executing `make format`.
80 | 81 | ## Tested versions 82 | 83 | The dashboards from this repository have been tested with Grafana 9.X -------------------------------------------------------------------------------- /templates/General/k8s-perf.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/k8s-perf/panels.libsonnet'; 2 | local queries = import '../../assets/k8s-perf/queries.libsonnet'; 3 | local variables = import '../../assets/k8s-perf/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('k8s Performance dashboard') 7 | + g.dashboard.time.withFrom('now-1h') 8 | + g.dashboard.time.withTo('now') 9 | + g.dashboard.withTimezone('utc') 10 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 11 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 12 | + g.dashboard.withRefresh('30s') 13 | + g.dashboard.withEditable(false) 14 | + g.dashboard.graphTooltip.withSharedCrosshair() 15 | + g.dashboard.withVariables([ 16 | variables.Datasource, 17 | variables._worker_node, 18 | variables.namespace, 19 | variables.block_device, 20 | variables.net_device, 21 | variables.interval, 22 | ]) 23 | 24 | + g.dashboard.withPanels([ 25 | g.panel.row.new('Cluster Details') 26 | + g.panel.row.withCollapsed(true) 27 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 28 | + g.panel.row.withPanels([ 29 | panels.stat.genericStatLegendPanel('Current Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }), 30 | panels.stat.genericStatLegendPanel('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }), 31 | panels.stat.genericStatLegendPanel('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }), 32 | 
panels.timeSeries.genericTimeSeriesPanel('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 12, w: 8, h: 8 }), 33 | panels.timeSeries.genericTimeSeriesPanel('Namespace count', 'none', queries.namespaceCount.query(), { x: 8, y: 12, w: 8, h: 8 }), 34 | panels.timeSeries.genericTimeSeriesPanel('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }), 35 | panels.timeSeries.genericTimeSeriesPanel('Secret & configmap count', 'none', queries.secretAndConfigMapCount.query(), { x: 0, y: 20, w: 8, h: 8 }), 36 | panels.timeSeries.genericTimeSeriesPanel('Deployment count', 'none', queries.deployCount.query(), { x: 8, y: 20, w: 8, h: 8 }), 37 | panels.timeSeries.genericTimeSeriesPanel('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 20, w: 8, h: 8 }), 38 | panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 28, w: 24, h: 8 }), 39 | panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 36, w: 12, h: 8 }), 40 | panels.timeSeries.genericTimeSeriesPanel('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 36, w: 12, h: 8 }), 41 | panels.timeSeries.genericTimeSeriesLegendPanel('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 44, w: 24, h: 8 }), 42 | ]), 43 | 44 | g.panel.row.new('Node: $_worker_node') 45 | + g.panel.row.withCollapsed(true) 46 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 47 | + g.panel.row.withRepeat('_worker_node') 48 | + g.panel.row.withPanels([ 49 | panels.timeSeries.genericTimeSeriesLegendPanel('CPU Basic: $_worker_node ', 'percent', queries.basicCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }), 50 | panels.timeSeries.genericTimeSeriesLegendPanel('System Memory: $_worker_node ', 'bytes', queries.systemMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }), 51 | 
panels.timeSeries.genericTimeSeriesLegendPanel('Disk throughput: $_worker_node ', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }), 52 | panels.timeSeries.genericTimeSeriesLegendPanel('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }), 53 | panels.timeSeries.genericTimeSeriesLegendPanel('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }), 54 | panels.timeSeries.genericTimeSeriesLegendPanel('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }), 55 | panels.timeSeries.genericTimeSeriesLegendPanel('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), 56 | panels.timeSeries.genericTimeSeriesLegendPanel('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), 57 | panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainersCPU.query('$_worker_node'), { x: 0, y: 32, w: 12, h: 8 }), 58 | panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainersRSS.query('$_worker_node'), { x: 12, y: 32, w: 12, h: 8 }), 59 | 60 | ]), 61 | ]) 62 | -------------------------------------------------------------------------------- /assets/etcd-on-cluster-dashboard/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | local standardOptions = timeSeries.standardOptions, 9 | local byRegexp =
timeSeries.standardOptions.override.byRegexp, 10 | 11 | base(title, unit, targets, gridPos): 12 | timeSeries.new(title) 13 | + timeSeries.queryOptions.withTargets(targets) 14 | + timeSeries.datasource.withType('prometheus') 15 | + timeSeries.standardOptions.withUnit(unit) 16 | + timeSeries.gridPos.withX(gridPos.x) 17 | + timeSeries.gridPos.withY(gridPos.y) 18 | + timeSeries.gridPos.withH(gridPos.h) 19 | + timeSeries.gridPos.withW(gridPos.w) 20 | + custom.withDrawStyle('line') 21 | + custom.withLineInterpolation('linear') 22 | + custom.withBarAlignment(0) 23 | + custom.withLineWidth(1) 24 | + custom.withFillOpacity(10) 25 | + custom.withGradientMode('none') 26 | + custom.withSpanNulls(false) 27 | + custom.withPointSize(5) 28 | + custom.withSpanNulls(false) 29 | + custom.stacking.withMode('none') 30 | + custom.withShowPoints('never') 31 | + options.tooltip.withMode('multi') 32 | + options.tooltip.withSort('desc') 33 | + options.legend.withShowLegend(true) 34 | + options.legend.withPlacement('bottom'), 35 | 36 | generalUsageAgg(title, unit, targets, gridPos): 37 | self.base(title, unit, targets, gridPos) 38 | + options.legend.withCalcs([ 39 | 'mean', 40 | 'max', 41 | ]) 42 | + options.legend.withDisplayMode('table') 43 | + options.legend.withSortBy('Max') 44 | + options.legend.withSortDesc(true), 45 | 46 | generalCounter(title, unit, targets, gridPos): 47 | self.base(title, unit, targets, gridPos) 48 | + options.legend.withCalcs([ 49 | 'first', 50 | 'min', 51 | 'max', 52 | 'last', 53 | ]), 54 | 55 | histogramStatsRightHand(title, unit, targets, gridPos, leftAxis): 56 | self.generalCounter(title, unit, targets, gridPos) 57 | + custom.withAxisLabel(leftAxis) 58 | + options.legend.withDisplayMode('table') 59 | + options.legend.withSortBy('Max') 60 | + standardOptions.withOverrides([ 61 | byRegexp.new('.*rate.*') 62 | + byRegexp.withProperty('custom.axisPlacement', 'right') 63 | + byRegexp.withProperty('custom.axisLabel', 'rate') 64 | + byRegexp.withProperty('unit', 
'none'), 65 | ]), 66 | 67 | withoutCalcsAgg(title, unit, targets, gridPos): 68 | self.base(title, unit, targets, gridPos) 69 | + options.legend.withCalcs([]) 70 | + options.legend.withDisplayMode('table'), 71 | 72 | GeneralInfoAgg(title, unit, targets, gridPos): 73 | self.base(title, unit, targets, gridPos) 74 | + options.legend.withCalcs([ 75 | 'mean', 76 | 'max', 77 | ]) 78 | + options.legend.withDisplayMode('list'), 79 | 80 | GeneralInfo(title, unit, targets, gridPos): 81 | self.base(title, unit, targets, gridPos) 82 | + options.legend.withCalcs([]) 83 | + options.legend.withDisplayMode('list'), 84 | }, 85 | 86 | stat: { 87 | local stat = g.panel.stat, 88 | local options = stat.options, 89 | 90 | base(title, unit, targets, gridPos): 91 | stat.new(title) 92 | + stat.datasource.withType('prometheus') 93 | + stat.datasource.withUid('$Datasource') 94 | + stat.standardOptions.withUnit(unit) 95 | + stat.queryOptions.withTargets(targets) 96 | + stat.gridPos.withX(gridPos.x) 97 | + stat.gridPos.withY(gridPos.y) 98 | + stat.gridPos.withH(gridPos.h) 99 | + stat.gridPos.withW(gridPos.w) 100 | + options.withJustifyMode('auto') 101 | + options.withGraphMode('none') 102 | + options.text.withTitleSize(12) 103 | + stat.standardOptions.color.withMode('thresholds') 104 | + options.withColorMode('none'), 105 | 106 | 107 | etcdLeader(title, unit, target, gridPos): 108 | self.base(title, unit, target, gridPos) 109 | + stat.options.reduceOptions.withCalcs([ 110 | 'mean', 111 | ]) 112 | + stat.standardOptions.withMappings({ 113 | type: 'value', 114 | options: { 115 | '0': { 116 | text: 'NO', 117 | }, 118 | '1': { 119 | text: 'YES', 120 | }, 121 | }, 122 | }), 123 | 124 | failedProposalsSeen(title, unit, target, gridPos): 125 | self.base(title, unit, target, gridPos) 126 | + stat.options.reduceOptions.withCalcs([ 127 | 'mean', 128 | ]) 129 | + stat.standardOptions.withMappings( 130 | { 131 | type: 'special', 132 | options: { 133 | match: 'null', 134 | result: { 135 | text: 'N/A', 136 | 
}, 137 | }, 138 | } 139 | ), 140 | }, 141 | } 142 | -------------------------------------------------------------------------------- /dittybopper/syncer/entrypoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import requests 5 | import uuid 6 | import time 7 | from collections import defaultdict 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | 12 | class GrafanaOperations: 13 | """ 14 | This class is responsible for Grafana operations 15 | """ 16 | def __init__(self, grafana_url: str, input_directory: str, git_commit_hash: str): 17 | self.grafana_url = grafana_url 18 | self.input_directory = input_directory 19 | self.git_commit_hash = git_commit_hash if git_commit_hash else '' 20 | self.dashboards = defaultdict(list) 21 | self.folder_map = dict() 22 | self.logger = logging.getLogger(__name__) 23 | 24 | def fetch_all_dashboards(self): 25 | """ 26 | This method fetches all rendered dashboards 27 | :return: 28 | """ 29 | self.get_all_folders() 30 | self.folder_map['General'] = None 31 | for root, _, files in os.walk(self.input_directory): 32 | folder_name = os.path.basename(root) 33 | json_files = [os.path.join(root, filename) for filename in files if filename.endswith(".json")] 34 | folder_name = "General" if (folder_name == "") else folder_name 35 | if folder_name in self.folder_map: 36 | folder_id = self.folder_map[folder_name] 37 | else: 38 | folder_id = self.create_folder(folder_name) 39 | self.dashboards[folder_id].extend(json_files) 40 | 41 | def get_all_folders(self): 42 | """ 43 | This method gets the entire list of folders in grafana 44 | :return: 45 | """ 46 | headers = { 47 | "Content-Type": "application/json", 48 | "Accept": "application/json", 49 | } 50 | try: 51 | response = requests.get( 52 | f"{self.grafana_url}/api/folders", 53 | headers=headers, 54 | ) 55 | response_json = response.json() 56 | self.folder_map = {each_folder['title']: 
each_folder['id'] for each_folder in response_json} 57 | except requests.exceptions.RequestException as e: 58 | raise Exception(f"Error listing folders. Message: {e}") 59 | 60 | def create_folder(self, folder_name): 61 | """ 62 | This method creates a folder in grafana 63 | :return: 64 | """ 65 | uid = str(uuid.uuid4()) 66 | headers = { 67 | "Content-Type": "application/json", 68 | "Accept": "application/json", 69 | } 70 | try: 71 | response = requests.post( 72 | f"{self.grafana_url}/api/folders", 73 | headers=headers, 74 | json={ 75 | "title": folder_name, 76 | "uid": uid, 77 | }, 78 | ) 79 | response_json = response.json() 80 | self.folder_map[folder_name] = id 81 | return response_json['id'] 82 | 83 | except requests.exceptions.RequestException as e: 84 | raise Exception(f"Error creating folder with name:'{self.folder_name}' and uid:'{uid}'. Message: {e}") 85 | 86 | def read_dashboard_json(self, json_file): 87 | """ 88 | This method reads dashboard from json file 89 | :return: 90 | """ 91 | with open(json_file, 'r') as f: 92 | return json.load(f) 93 | 94 | def create_dashboards(self): 95 | """ 96 | This method creates/updates dashboard with new json 97 | :return: 98 | """ 99 | headers = { 100 | "Content-Type": "application/json", 101 | "Accept": "application/json", 102 | } 103 | for folder_id, files in self.dashboards.items(): 104 | for json_file in set(files): 105 | dashboard_json = self.read_dashboard_json(json_file) 106 | if "tags" in dashboard_json.keys(): 107 | dashboard_json["tags"].append(self.git_commit_hash) 108 | else: 109 | dashboard_json["tags"] = [self.git_commit_hash] 110 | try: 111 | response = requests.post( 112 | f"{self.grafana_url}/api/dashboards/db", 113 | headers=headers, 114 | json={ 115 | "dashboard": dashboard_json, 116 | "folderId": folder_id, 117 | "overwrite": True, 118 | }, 119 | ) 120 | if response.status_code == 200: 121 | self.logger.info(f"Dashboard '{dashboard_json['title']}' created successfully in folder '{folder_id}'") 122 | 
else: 123 | raise Exception( 124 | f"Failed to create dashboard '{dashboard_json['title']}' in folder '{folder_id}'. Status code: {response.status_code}. Message: {response.text}") 125 | 126 | except requests.exceptions.RequestException as e: 127 | raise Exception(f"Error creating dashboard '{dashboard_json['title']}' in folder '{folder_id}'. Message: {e}") 128 | 129 | if __name__ == '__main__': 130 | grafana_operations = GrafanaOperations(os.environ.get("GRAFANA_URL"), os.environ.get("INPUT_DIR"), os.environ.get("GIT_COMMIT_HASH")) 131 | grafana_operations.fetch_all_dashboards() 132 | grafana_operations.create_dashboards() 133 | while True: 134 | time.sleep(60) 135 | -------------------------------------------------------------------------------- /templates/General/cilium-k8s-perf.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/cilium-k8s-perf/panels.libsonnet'; 2 | local queries = import '../../assets/cilium-k8s-perf/queries.libsonnet'; 3 | local variables = import '../../assets/cilium-k8s-perf/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('Cilium k8s Performance dashboard') 7 | + g.dashboard.time.withFrom('now-1h') 8 | + g.dashboard.time.withTo('now') 9 | + g.dashboard.withTimezone('utc') 10 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 11 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 12 | + g.dashboard.withRefresh('30s') 13 | + g.dashboard.withEditable(false) 14 | + g.dashboard.graphTooltip.withSharedCrosshair() 15 | + g.dashboard.withVariables([ 16 | variables.Datasource, 17 | variables._worker_node, 18 | variables.namespace, 19 | variables.block_device, 20 | variables.net_device, 21 | variables.interval, 22 | ]) 23 | 24 | + g.dashboard.withPanels([ 25 | 
g.panel.row.new('Cilium Details') 26 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 27 | + g.panel.row.withCollapsed(true) 28 | + g.panel.row.withPanels([ 29 | panels.timeSeries.withCiliumAgg('Cilium Controller Failures', 'none', queries.ciliumControllerFailures.query(), { x: 0, y: 1, w: 12, h: 8 }), 30 | panels.timeSeries.withCiliumAgg('Cilium IP Address Allocation', 'none', queries.ciliumIPAddressAllocation.query(), { x: 12, y: 1, w: 12, h: 8 }), 31 | panels.timeSeries.withCiliumAgg('Cilium Container CPU', 'percent', queries.ciliumContainerCPU.query(), { x: 0, y: 9, w: 12, h: 8 }), 32 | panels.timeSeries.withCiliumAgg('Cilium Container Memory', 'bytes', queries.ciliumConatinerMemory.query(), { x: 12, y: 9, w: 12, h: 8 }), 33 | panels.timeSeries.withCiliumAgg('Cilium Network Polices Per Agent', 'none', queries.ciliumNetworkPolicesPerAgent.query(), { x: 0, y: 17, w: 12, h: 8 }), 34 | panels.timeSeries.withCiliumAgg('Cilium BPF Operations', 'none', queries.ciliumBPFOperations.query(), { x: 12, y: 17, w: 12, h: 8 }), 35 | ]), 36 | g.panel.row.new('Cluster Details') 37 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 38 | + g.panel.row.withCollapsed(true) 39 | + g.panel.row.withPanels([ 40 | panels.stat.withclusterAgg('Current Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 26, w: 8, h: 3 }), 41 | panels.stat.withclusterAgg('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 26, w: 8, h: 3 }), 42 | panels.stat.withclusterAgg('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 26, w: 8, h: 3 }), 43 | panels.timeSeries.withClusterAgg('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 29, w: 8, h: 8 }), 44 | panels.timeSeries.withClusterAgg('Namespace count', 'none', queries.namespaceCount.query(), { x: 8, y: 29, w: 8, h: 8 }), 45 | panels.timeSeries.withClusterAgg('Pod count', 'none', queries.podCount.query(), { x: 16, y: 29, w: 8, h: 8 }), 46 | 
panels.timeSeries.withClusterAgg('Secret & configmap count', 'none', queries.secretConfigmapCount.query(), { x: 0, y: 37, w: 8, h: 8 }), 47 | panels.timeSeries.withClusterAgg('Deployment count', 'none', queries.deploymentCount.query(), { x: 8, y: 37, w: 8, h: 8 }), 48 | panels.timeSeries.withClusterAgg('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 37, w: 8, h: 8 }), 49 | panels.timeSeries.withCiliumAgg('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 45, w: 24, h: 8 }), 50 | panels.timeSeries.withCiliumAgg('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 53, w: 12, h: 8 }), 51 | panels.timeSeries.withClusterAgg('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 53, w: 12, h: 8 }), 52 | panels.timeSeries.withCiliumAgg('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 61, w: 24, h: 8 }), 53 | ]), 54 | g.panel.row.new('Node: $_worker_node') 55 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 56 | + g.panel.row.withCollapsed(true) 57 | + g.panel.row.withRepeat('_worker_node') 58 | + g.panel.row.withPanels([ 59 | panels.timeSeries.withCiliumAgg('CPU Basic: $_worker_node', 'percent', queries.CPUBasic.query(), { x: 0, y: 70, w: 12, h: 8 }), 60 | panels.timeSeries.withCiliumAgg('System Memory: $_worker_node', 'bytes', queries.systemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }), 61 | panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }), 62 | panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }), 63 | panels.timeSeries.withCiliumAgg('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query(), { x: 0, y: 86, w: 12, h: 8 }), 64 | panels.timeSeries.withCiliumAgg('Network Packets: $_worker_node', 'pps', queries.networkPackets.query(), { x: 12, y: 86, w: 12, h: 8 }), 65 | 
panels.timeSeries.withCiliumAgg('Network packets drop: $_worker_node', 'pps', queries.networkPacketDrop.query(), { x: 0, y: 94, w: 12, h: 8 }), 66 | panels.timeSeries.withCiliumAgg('Conntrack stats: $_worker_node', '', queries.conntrackStats.query(), { x: 12, y: 94, w: 12, h: 8 }), 67 | panels.timeSeries.withCiliumAgg('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPUNode.query(), { x: 0, y: 102, w: 12, h: 8 }), 68 | panels.timeSeries.withCiliumAgg('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSSNode.query(), { x: 12, y: 102, w: 12, h: 8 }), 69 | ]), 70 | ]) 71 | -------------------------------------------------------------------------------- /assets/uperf/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | 9 | base(title, unit, targets, gridPos): 10 | timeSeries.new(title) 11 | + timeSeries.queryOptions.withTargets(targets) 12 | + timeSeries.datasource.withType('elasticsearch') 13 | + timeSeries.datasource.withUid('$Datasource') 14 | + timeSeries.standardOptions.withUnit(unit) 15 | + timeSeries.gridPos.withX(gridPos.x) 16 | + timeSeries.gridPos.withY(gridPos.y) 17 | + timeSeries.gridPos.withH(gridPos.h) 18 | + timeSeries.gridPos.withW(gridPos.w) 19 | + custom.withSpanNulls(false) 20 | + options.tooltip.withMode('multi') 21 | + options.tooltip.withSort('none') 22 | + options.legend.withShowLegend(true) 23 | + timeSeries.queryOptions.withTimeFrom(null) 24 | + timeSeries.queryOptions.withTimeShift(null) 25 | + timeSeries.panelOptions.withTransparent(true), 26 | 27 | uperfPerformance(title, unit, targets, gridPos): 28 | self.base(title, unit, targets, gridPos) 29 | + options.legend.withCalcs([ 30 | 'mean', 31 | 
'max', 32 | ]) 33 | + options.legend.withShowLegend(true) 34 | + options.legend.withDisplayMode('table') 35 | + options.legend.withPlacement('bottom') 36 | + custom.withLineWidth(1) 37 | + custom.withFillOpacity(10) 38 | + custom.withPointSize(5) 39 | + custom.withSpanNulls(true) 40 | + custom.withShowPoints('never'), 41 | }, 42 | 43 | table: { 44 | local table = g.panel.table, 45 | local custom = table.fieldConfig.defaults.custom, 46 | local options = table.options, 47 | 48 | base(title, targets, gridPos): 49 | table.new(title) 50 | + table.queryOptions.withTargets(targets) 51 | + table.datasource.withType('elasticsearch') 52 | + table.datasource.withUid('$Datasource') 53 | + table.gridPos.withX(gridPos.x) 54 | + table.gridPos.withY(gridPos.y) 55 | + table.gridPos.withH(gridPos.h) 56 | + table.gridPos.withW(gridPos.w) 57 | + options.withShowHeader(true) 58 | + options.footer.TableFooterOptions.withShow(false) 59 | + options.footer.TableFooterOptions.withReducer('sum') 60 | + options.footer.TableFooterOptions.withCountRows(false) 61 | + custom.withAlign('auto') 62 | + custom.withInspect(false) 63 | + table.panelOptions.withTransparent(true) 64 | + table.queryOptions.withTimeFrom(null) 65 | + table.queryOptions.withTimeShift(null) 66 | + table.standardOptions.color.withMode('thresholds') 67 | + table.queryOptions.withTransformations([ 68 | { 69 | id: 'seriesToColumns', 70 | options: { 71 | reducers: [], 72 | }, 73 | }, 74 | ]) 75 | + table.standardOptions.withOverrides([ 76 | { 77 | matcher: { 78 | id: 'byName', 79 | options: 'message_size', 80 | }, 81 | properties: [ 82 | { 83 | id: 'unit', 84 | value: '', 85 | }, 86 | { 87 | id: 'custom.align', 88 | value: null, 89 | }, 90 | ], 91 | }, 92 | { 93 | matcher: { 94 | id: 'byName', 95 | options: 'Average norm_byte', 96 | }, 97 | properties: [ 98 | { 99 | id: 'unit', 100 | value: 'bps', 101 | }, 102 | { 103 | id: 'decimals', 104 | value: '2', 105 | }, 106 | { 107 | id: 'custom.align', 108 | value: null, 109 | }, 110 | 
], 111 | }, 112 | { 113 | matcher: { 114 | id: 'byName', 115 | options: 'Average norm_ops', 116 | }, 117 | properties: [ 118 | { 119 | id: 'unit', 120 | value: 'none', 121 | }, 122 | { 123 | id: 'decimals', 124 | value: '0', 125 | }, 126 | { 127 | id: 'custom.align', 128 | value: null, 129 | }, 130 | ], 131 | }, 132 | { 133 | matcher: { 134 | id: 'byName', 135 | options: 'Average norm_ltcy', 136 | }, 137 | properties: [ 138 | { 139 | id: 'unit', 140 | value: 'µs', 141 | }, 142 | { 143 | id: 'decimals', 144 | value: '2', 145 | }, 146 | { 147 | id: 'custom.align', 148 | value: null, 149 | }, 150 | ], 151 | }, 152 | { 153 | matcher: { 154 | id: 'byName', 155 | options: 'Count', 156 | }, 157 | properties: [ 158 | { 159 | id: 'displayName', 160 | value: 'Sample count', 161 | }, 162 | { 163 | id: 'unit', 164 | value: 'short', 165 | }, 166 | { 167 | id: 'decimals', 168 | value: '2', 169 | }, 170 | { 171 | id: 'custom.align', 172 | value: null, 173 | }, 174 | ], 175 | }, 176 | ]), 177 | }, 178 | 179 | } 180 | -------------------------------------------------------------------------------- /assets/pgbench-dashboard/queries.libsonnet: -------------------------------------------------------------------------------- 1 | local variables = import './variables.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local elasticsearch = g.query.elasticsearch; 4 | 5 | { 6 | tps_report: { 7 | query(): 8 | elasticsearch.withAlias(null) 9 | + elasticsearch.withBucketAggs([ 10 | elasticsearch.bucketAggs.DateHistogram.withField('timestamp') 11 | + elasticsearch.bucketAggs.DateHistogram.withId('2') 12 | + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram') 13 | + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto') 14 | + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount(0) 15 | + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone('utc') 16 | + 
elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(0), 17 | ]) 18 | + elasticsearch.withMetrics([ 19 | elasticsearch.metrics.MetricAggregationWithSettings.Sum.withField('tps') 20 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.withId('1') 21 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.withType('sum') 22 | + elasticsearch.metrics.MetricAggregationWithSettings.CumulativeSum.withPipelineAgg('select metric') 23 | + elasticsearch.metrics.MetricAggregationWithSettings.BucketScript.pipelineVariables.withName('var1') 24 | + elasticsearch.metrics.MetricAggregationWithSettings.BucketScript.pipelineVariables.withPipelineAgg('select metric'), 25 | ]) 26 | + elasticsearch.withQuery('(user = $user) AND (uuid = $uuid)') 27 | + elasticsearch.withTimeField('timestamp'), 28 | }, 29 | 30 | avg_tps: { 31 | query(): 32 | elasticsearch.withAlias(null) 33 | + elasticsearch.withBucketAggs([ 34 | elasticsearch.bucketAggs.Terms.withField('description.keyword') 35 | + elasticsearch.bucketAggs.Terms.withId('6') 36 | + elasticsearch.bucketAggs.Terms.withType('terms') 37 | + elasticsearch.bucketAggs.Terms.settings.withOrder('asc') 38 | + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') 39 | + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1) 40 | + elasticsearch.bucketAggs.Terms.settings.withSize('10'), 41 | elasticsearch.bucketAggs.DateHistogram.withField('timestamp') 42 | + elasticsearch.bucketAggs.DateHistogram.withId('4') 43 | + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram') 44 | + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto') 45 | + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount(0) 46 | + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone('utc') 47 | + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(0), 48 | ]) 49 | + elasticsearch.withMetrics([ 50 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('tps_incl_con_est') 51 | + 
elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('1') 52 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg') 53 | + elasticsearch.metrics.MetricAggregationWithSettings.CumulativeSum.withPipelineAgg('select metric'), 54 | ]) 55 | + elasticsearch.withQuery('(uuid.keyword=$uuid) AND (user.keyword=$user)') 56 | + elasticsearch.withTimeField('timestamp'), 57 | }, 58 | 59 | latency_report: { 60 | query(): 61 | elasticsearch.withAlias(null) 62 | + elasticsearch.withBucketAggs([ 63 | elasticsearch.bucketAggs.DateHistogram.withField('timestamp') 64 | + elasticsearch.bucketAggs.DateHistogram.withId('2') 65 | + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram') 66 | + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto') 67 | + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount(0) 68 | + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone('utc') 69 | + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(0), 70 | ]) 71 | + elasticsearch.withMetrics([ 72 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('latency_ms') 73 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('1') 74 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'), 75 | ]) 76 | + elasticsearch.withQuery('(uuid.keyword=$uuid) AND (user.keyword=$user)') 77 | + elasticsearch.withTimeField('timestamp'), 78 | }, 79 | 80 | results: { 81 | query(): 82 | elasticsearch.withAlias(null) 83 | + elasticsearch.withBucketAggs([ 84 | elasticsearch.bucketAggs.Terms.withField('user.keyword') 85 | + elasticsearch.bucketAggs.Terms.withId('1') 86 | + elasticsearch.bucketAggs.Terms.withType('terms') 87 | + elasticsearch.bucketAggs.Terms.settings.withOrder('desc') 88 | + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') 89 | + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1) 90 | + elasticsearch.bucketAggs.Terms.settings.withSize('10'), 91 
| ]) 92 | + elasticsearch.withMetrics([ 93 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('latency_ms') 94 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('4') 95 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'), 96 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('tps') 97 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('20') 98 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'), 99 | 100 | ]) 101 | + elasticsearch.withQuery('(uuid.keyword=$uuid) AND (user.keyword=$user)') 102 | + elasticsearch.withTimeField('timestamp'), 103 | }, 104 | } 105 | -------------------------------------------------------------------------------- /dittybopper/k8s-deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | function _usage { 6 | cat <] [-n ] [-p ] 12 | 13 | $(basename "${0}") [-i ] 14 | 15 | $(basename "${0}") [-d] [-n ] 16 | 17 | -c : The (c)ommand to use for k8s admin (defaults to 'kubectl' for now) 18 | 19 | -n : The (n)amespace in which to deploy the Grafana instance 20 | (defaults to 'dittybopper') 21 | 22 | -p : The (p)assword to configure for the Grafana admin user 23 | (defaults to 'admin') 24 | 25 | -i : (I)mport dashboard from given path. Using this flag will 26 | bypass the deployment process and only do the import to an 27 | already-running Grafana pod. Can be a local path or a remote 28 | URL beginning with http. 
29 | 30 | -d : (D)elete an existing deployment 31 | 32 | -h : Help 33 | 34 | END 35 | } 36 | 37 | # Set default template variables 38 | 39 | export PROMETHEUS_USER=internal 40 | export GRAFANA_ADMIN_PASSWORD=admin 41 | export GRAFANA_URL="http://admin:${GRAFANA_ADMIN_PASSWORD}@localhost:3000" 42 | export DASHBOARDS="k8s-performance.json" 43 | export SYNCER_IMAGE=${SYNCER_IMAGE:-"quay.io/cloud-bulldozer/dittybopper-syncer:latest"} # Syncer image 44 | export GRAFANA_IMAGE=${GRAFANA_IMAGE:-"quay.io/cloud-bulldozer/grafana:9.4.3"} # Syncer image 45 | 46 | 47 | # Set defaults for command options 48 | k8s_cmd='kubectl' 49 | namespace='dittybopper' 50 | grafana_default_pass=True 51 | 52 | # Other vars 53 | deploy_template="templates/k8s-dittybopper.yaml.template" 54 | 55 | # Capture and act on command options 56 | while getopts ":c:m:n:p:i:dh" opt; do 57 | case ${opt} in 58 | c) 59 | k8s_cmd=${OPTARG} 60 | ;; 61 | n) 62 | namespace="${OPTARG}" 63 | ;; 64 | p) 65 | export GRAFANA_ADMIN_PASSWORD=${OPTARG} 66 | grafana_default_pass=False 67 | ;; 68 | i) 69 | dash_import+=(${OPTARG}) 70 | ;; 71 | d) 72 | delete=True 73 | ;; 74 | h) 75 | _usage 76 | exit 1 77 | ;; 78 | \?) 
79 | echo -e "\033[32mERROR: Invalid option -${OPTARG}\033[0m" >&2 80 | _usage 81 | exit 1 82 | ;; 83 | :) 84 | echo -e "\033[32mERROR: Option -${OPTARG} requires an argument.\033[0m" >&2 85 | _usage 86 | exit 1 87 | ;; 88 | esac 89 | done 90 | 91 | 92 | echo "${dash_import[@]}" 93 | echo -e "\033[32m 94 | ____ _ __ __ __ 95 | / __ \(_) /_/ /___ __/ /_ ____ ____ ____ ___ _____ 96 | / / / / / __/ __/ / / / __ \/ __ \/ __ \/ __ \/ _ \/ ___/ 97 | / /_/ / / /_/ /_/ /_/ / /_/ / /_/ / /_/ / /_/ / __/ / 98 | /_____/_/\__/\__/\__, /_.___/\____/ .___/ .___/\___/_/ 99 | /____/ /_/ /_/ 100 | 101 | \033[0m" 102 | echo "Using k8s command: $k8s_cmd" 103 | echo "Using namespace: $namespace" 104 | if [[ ${grafana_default_pass} ]]; then 105 | echo "Using default grafana password: ${GRAFANA_ADMIN_PASSWORD}" 106 | else 107 | echo "Using custom grafana password." 108 | fi 109 | 110 | 111 | # Get environment values 112 | echo "" 113 | echo -e "\033[32mGetting environment vars...\033[0m" 114 | export PROMETHEUS_URL=http://$($k8s_cmd get endpoints -n prometheus prometheus-server -o jsonpath="{.subsets[0].addresses[0].ip}"):$($k8s_cmd get endpoints -n prometheus prometheus-server -o jsonpath="{.subsets[0].ports[0].port}") 115 | echo "Prometheus URL is: ${PROMETHEUS_URL}" 116 | 117 | function namespace() { 118 | # Create namespace 119 | $k8s_cmd "$1" namespace "$namespace" 120 | } 121 | 122 | function grafana() { 123 | envsubst < ${deploy_template} | $k8s_cmd "$1" -n "$namespace" -f - 124 | if [[ ! 
$delete ]]; then 125 | echo "" 126 | echo -e "\033[32mWaiting for dittybopper deployment to be available...\033[0m" 127 | $k8s_cmd wait --for=condition=available -n $namespace deployment/dittybopper --timeout=60s 128 | fi 129 | } 130 | 131 | function dash_import(){ 132 | echo -e "\033[32mImporting dashboards...\033[0m" 133 | for dash in ${dash_import[@]}; do 134 | if [[ $dash =~ ^http ]]; then 135 | echo "Fetching remote dashboard $dash" 136 | dashfile="/tmp/$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 8)" 137 | curl -sS $dash -o $dashfile 138 | else 139 | echo "Using local dashboard ${dash}" 140 | dashfile=$dash 141 | fi 142 | dashboard=$(cat ${dashfile}) 143 | echo "{\"dashboard\": ${dashboard}, \"overwrite\": true}" | \ 144 | curl -Ss -XPOST -H "Content-Type: application/json" -H "Accept: application/json" -d@- \ 145 | "http://admin:${GRAFANA_ADMIN_PASSWORD}@127.0.0.1:3000/api/dashboards/db" -o /dev/null 146 | done 147 | } 148 | 149 | if [[ $delete ]]; then 150 | echo "" 151 | echo -e "\033[32mDeleting Grafana...\033[0m" 152 | grafana "delete" 153 | echo "" 154 | echo -e "\033[32mDeleting namespace...\033[0m" 155 | namespace "delete" 156 | echo "" 157 | echo -e "\033[32mDeployment deleted!\033[0m" 158 | else 159 | echo "" 160 | echo -e "\033[32mCreating namespace...\033[0m" 161 | # delete the namespace if it already exists to make sure the latest version of the dashboards are deployed and also to support the case where user wants to redeploy dittybopper without having to delete the namespace manually 162 | if [[ $($k8s_cmd get namespaces | grep -w $namespace) ]]; then 163 | echo "Looks like the namespace $namespace already exists, deleting it" 164 | namespace "delete" 165 | fi 166 | namespace "create" 167 | echo "" 168 | echo -e "\033[32mDeploying Grafana...\033[0m" 169 | grafana "apply" 170 | echo "" 171 | $k8s_cmd -n $namespace port-forward service/dittybopper 3000 & 172 | # Ugly, but need to slow things down when opening the port-forward 173 | sleep 5 174 
| dash_import 175 | echo "You can access the Grafana instance at http://127.0.0.1:3000" 176 | fi 177 | -------------------------------------------------------------------------------- /assets/vegeta-wrapper/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | 9 | base(title, unit, targets, gridPos): 10 | timeSeries.new(title) 11 | + timeSeries.queryOptions.withTargets(targets) 12 | + timeSeries.datasource.withType('elasticsearch') 13 | + timeSeries.datasource.withUid('$Datasource') 14 | + timeSeries.standardOptions.withUnit(unit) 15 | + timeSeries.gridPos.withX(gridPos.x) 16 | + timeSeries.gridPos.withY(gridPos.y) 17 | + timeSeries.gridPos.withH(gridPos.h) 18 | + timeSeries.gridPos.withW(gridPos.w) 19 | + custom.withSpanNulls(false) 20 | + options.tooltip.withMode('multi') 21 | + options.tooltip.withSort('none') 22 | + options.legend.withShowLegend(true) 23 | + timeSeries.queryOptions.withTimeFrom(null) 24 | + timeSeries.queryOptions.withTimeShift(null) 25 | + timeSeries.panelOptions.withTransparent(true), 26 | 27 | legendDisplayModeTable(title, unit, targets, gridPos): 28 | self.base(title, unit, targets, gridPos) 29 | + options.legend.withCalcs([ 30 | 'mean', 31 | 'max', 32 | ]) 33 | + options.legend.withShowLegend(true) 34 | + options.legend.withDisplayMode('table') 35 | + options.legend.withPlacement('bottom') 36 | + custom.withLineWidth(1) 37 | + custom.withFillOpacity(20) 38 | + custom.withPointSize(5) 39 | + custom.withSpanNulls(true) 40 | + custom.withShowPoints('never'), 41 | }, 42 | 43 | table: { 44 | local table = g.panel.table, 45 | local custom = table.fieldConfig.defaults.custom, 46 | local options = table.options, 47 | 48 | base(title, targets, 
gridPos): 49 | table.new(title) 50 | + table.queryOptions.withTargets(targets) 51 | + table.datasource.withType('elasticsearch') 52 | + table.datasource.withUid('$Datasource') 53 | + table.gridPos.withX(gridPos.x) 54 | + table.gridPos.withY(gridPos.y) 55 | + table.gridPos.withH(gridPos.h) 56 | + table.gridPos.withW(gridPos.w) 57 | + options.withShowHeader(true) 58 | + options.footer.TableFooterOptions.withShow(false) 59 | + options.footer.TableFooterOptions.withReducer('sum') 60 | + options.footer.TableFooterOptions.withCountRows(false) 61 | + custom.withAlign('auto') 62 | + custom.withInspect(false) 63 | + table.panelOptions.withTransparent(true) 64 | + table.queryOptions.withTimeFrom(null) 65 | + table.queryOptions.withTimeShift(null) 66 | + table.standardOptions.color.withMode('thresholds') 67 | + table.queryOptions.withTransformations([ 68 | { 69 | id: 'seriesToColumns', 70 | options: { 71 | reducers: [], 72 | }, 73 | }, 74 | ]) 75 | + table.standardOptions.withOverrides([ 76 | { 77 | matcher: { 78 | id: 'byName', 79 | options: 'Average rps', 80 | }, 81 | properties: [ 82 | { 83 | id: 'unit', 84 | value: 'reqps', 85 | }, 86 | { 87 | id: 'decimals', 88 | value: '2', 89 | }, 90 | { 91 | id: 'custom.align', 92 | value: null, 93 | }, 94 | ], 95 | }, 96 | { 97 | matcher: { 98 | id: 'byName', 99 | options: 'Average throughput', 100 | }, 101 | properties: [ 102 | { 103 | id: 'unit', 104 | value: 'reqps', 105 | }, 106 | { 107 | id: 'decimals', 108 | value: '2', 109 | }, 110 | { 111 | id: 'custom.align', 112 | value: null, 113 | }, 114 | ], 115 | }, 116 | { 117 | matcher: { 118 | id: 'byName', 119 | options: 'Average p99_latency', 120 | }, 121 | properties: [ 122 | { 123 | id: 'unit', 124 | value: 'µs', 125 | }, 126 | { 127 | id: 'decimals', 128 | value: '2', 129 | }, 130 | { 131 | id: 'custom.align', 132 | value: null, 133 | }, 134 | ], 135 | }, 136 | { 137 | matcher: { 138 | id: 'byName', 139 | options: 'Average req_latency', 140 | }, 141 | properties: [ 142 | { 143 | 
id: 'unit', 144 | value: 'µs', 145 | }, 146 | { 147 | id: 'decimals', 148 | value: '2', 149 | }, 150 | { 151 | id: 'custom.align', 152 | value: null, 153 | }, 154 | ], 155 | }, 156 | { 157 | matcher: { 158 | id: 'byName', 159 | options: 'Average bytes_in', 160 | }, 161 | properties: [ 162 | { 163 | id: 'unit', 164 | value: 'bps', 165 | }, 166 | { 167 | id: 'decimals', 168 | value: '2', 169 | }, 170 | { 171 | id: 'custom.align', 172 | value: null, 173 | }, 174 | ], 175 | }, 176 | { 177 | matcher: { 178 | id: 'byName', 179 | options: 'Average bytes_out', 180 | }, 181 | properties: [ 182 | { 183 | id: 'unit', 184 | value: 'bps', 185 | }, 186 | { 187 | id: 'decimals', 188 | value: '2', 189 | }, 190 | { 191 | id: 'custom.align', 192 | value: null, 193 | }, 194 | ], 195 | }, 196 | ]), 197 | }, 198 | } 199 | -------------------------------------------------------------------------------- /assets/k8s-perf/queries.libsonnet: -------------------------------------------------------------------------------- 1 | local variables = import './variables.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | 4 | local generateTimeSeriesQuery(query, legend) = [ 5 | local prometheusQuery = g.query.prometheus; 6 | prometheusQuery.new('$' + variables.Datasource.name, query) 7 | + prometheusQuery.withFormat('time_series') 8 | + prometheusQuery.withIntervalFactor(2) 9 | + prometheusQuery.withLegendFormat(legend), 10 | ]; 11 | 12 | { 13 | currentNodeCount: { 14 | query(): 15 | generateTimeSeriesQuery('sum(kube_node_info{})', 'Number of nodes') 16 | + generateTimeSeriesQuery('sum(kube_node_status_condition{status="true"}) by (condition) > 0', 'Node: {{ condition }}'), 17 | }, 18 | 19 | currentNamespaceCount: { 20 | query(): 21 | generateTimeSeriesQuery('sum(kube_namespace_status_phase) by (phase)', '{{ phase }}'), 22 | }, 23 | 24 | currentPodCount: { 25 | query(): 26 | generateTimeSeriesQuery('sum(kube_pod_status_phase{}) by (phase) > 0', 
'{{ phase}} Pods'),
  },

  // Cluster-wide object counts. Each query() returns the list of Prometheus
  // targets built by generateTimeSeriesQuery(expression, legendFormat).

  // Total node count plus one series per node condition that is currently true.
  numberOfNodes: {
    query():
      generateTimeSeriesQuery('sum(kube_node_info{})', 'Number of nodes')
      + generateTimeSeriesQuery('sum(kube_node_status_condition{status="true"}) by (condition) > 0', 'Node: {{ condition }}'),
  },

  // Namespaces per phase; phases with a zero count are filtered out.
  namespaceCount: {
    query():
      generateTimeSeriesQuery('sum(kube_namespace_status_phase) by (phase) > 0', '{{ phase }} namespaces'),
  },

  // Pods per phase (zero-count phases are kept).
  podCount: {
    query():
      generateTimeSeriesQuery('sum(kube_pod_status_phase{}) by (phase)', '{{phase}} pods'),
  },

  secretAndConfigMapCount: {
    query():
      generateTimeSeriesQuery('count(kube_secret_info{})', 'secrets')
      + generateTimeSeriesQuery('count(kube_configmap_info{})', 'Configmaps'),
  },

  deployCount: {
    query():
      generateTimeSeriesQuery('count(kube_deployment_labels{})', 'Deployments'),
  },

  serviceCount: {
    query():
      generateTimeSeriesQuery('count(kube_service_info{})', 'Services'),
  },

  // Ten highest container RSS series across all namespaces.
  top10ContainerRSS: {
    query():
      generateTimeSeriesQuery('topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', '{{ namespace }} - {{ name }}'),
  },

  // Ten highest container CPU usage series, as irate over $interval scaled by 100.
  top10ContainerCPU: {
    query():
      generateTimeSeriesQuery('topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)', '{{ namespace }} - {{ name }}'),
  },

  // Ten (job, instance) pairs with the most goroutines.
  goroutinesCount: {
    query():
      generateTimeSeriesQuery('topk(10, sum(go_goroutines{}) by (job,instance))', '{{ job }} - {{ instance }}'),
  },

  // Pods per node. The aggregation keeps only the exported_node label, so the
  // legend has to reference that label; '{{ node }}' would render empty.
  podDistribution: {
    query():
      generateTimeSeriesQuery('count(kube_pod_info{}) by (exported_node)', '{{ exported_node }}'),
  },

  // Per-node queries. nodeName is spliced into a node=~"..." regex matcher,
  // so it may be a literal node name, a regex, or a dashboard variable
  // reference.

  // CPU usage per (instance, mode), as rate over $interval scaled by 100.
  basicCPU: {
    query(nodeName):
      generateTimeSeriesQuery('sum by (instance, mode)(rate(node_cpu_seconds_total{node=~"' + nodeName + '",job=~".*"}[$interval])) * 100', 'Busy {{mode}}'),
  },

  // Active, total, cached + buffers and available memory in bytes.
  systemMemory: {
    query(nodeName):
      generateTimeSeriesQuery('node_memory_Active_bytes{node=~"' + nodeName + '"}', 'Active')
      + generateTimeSeriesQuery('node_memory_MemTotal_bytes{node=~"' + nodeName + '"}', 'Total')
      + generateTimeSeriesQuery('node_memory_Cached_bytes{node=~"' + nodeName + '"} + node_memory_Buffers_bytes{node=~"' + nodeName + '"}', 'Cached + Buffers')
      + generateTimeSeriesQuery('node_memory_MemAvailable_bytes{node=~"' + nodeName + '"}', 'Available'),
  },

  // Read/write byte rates for the devices selected by $block_device.
  diskThroughput: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_disk_read_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - read')
      + generateTimeSeriesQuery('rate(node_disk_written_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - write'),
  },

  // Completed read/write operation rates for the devices selected by $block_device.
  diskIOPS: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_disk_reads_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - read')
      + generateTimeSeriesQuery('rate(node_disk_writes_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - write'),
  },

  // RX/TX byte rates multiplied by 8 (bits per second) for the devices
  // selected by $net_device.
  networkUtilization: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_network_receive_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', '{{instance}} - {{device}} - RX')
      + generateTimeSeriesQuery('rate(node_network_transmit_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', '{{instance}} - {{device}} - TX'),
  },

  // RX/TX packet rates for the devices selected by $net_device.
  networkPackets: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_network_receive_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', '{{instance}} - {{device}} - RX')
      + generateTimeSeriesQuery('rate(node_network_transmit_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', '{{instance}} - {{device}} - TX'),
  },
| 117 | networkDrop: { 118 | query(nodeName): 119 | generateTimeSeriesQuery('topk(10, rate(node_network_receive_drop_total{node=~"' + nodeName + '"}[$interval]))', 'rx-drop-{{ device }}') 120 | + generateTimeSeriesQuery('topk(10,rate(node_network_transmit_drop_total{node=~"' + nodeName + '"}[$interval]))', 'tx-drop-{{ device }}'), 121 | }, 122 | 123 | conntrackStats: { 124 | query(nodeName): 125 | generateTimeSeriesQuery('node_nf_conntrack_entries{node=~"' + nodeName + '"}', 'conntrack_entries') 126 | + generateTimeSeriesQuery('node_nf_conntrack_entries_limit{node=~"' + nodeName + '"}', 'conntrack_limit'), 127 | }, 128 | 129 | top10ContainersCPU: { 130 | query(nodeName): 131 | generateTimeSeriesQuery('topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"}[$interval])) by (pod,container,namespace,name,service) * 100)', '{{ pod }}: {{ container }}'), 132 | }, 133 | 134 | top10ContainersRSS: { 135 | query(nodeName): 136 | generateTimeSeriesQuery('topk(10, container_memory_rss{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', '{{ pod }}: {{ container }}'), 137 | }, 138 | } 139 | -------------------------------------------------------------------------------- /assets/uperf/queries.libsonnet: -------------------------------------------------------------------------------- 1 | local variables = import './variables.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local elasticsearch = g.query.elasticsearch; 4 | 5 | { 6 | throughput: { 7 | query(): 8 | elasticsearch.withAlias(null) 9 | + elasticsearch.withBucketAggs([ 10 | elasticsearch.bucketAggs.DateHistogram.withField('uperf_ts') 11 | + elasticsearch.bucketAggs.DateHistogram.withId('2') 12 | + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram') 13 | + 
elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto') 14 | + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount(0) 15 | + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone('utc') 16 | + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(null), 17 | ]) 18 | + elasticsearch.withMetrics([ 19 | elasticsearch.metrics.MetricAggregationWithSettings.Sum.withField('norm_byte') 20 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.withId('1') 21 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.withType('sum') 22 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.settings.script.withInline('_value * 8'), 23 | ]) 24 | + elasticsearch.withQuery('uuid: $uuid AND cluster_name: $cluster_name AND user: $user AND iteration: $iteration AND remote_ip: $server AND message_size: $message_size AND test_type: $test_type AND protocol: $protocol AND num_threads: $threads') 25 | + elasticsearch.withTimeField('uperf_ts'), 26 | }, 27 | 28 | operations: { 29 | query(): 30 | elasticsearch.withAlias(null) 31 | + elasticsearch.withBucketAggs([ 32 | elasticsearch.bucketAggs.DateHistogram.withField('uperf_ts') 33 | + elasticsearch.bucketAggs.DateHistogram.withId('2') 34 | + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram') 35 | + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto') 36 | + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount(0) 37 | + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone('utc') 38 | + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(null), 39 | ]) 40 | + elasticsearch.withMetrics([ 41 | elasticsearch.metrics.MetricAggregationWithSettings.Sum.withField('norm_ops') 42 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.withId('1') 43 | + elasticsearch.metrics.MetricAggregationWithSettings.Sum.withType('sum'), 44 | ]) 45 | + elasticsearch.withQuery('uuid: $uuid AND user: $user AND iteration: $iteration AND remote_ip: 
$server AND message_size: $message_size AND test_type: $test_type AND protocol: $protocol AND num_threads: $threads') 46 | + elasticsearch.withTimeField('uperf_ts'), 47 | }, 48 | 49 | results: { 50 | query(): 51 | elasticsearch.withAlias(null) 52 | + elasticsearch.withBucketAggs([ 53 | elasticsearch.bucketAggs.Terms.withField('test_type.keyword') 54 | + elasticsearch.bucketAggs.Terms.withId('3') 55 | + elasticsearch.bucketAggs.Terms.withType('terms') 56 | + elasticsearch.bucketAggs.Terms.settings.withOrder('desc') 57 | + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') 58 | + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1) 59 | + elasticsearch.bucketAggs.Terms.settings.withSize('10'), 60 | elasticsearch.bucketAggs.Terms.withField('protocol.keyword') 61 | + elasticsearch.bucketAggs.Terms.withId('4') 62 | + elasticsearch.bucketAggs.Terms.withType('terms') 63 | + elasticsearch.bucketAggs.Terms.settings.withOrder('desc') 64 | + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') 65 | + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1) 66 | + elasticsearch.bucketAggs.Terms.settings.withSize('10'), 67 | elasticsearch.bucketAggs.Terms.withField('num_threads') 68 | + elasticsearch.bucketAggs.Terms.withId('5') 69 | + elasticsearch.bucketAggs.Terms.withType('terms') 70 | + elasticsearch.bucketAggs.Terms.settings.withOrder('desc') 71 | + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') 72 | + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1) 73 | + elasticsearch.bucketAggs.Terms.settings.withSize('10'), 74 | elasticsearch.bucketAggs.Terms.withField('message_size') 75 | + elasticsearch.bucketAggs.Terms.withId('2') 76 | + elasticsearch.bucketAggs.Terms.withType('terms') 77 | + elasticsearch.bucketAggs.Terms.settings.withOrder('desc') 78 | + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') 79 | + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1) 80 | + 
elasticsearch.bucketAggs.Terms.settings.withSize('10'), 81 | 82 | ]) 83 | + elasticsearch.withMetrics([ 84 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('norm_byte') 85 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('1') 86 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg') 87 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.settings.script.withInline('_value * 8'), 88 | 89 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('norm_ops') 90 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('6') 91 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'), 92 | 93 | elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('norm_ltcy') 94 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('7') 95 | + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'), 96 | 97 | elasticsearch.metrics.MetricAggregationWithSettings.UniqueCount.withType('count') 98 | + elasticsearch.metrics.MetricAggregationWithSettings.UniqueCount.withId('8') 99 | + elasticsearch.metrics.MetricAggregationWithSettings.UniqueCount.withField('select field'), 100 | ]) 101 | + elasticsearch.withQuery('uuid: $uuid AND user: $user AND iteration: $iteration AND remote_ip: $server AND message_size: $message_size AND test_type: $test_type AND protocol: $protocol AND NOT norm_ops:0') 102 | + elasticsearch.withTimeField('uperf_ts'), 103 | }, 104 | } 105 | -------------------------------------------------------------------------------- /templates/CPT/acs-perf.jsonnet: -------------------------------------------------------------------------------- 1 | /* 2 | * Generated by Claude Code 3 | */ 4 | local panels = import '../../assets/acs-perf/panels.libsonnet'; 5 | local queries = import '../../assets/acs-perf/queries.libsonnet'; 6 | local variables = import '../../assets/acs-perf/variables.libsonnet'; 7 | 
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 8 | 9 | g.dashboard.new('Kube-burner report for ACS') 10 | + g.dashboard.withDescription(||| 11 | Dashboard for ACS (Advanced Cluster Security) performance testing with kube-burner 12 | |||) 13 | + g.dashboard.withTags(['kube-burner', 'acs', 'performance']) 14 | + g.dashboard.time.withFrom('now-30d') 15 | + g.dashboard.time.withTo('now') 16 | + g.dashboard.withTimezone('utc') 17 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 18 | + g.dashboard.withRefresh('') 19 | + g.dashboard.withEditable(true) 20 | + g.dashboard.graphTooltip.withSharedCrosshair() 21 | + g.dashboard.withVariables([ 22 | variables.Datasource, 23 | variables.platform, 24 | variables.sdn, 25 | variables.clusterType, 26 | variables.job, 27 | variables.workerNodesCount, 28 | variables.ocpMajorVersion, 29 | variables.run_date, 30 | variables.acs_version, 31 | variables.uuid_from_date, 32 | variables.uuid, 33 | variables.uuid_bkp, 34 | variables.compare_by, 35 | variables.acs_component, 36 | variables.node_roles, 37 | ]) 38 | + g.dashboard.withPanels([ 39 | // Job Summary Tables 40 | panels.table.withJobSummary('', '', [queries.jobSummary.query()], { x: 0, y: 0, w: 24, h: 6 }), 41 | panels.table.withJobDetails('', '', [queries.jobSummary.query()], { x: 0, y: 6, w: 24, h: 5 }), 42 | 43 | // ACS Version Information Table 44 | panels.table.withACSVersionInfo('ACS Version Information', '', [queries.acsVersionInfo.query()], { x: 0, y: 11, w: 24, h: 5 }), 45 | 46 | // ACS Stats Row 47 | g.panel.row.new('ACS Stats') 48 | + g.panel.row.withGridPos({ x: 0, y: 16, w: 24, h: 1 }), 49 | panels.barChart.withACSQueueDepth('Central Deployment Queue Depth', '', [ 50 | queries.centralDeploymentQueueDepth.addOperations(), 51 | queries.centralDeploymentQueueDepth.removeOperations(), 52 | ], { x: 0, y: 17, w: 10, h: 6 }), 53 | panels.barChart.withSensorDrops('Sensor Process 
Enrichment Drops', '', [queries.sensorProcessEnrichmentDrops.query()], { x: 10, y: 17, w: 10, h: 6 }), 54 | 55 | // ACS Component Performance (Repeating Row) 56 | g.panel.row.new('$acs_component') 57 | + g.panel.row.withGridPos({ x: 0, y: 23, w: 24, h: 1 }) 58 | + g.panel.row.withRepeat('acs_component'), 59 | panels.barChart.withACSMemoryUsage('Max WSS Usage $acs_component', 'bytes', [queries.acsComponentMemory.maxWSS()], { x: 0, y: 24, w: 10, h: 5 }), 60 | panels.barChart.withACSCPUUsage('Max CPU Usage $acs_component', 'cores', [queries.acsComponentCPU.maxCPU()], { x: 10, y: 24, w: 10, h: 5 }), 61 | panels.barChart.withP90Memory('p90 WSS Usage $acs_component', 'bytes', [queries.acsComponentMemory.p90WSS()], { x: 0, y: 29, w: 10, h: 5 }), 62 | panels.barChart.withP90CPU('p90 CPU Usage $acs_component', 'cores', [queries.acsComponentCPU.p90CPU()], { x: 10, y: 29, w: 10, h: 5 }), 63 | 64 | // Node Usage (Collapsed Row) 65 | g.panel.row.new('Node Usage') 66 | + g.panel.row.withGridPos({ x: 0, y: 56, w: 24, h: 1 }) 67 | + g.panel.row.withCollapsed(true) 68 | + g.panel.row.withPanels([ 69 | panels.barGauge.withNodeCPUUsage('$workerNodesCount nodes - CPU usage $node_roles', 'cores', [queries.nodeCPUUsage.query()], { x: 0, y: 29, w: 8, h: 4 }), 70 | panels.barGauge.withNodeCPUUsage('Maximum CPU usage $node_roles', 'cores', [queries.nodeCPUUsage.maxCPU()], { x: 0, y: 33, w: 8, h: 4 }), 71 | panels.barGauge.withNodeMemoryUsage('$workerNodesCount nodes - Memory usage $node_roles', 'bytes', [queries.nodeMemoryUsage.query()], { x: 0, y: 37, w: 8, h: 4 }), 72 | panels.barGauge.withNodeMemoryUsage('$workerNodesCount nodes - Maximum aggregated memory usage $node_roles', 'bytes', [queries.nodeMemoryUsage.maxAggregated()], { x: 0, y: 41, w: 8, h: 4 }), 73 | panels.barChart.withClusterUsageRatio('Max Cluster CPU usage ratio', '', [queries.clusterUsageRatio.maxCPU()], { x: 0, y: 45, w: 12, h: 6 }), 74 | panels.barChart.withClusterUsageRatio('Max Cluster memory usage ratio', '', 
[queries.clusterUsageRatio.maxMemory()], { x: 12, y: 45, w: 12, h: 6 }), 75 | ]), 76 | 77 | // Pod & Service Latency (Collapsed Row) 78 | g.panel.row.new('Pod & Service ready latency') 79 | + g.panel.row.withGridPos({ x: 0, y: 57, w: 24, h: 1 }) 80 | + g.panel.row.withCollapsed(true) 81 | + g.panel.row.withPanels([ 82 | panels.barGauge.withPodLatency('$workerNodesCount nodes - P99 Pod ready latency', 'ms', [queries.podLatency.p99Ready()], { x: 0, y: 30, w: 12, h: 6 }), 83 | panels.barGauge.withServiceLatency('$workerNodesCount nodes - P99 Service ready latency', 'ns', [queries.serviceLatency.p99Ready()], { x: 12, y: 30, w: 12, h: 6 }), 84 | ]), 85 | 86 | // API Latency (Collapsed Row) 87 | g.panel.row.new('API latency') 88 | + g.panel.row.withGridPos({ x: 0, y: 58, w: 24, h: 1 }) 89 | + g.panel.row.withCollapsed(true) 90 | + g.panel.row.withPanels([ 91 | panels.barChart.withAPILatency('Read Only API request P99 latency - resource scoped', 's', [queries.apiLatency.readOnlyResource()], { x: 0, y: 31, w: 12, h: 6 }), 92 | panels.barChart.withAPILatency('Maximum Read Only API request P99 latency - resource scoped', 's', [queries.apiLatency.maxReadOnlyResource()], { x: 12, y: 31, w: 12, h: 6 }), 93 | panels.barChart.withAPILatency('Read Only API request P99 latency - namespace scoped', 's', [queries.apiLatency.readOnlyNamespace()], { x: 0, y: 37, w: 12, h: 6 }), 94 | panels.barChart.withAPILatency('Maximum Read Only API request P99 latency - namespace scoped', 's', [queries.apiLatency.maxReadOnlyNamespace()], { x: 12, y: 37, w: 12, h: 6 }), 95 | panels.barChart.withAPILatency('Read Only API request P99 latency - cluster scoped', 's', [queries.apiLatency.readOnlyCluster()], { x: 0, y: 43, w: 12, h: 6 }), 96 | panels.barChart.withAPILatency('Maximum Read Only API request P99 latency - cluster scoped', 's', [queries.apiLatency.maxReadOnlyCluster()], { x: 12, y: 43, w: 12, h: 6 }), 97 | panels.barChart.withAPILatency('Mutating API request P99 latency', 's', 
[queries.apiLatency.mutating()], { x: 0, y: 49, w: 12, h: 6 }), 98 | panels.barChart.withAPILatency('Maximum Mutating API request P99 latency', 's', [queries.apiLatency.maxMutating()], { x: 12, y: 49, w: 12, h: 6 }), 99 | ]), 100 | ]) 101 | -------------------------------------------------------------------------------- /assets/pgbench-dashboard/panels.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | { 4 | timeSeries: { 5 | local timeSeries = g.panel.timeSeries, 6 | local custom = timeSeries.fieldConfig.defaults.custom, 7 | local options = timeSeries.options, 8 | 9 | base(title, unit, targets, gridPos): 10 | timeSeries.new(title) 11 | + timeSeries.queryOptions.withTargets(targets) 12 | + timeSeries.datasource.withType('elasticsearch') 13 | + timeSeries.datasource.withUid('$Datasource1') 14 | + timeSeries.standardOptions.withUnit(unit) 15 | + timeSeries.gridPos.withX(gridPos.x) 16 | + timeSeries.gridPos.withY(gridPos.y) 17 | + timeSeries.gridPos.withH(gridPos.h) 18 | + timeSeries.gridPos.withW(gridPos.w) 19 | + custom.withDrawStyle('line') 20 | + custom.withLineInterpolation('linear') 21 | + custom.withBarAlignment(0) 22 | + custom.withFillOpacity(10) 23 | + custom.withGradientMode('none') 24 | + custom.withSpanNulls(false) 25 | + custom.withPointSize(5) 26 | + custom.withSpanNulls(false) 27 | + custom.stacking.withGroup('A') 28 | + custom.stacking.withMode('none') 29 | + custom.withShowPoints('never') 30 | + timeSeries.queryOptions.withTimeFrom(null) 31 | + timeSeries.queryOptions.withTimeShift(null) 32 | + timeSeries.panelOptions.withTransparent(true), 33 | 34 | tps_report(title, unit, targets, gridPos): 35 | self.base(title, unit, targets, gridPos) 36 | + custom.withLineWidth(2) 37 | + options.tooltip.withMode('multi') 38 | + options.legend.withShowLegend(false) 39 | + options.legend.withDisplayMode('list') 40 | + 
options.legend.withPlacement('bottom'), 41 | 42 | 43 | /* base drawn as bars, with a table legend at the bottom listing mean, max and min. */ avg_tps(title, unit, targets, gridPos): 44 | self.base(title, unit, targets, gridPos) 45 | + options.legend.withShowLegend(true) 46 | + options.legend.withDisplayMode('table') 47 | + options.legend.withCalcs([ 48 | 'mean', 49 | 'max', 50 | 'min', 51 | ]) 52 | + options.legend.withPlacement('bottom') 53 | + custom.withDrawStyle('bars') 54 | + custom.withLineInterpolation('linear'), 55 | }, 56 | 57 | /* Heatmap panel builders. */ heatmap: { 58 | local heatmap = g.panel.heatmap, 59 | local custom = heatmap.fieldConfig.defaults.custom, 60 | local options = heatmap.options, 61 | 62 | /* Heatmap on the elasticsearch datasource '$Datasource1' using the 'Oranges' colour scheme. The standard unit comes from `unit`, while the y axis unit is fixed to 'ms'. */ base(title, unit, targets, gridPos): 63 | heatmap.new(title) 64 | + heatmap.queryOptions.withTargets(targets) 65 | + heatmap.datasource.withType('elasticsearch') 66 | + heatmap.datasource.withUid('$Datasource1') 67 | + heatmap.standardOptions.withUnit(unit) 68 | + heatmap.gridPos.withX(gridPos.x) 69 | + heatmap.gridPos.withY(gridPos.y) 70 | + heatmap.gridPos.withH(gridPos.h) 71 | + heatmap.gridPos.withW(gridPos.w) 72 | + custom.scaleDistribution.withType('linear') 73 | + custom.hideFrom.withLegend(false) 74 | + custom.hideFrom.withTooltip(false) 75 | + custom.hideFrom.withViz(false) 76 | + options.withCalculate(true) 77 | + options.yAxis.withAxisPlacement('left') 78 | + options.yAxis.withReverse(false) 79 | + options.yAxis.withUnit('ms') 80 | + options.rowsFrame.withLayout('auto') 81 | + options.color.HeatmapColorOptions.withMode('scheme') 82 | + options.color.HeatmapColorOptions.withFill('dark-orange') 83 | + options.color.HeatmapColorOptions.withScale('exponential') 84 | + options.color.HeatmapColorOptions.withExponent(0.5) 85 | + options.color.HeatmapColorOptions.withScheme('Oranges') 86 | + options.color.HeatmapColorOptions.withSteps(128) 87 | + options.color.HeatmapColorOptions.withReverse(false) 88 | + options.withCellGap(2) 89 | + options.filterValues.FilterValueRange.withLe(1e-9) 90 | + options.tooltip.withShow(false) 91 | + 
options.tooltip.withYHistogram(false) 92 | + options.legend.withShow(true) 93 | + options.exemplars.withColor('rgba(255,0,255,0.7)') 94 | + options.withShowValue('never') 95 | + heatmap.panelOptions.withTransparent(true), 96 | }, 97 | 98 | /* Table panel builders. */ table: { 99 | local table = g.panel.table, 100 | local custom = table.fieldConfig.defaults.custom, 101 | local options = table.options, 102 | 103 | /* Table on the elasticsearch datasource '$Datasource1'. Applies the 'seriesToColumns' transformation and overrides the 'Average latency_ms' and 'Average tps' columns with shorter display names and 2 decimals. Takes no unit argument. */ base(title, targets, gridPos): 104 | table.new(title) 105 | + table.queryOptions.withTargets(targets) 106 | + table.datasource.withType('elasticsearch') 107 | + table.datasource.withUid('$Datasource1') 108 | + table.gridPos.withX(gridPos.x) 109 | + table.gridPos.withY(gridPos.y) 110 | + table.gridPos.withH(gridPos.h) 111 | + table.gridPos.withW(gridPos.w) 112 | + options.withShowHeader(true) 113 | + options.footer.TableFooterOptions.withShow(false) 114 | + options.footer.TableFooterOptions.withReducer('sum') 115 | + options.footer.TableFooterOptions.withCountRows(false) 116 | + custom.withAlign('auto') 117 | + custom.withInspect(false) 118 | + table.panelOptions.withTransparent(true) 119 | + table.queryOptions.withTimeFrom(null) 120 | + table.queryOptions.withTimeShift(null) 121 | + table.standardOptions.color.withMode('thresholds') 122 | + table.queryOptions.withTransformations([ 123 | { 124 | id: 'seriesToColumns', 125 | options: { 126 | reducers: [], 127 | }, 128 | }, 129 | ]) 130 | + table.standardOptions.withOverrides([ 131 | { 132 | matcher: { 133 | id: 'byName', 134 | options: 'Average latency_ms', 135 | }, 136 | properties: [ 137 | { 138 | id: 'displayName', 139 | value: 'Avg latency', 140 | }, 141 | { 142 | id: 'decimals', 143 | value: '2', 144 | }, 145 | { 146 | id: 'custom.align', 147 | value: null, 148 | }, 149 | ], 150 | }, 151 | { 152 | matcher: { 153 | id: 'byName', 154 | options: 'Average tps', 155 | }, 156 | properties: [ 157 | { 158 | id: 'displayName', 159 | value: 'Avg TPS', 160 | }, 161 | { 162 | id: 'decimals', 163 | value: '2', 164 | }, 165 | { 166 | id: 
'custom.align', 167 | value: null, 168 | }, 169 | ], 170 | }, 171 | ]), 172 | }, 173 | } 174 | -------------------------------------------------------------------------------- /templates/General/ovn-dashboard.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/ovn-monitoring/panels.libsonnet'; 2 | local queries = import '../../assets/ovn-monitoring/queries.libsonnet'; 3 | local variables = import '../../assets/ovn-monitoring/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | /* 'Openshift Networking' dashboard: UTC timezone, default range now-1h to now, not editable, shared crosshair, panels grouped into collapsed rows. */ g.dashboard.new('Openshift Networking') 7 | + g.dashboard.time.withFrom('now-1h') 8 | + g.dashboard.time.withTo('now') 9 | + g.dashboard.withTimezone('utc') 10 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 11 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 12 | + g.dashboard.withRefresh('') 13 | + g.dashboard.withEditable(false) 14 | + g.dashboard.graphTooltip.withSharedCrosshair() 15 | + g.dashboard.withVariables([ 16 | variables.Datasource, 17 | variables._master_node, 18 | variables._worker_node, 19 | variables.master_pod, 20 | variables.kubenode_pod, 21 | ]) 22 | 23 | 24 | + g.dashboard.withPanels([ 25 | /* Row 1: three stat panels (w: 8, h: 4) above two control-plane usage time series (w: 12, h: 10). */ g.panel.row.new('OVN Resource Monitoring') 26 | + g.panel.row.withCollapsed(true) 27 | + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) 28 | + g.panel.row.withPanels([ 29 | panels.stat.genericstatThresoldPanel('OVNKube Cluster Manager Leader', 'none', queries.ovnClusterManagerLeader.query(), { x: 0, y: 0, w: 8, h: 4 }), 30 | panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 8, y: 0, w: 8, h: 4 }), 31 | panels.stat.genericstatThresoldOVNControllerPanel('OVN Controller Count', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 8, h: 4 }), 32 | 
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Control Plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 4, w: 12, h: 10 }), 33 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Control Plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 4, w: 12, h: 10 }), 34 | ]), 35 | /* Row 2: latency time series in seconds, two per grid line (w: 12, h: 10), with y stepping by 10. */ g.panel.row.new('Pod Startup Latency Breakdown') 36 | + g.panel.row.withCollapsed(true) 37 | + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) 38 | + g.panel.row.withPanels([ 39 | panels.timeSeries.genericTimeSeriesLegendPanel('Scheduler Pod Scheduling Duration (P99)', 's', queries.podSchedulingLatency.query(), { x: 0, y: 0, w: 12, h: 10 }), 40 | panels.timeSeries.genericTimeSeriesLegendPanel('Pod First Seen to LSP Created Latency (P99)', 's', queries.firstSeenToLSPCreated.query(), { x: 12, y: 0, w: 12, h: 10 }), 41 | panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency (P99)', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 10, w: 12, h: 10 }), 42 | panels.timeSeries.genericTimeSeriesLegendPanel('Port Binding After LSP Creation Latency (P99)', 's', queries.lspCreated.query(), { x: 12, y: 10, w: 12, h: 10 }), 43 | panels.timeSeries.genericTimeSeriesLegendPanel('Port Binding to Chassis Assignment Latency (P99)', 's', queries.lspToChassis.query(), { x: 0, y: 20, w: 12, h: 10 }), 44 | panels.timeSeries.genericTimeSeriesLegendPanel('Port Marked As Up (P99)', 's', queries.portMarkedAsUp.query(), { x: 12, y: 20, w: 12, h: 10 }), 45 | panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency (P99)', 's', queries.ovnCNIAdd.query(), { x: 0, y: 30, w: 12, h: 10 }), 46 | panels.timeSeries.genericTimeSeriesLegendPanel('Network Programming Complete (P99)', 's', queries.networkProgrammingComplete.query(), { x: 12, y: 30, w: 12, h: 10 }), 47 | panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 40, w: 12, h: 10 }), 48 | 
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 12, y: 40, w: 12, h: 10 }), 49 | ]), 50 | /* Row 3: CPU (percent) and memory (bytes) panels side by side. Every panel is h: 10, so y advances by 10 per pair to keep declared grid positions from overlapping. */ g.panel.row.new('OVN Component Resource Usage') 51 | + g.panel.row.withCollapsed(true) 52 | + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) 53 | + g.panel.row.withPanels([ 54 | /* Worker node pod resource usage */ 55 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Pods CPU Usage (Top 10)', 'percent', queries.topOvnkubenodePodCPU.query(), { x: 0, y: 0, w: 12, h: 10 }), 56 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Pods Memory Usage (Top 10)', 'bytes', queries.topOvnkubenodePodMem.query(), { x: 12, y: 0, w: 12, h: 10 }), 57 | 58 | /* Component resource usage */ 59 | panels.timeSeries.genericTimeSeriesLegendPanel('Northd CPU Usage (Top 10)', 'percent', queries.topNorthdCPU.query(), { x: 0, y: 10, w: 12, h: 10 }), 60 | panels.timeSeries.genericTimeSeriesLegendPanel('Northd Memory Usage (Top 10)', 'bytes', queries.topNorthdMem.query(), { x: 12, y: 10, w: 12, h: 10 }), 61 | panels.timeSeries.genericTimeSeriesLegendPanel('Sbdb CPU Usage (Top 10)', 'percent', queries.topSbdbCPU.query(), { x: 0, y: 20, w: 12, h: 10 }), 62 | panels.timeSeries.genericTimeSeriesLegendPanel('Sbdb Memory Usage (Top 10)', 'bytes', queries.topSbdbMem.query(), { x: 12, y: 20, w: 12, h: 10 }), 63 | panels.timeSeries.genericTimeSeriesLegendPanel('Nbdb CPU Usage (Top 10)', 'percent', queries.topNbdbCPU.query(), { x: 0, y: 30, w: 12, h: 10 }), 64 | panels.timeSeries.genericTimeSeriesLegendPanel('Nbdb Memory Usage (Top 10)', 'bytes', queries.topNbdbMem.query(), { x: 12, y: 30, w: 12, h: 10 }), 65 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Controller CPU Usage (Top 10)', 'percent', queries.topOvnkubeControllerCPU.query(), { x: 0, y: 40, w: 12, h: 10 }), 66 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Controller Memory Usage (Top 10)', 'bytes', 
queries.topOvnkubeControllerMem.query(), { x: 12, y: 40, w: 12, h: 10 }), 67 | panels.timeSeries.genericTimeSeriesLegendPanel('OVN Controller CPU Usage (Top 10)', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 50, w: 12, h: 10 }), 68 | panels.timeSeries.genericTimeSeriesLegendPanel('OVN Controller Memory Usage (Top 10)', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 50, w: 12, h: 10 }), 69 | ]), 70 | /* Row 4: workqueue panels, two per grid line; the second line starts at y: 10 because the first line is h: 10. */ g.panel.row.new('WorkQueue Monitoring') 71 | + g.panel.row.withCollapsed(true) 72 | + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) 73 | + g.panel.row.withPanels([ 74 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue', 'short', queries.workQueue.query(), { x: 0, y: 0, w: 12, h: 10 }), 75 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue Depth', 'short', queries.workQueueDepth.query(), { x: 12, y: 0, w: 12, h: 10 }), 76 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue duration', 's', queries.workQueueLatency.query(), { x: 0, y: 10, w: 12, h: 10 }), 77 | panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue - Unfinished', 's', queries.workQueueUnfinishedLatency.query(), { x: 12, y: 10, w: 12, h: 10 }), 78 | ]), 79 | ]) 80 | -------------------------------------------------------------------------------- /templates/General/etcd-on-cluster-dashboard.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/etcd-on-cluster-dashboard/panels.libsonnet'; 2 | local queries = import '../../assets/etcd-on-cluster-dashboard/queries.libsonnet'; 3 | local variables = import '../../assets/etcd-on-cluster-dashboard/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | g.dashboard.new('etcd-cluster-info dashboard') 7 | + g.dashboard.time.withFrom('now-1h') 8 | + g.dashboard.time.withTo('now') 9 | + g.dashboard.withTimezone('utc') 
10 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 11 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 12 | + g.dashboard.withRefresh('') 13 | + g.dashboard.withEditable(false) 14 | + g.dashboard.graphTooltip.withSharedCrosshair() 15 | + g.dashboard.withVariables([ 16 | variables.Datasource, 17 | variables.etcd_pod, 18 | ]) 19 | 20 | + g.dashboard.withPanels([ 21 | /* Every row in this dashboard is collapsed and declares the same gridPos (x: 0, y: 14, w: 24, h: 1). */ g.panel.row.new('General Resource Usage') 22 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 23 | + g.panel.row.withCollapsed(true) 24 | + g.panel.row.withPanels([ 25 | panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }), 26 | panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }), 27 | panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }), 28 | panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendCommitDuration.query(), { x: 12, y: 8, w: 12, h: 8 }), 29 | panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }), 30 | panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }), 31 | ]), 32 | 33 | g.panel.row.new('Compact/Defrag Detailed') 34 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 35 | + g.panel.row.withCollapsed(true) 36 | + g.panel.row.withPanels([ 37 | panels.timeSeries.histogramStatsRightHand('Compaction Duration sum', 'none', queries.compactionDurationSum.query(), { x: 0, y: 0, w: 8, h: 8 }, 'sum'), 38 | /* NOTE(review): titled "sum" but the trailing argument is 'count', unlike the 'sum' passed for "Compaction Duration sum"; confirm this is intended. */ panels.timeSeries.histogramStatsRightHand('Defrag Duration sum', 'none', queries.defragDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'count'), 39 | 
panels.timeSeries.histogramStatsRightHand('vmstat major page faults', 'none', queries.nodeVmstatPgmajfault.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), 40 | ]), 41 | 42 | /* Three panels per line (w: 8): a p99 panel followed by the matching sum and count panels. */ g.panel.row.new('WAL fsync Duration Detailed') 43 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 44 | + g.panel.row.withCollapsed(true) 45 | + g.panel.row.withPanels([ 46 | panels.timeSeries.generalUsageAgg('WAL fsync Duration p99', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 0, w: 8, h: 8 }), 47 | panels.timeSeries.histogramStatsRightHand('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), 48 | panels.timeSeries.histogramStatsRightHand('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), 49 | ]), 50 | 51 | g.panel.row.new('Backend Commit Duration Detailed') 52 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 53 | + g.panel.row.withCollapsed(true) 54 | + g.panel.row.withPanels([ 55 | panels.timeSeries.generalUsageAgg('Backend Commit Duration', 's', queries.diskBackendCommitDuration.query(), { x: 0, y: 0, w: 8, h: 8 }), 56 | panels.timeSeries.histogramStatsRightHand('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), 57 | panels.timeSeries.histogramStatsRightHand('Backend Commit Duration count', 'none', queries.diskBackendCommitDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), 58 | ]), 59 | 60 | g.panel.row.new('Network Usage') 61 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 62 | + g.panel.row.withCollapsed(true) 63 | + g.panel.row.withPanels([ 64 | panels.timeSeries.generalUsageAgg('Container network traffic', 'Bps', queries.containerNetworkTraffic.query(), { x: 0, y: 1, w: 12, h: 8 }), 65 | panels.timeSeries.generalUsageAgg('p99 peer to peer latency', 's', queries.p99PeerToPeerLatency.query(), { x: 12, y: 1, w: 12, h: 8 }), 66 | 
panels.timeSeries.generalUsageAgg('Peer network traffic', 'Bps', queries.peerNetworkTraffic.query(), { x: 0, y: 8, w: 12, h: 8 }), 67 | panels.timeSeries.generalUsageAgg('gRPC network traffic', 'Bps', queries.gRPCNetworkTraffic.query(), { x: 12, y: 8, w: 12, h: 8 }), 68 | panels.timeSeries.withoutCalcsAgg('Active Streams', '', queries.activeStreams.query(), { x: 0, y: 16, w: 12, h: 8 }), 69 | panels.timeSeries.withoutCalcsAgg('Snapshot duration', 's', queries.snapshotDuration.query(), { x: 12, y: 16, w: 12, h: 8 }), 70 | ]), 71 | 72 | g.panel.row.new('DB Info per Member') 73 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 74 | + g.panel.row.withCollapsed(true) 75 | + g.panel.row.withPanels([ 76 | panels.timeSeries.withoutCalcsAgg('% DB Space Used', 'percent', queries.dbSpaceUsed.query(), { x: 0, y: 8, w: 8, h: 8 }), 77 | panels.timeSeries.withoutCalcsAgg('DB Left capacity (with fragmented space)', 'bytes', queries.dbLeftCapacity.query(), { x: 8, y: 8, w: 8, h: 8 }), 78 | panels.timeSeries.withoutCalcsAgg('DB Size Limit (Backend-bytes)', 'bytes', queries.dbSizeLimit.query(), { x: 16, y: 8, w: 8, h: 8 }), 79 | ]), 80 | 81 | /* Mixes time-series panels with two small stat panels (w: 6, h: 2) at y: 8. */ g.panel.row.new('General Info') 82 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 83 | + g.panel.row.withCollapsed(true) 84 | + g.panel.row.withPanels([ 85 | panels.timeSeries.GeneralInfo('Raft Proposals', '', queries.raftProposals.query(), { x: 0, y: 1, w: 12, h: 8 }), 86 | panels.timeSeries.GeneralInfo('Number of leader changes seen', '', queries.numberOfLeaderChangesSeen.query(), { x: 12, y: 1, w: 12, h: 8 }), 87 | panels.stat.etcdLeader('Etcd has a leader?', 'none', queries.etcdHasALeader.query(), { x: 0, y: 8, w: 6, h: 2 }), 88 | panels.stat.failedProposalsSeen('Total number of failed proposals seen', 'none', queries.totalNumberOfProposalsSeen.query(), { x: 6, y: 8, w: 6, h: 2 }), 89 | panels.timeSeries.GeneralInfo('Keys', 'short', queries.keys.query(), { x: 12, y: 12, w: 12, h: 8 }), 90 | 
panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }), 91 | panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }), 92 | panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }), 93 | panels.timeSeries.generalCounter('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), 94 | panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }), 95 | ]), 96 | 97 | ]) 98 | -------------------------------------------------------------------------------- /assets/acs-perf/variables.libsonnet: -------------------------------------------------------------------------------- 1 | /* 2 | * Generated by Claude Code 3 | */ 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | local var = g.dashboard.variable; 6 | 7 | { 8 | /* Datasource picker of type 'elasticsearch', filtered by a kube-burner name regex; single-select with no "All" option. */ Datasource: 9 | var.datasource.new('Datasource', 'elasticsearch') 10 | + var.datasource.withRegex('/.*kube-burner.*/') 11 | + var.query.withRefresh(1) 12 | + var.query.selectionOptions.withIncludeAll(false) 13 | + var.query.selectionOptions.withMulti(false), 14 | 15 | /* Each query variable below runs a "terms" lookup through the Datasource variable; later variables filter on the values selected in earlier ones. */ platform: 16 | var.query.new('platform', '{"find": "terms", "field": "platform.keyword"}') 17 | + var.query.withDatasourceFromVariable(self.Datasource) 18 | + var.query.withRefresh(2) 19 | + var.query.selectionOptions.withMulti(true) 20 | + var.query.selectionOptions.withIncludeAll(false) 21 | + var.query.generalOptions.withLabel('Platform'), 22 | 23 | sdn: 24 | var.query.new('sdn', '{"find": "terms", "field": "sdnType.keyword", "query": "platform.keyword: $platform"}') 25 | + var.query.withDatasourceFromVariable(self.Datasource) 26 | + var.query.withRefresh(1) 27 | + var.query.selectionOptions.withMulti(true) 28 | + var.query.selectionOptions.withIncludeAll(false) 
29 | + var.query.generalOptions.withLabel('SDN type'), 30 | 31 | clusterType: 32 | var.query.new('clusterType', '{"find": "terms", "field": "clusterType.keyword", "query": "platform.keyword: $platform"}') 33 | + var.query.withDatasourceFromVariable(self.Datasource) 34 | + var.query.withRefresh(1) 35 | + var.query.selectionOptions.withMulti(true) 36 | + var.query.selectionOptions.withIncludeAll(true) 37 | + var.query.generalOptions.withLabel('clusterType'), 38 | 39 | /* Single-select job list; the lookup excludes jobConfig.name garbage-collection. */ job: 40 | var.query.new('job', '{"find": "terms", "field": "jobConfig.name.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND clusterType.keyword: $clusterType AND NOT jobConfig.name: garbage-collection"}') 41 | + var.query.withDatasourceFromVariable(self.Datasource) 42 | + var.query.withRefresh(1) 43 | + var.query.selectionOptions.withMulti(false) 44 | + var.query.selectionOptions.withIncludeAll(false) 45 | + var.query.generalOptions.withLabel('Job'), 46 | 47 | workerNodesCount: 48 | var.query.new('workerNodesCount', '{"find": "terms", "field": "workerNodesCount", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND clusterType.keyword: $clusterType"}') 49 | + var.query.withDatasourceFromVariable(self.Datasource) 50 | + var.query.withRefresh(1) 51 | + var.query.selectionOptions.withMulti(false) 52 | + var.query.selectionOptions.withIncludeAll(false) 53 | + var.query.generalOptions.withLabel('Workers'), 54 | 55 | ocpMajorVersion: 56 | var.query.new('ocpMajorVersion', '{"find": "terms", "field": "ocpMajorVersion.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND workerNodesCount: $workerNodesCount AND clusterType.keyword: $clusterType"}') 57 | + var.query.withDatasourceFromVariable(self.Datasource) 58 | + var.query.withRefresh(1) 59 | + var.query.selectionOptions.withMulti(true) 60 | + var.query.selectionOptions.withIncludeAll(true) 61 | + 
var.query.generalOptions.withLabel('OCP Major'), 62 | 63 | /* Looks up endTimestamp values for the current selection; hidden on the dashboard via showOnDashboard.withNothing(). */ run_date: 64 | var.query.new('run_date', '{"find": "terms", "field": "endTimestamp", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND workerNodesCount: $workerNodesCount AND ocpMajorVersion.keyword: $ocpMajorVersion AND clusterType.keyword: $clusterType"}') 65 | + var.query.withDatasourceFromVariable(self.Datasource) 66 | + var.query.withRefresh(2) 67 | + var.query.selectionOptions.withMulti(true) 68 | + var.query.selectionOptions.withIncludeAll(false) 69 | + var.query.generalOptions.withLabel('Run Date') 70 | + var.query.generalOptions.showOnDashboard.withNothing(), 71 | 72 | /* Looks up labels.central_version values from central_rox_central_info documents for the selected job and OCP major version. */ acs_version: 73 | var.query.new('acs_version', '{"find": "terms", "field": "labels.central_version.keyword", "query": "metricName.keyword: central_rox_central_info AND jobName.keyword: $job AND metadata.ocpMajorVersion.keyword: $ocpMajorVersion"}') 74 | + var.query.withDatasourceFromVariable(self.Datasource) 75 | + var.query.withRefresh(2) 76 | + var.query.selectionOptions.withMulti(true) 77 | + var.query.selectionOptions.withIncludeAll(false) 78 | + var.query.generalOptions.withLabel('ACS Version'), 79 | 80 | /* UUIDs narrowed by $run_date; hidden on the dashboard. */ uuid_from_date: 81 | var.query.new('uuid_from_date', '{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND workerNodesCount: $workerNodesCount AND ocpMajorVersion.keyword: $ocpMajorVersion AND clusterType.keyword: $clusterType AND endTimestamp:$run_date"}') 82 | + var.query.withDatasourceFromVariable(self.Datasource) 83 | + var.query.withRefresh(2) 84 | + var.query.selectionOptions.withMulti(true) 85 | + var.query.selectionOptions.withIncludeAll(true) 86 | + var.query.generalOptions.withLabel('UUID From Date') 87 | + var.query.generalOptions.showOnDashboard.withNothing(), 88 | 89 | /* UUIDs narrowed by $acs_version; hidden on the dashboard. */ uuid: 90 | var.query.new('uuid', '{"find": "terms", "field": "uuid.keyword", "query": "metricName.keyword: 
central_rox_central_info AND jobName.keyword: $job AND metadata.ocpMajorVersion.keyword: $ocpMajorVersion AND labels.central_version.keyword: $acs_version"}') 91 | + var.query.withDatasourceFromVariable(self.Datasource) 92 | + var.query.withRefresh(2) 93 | + var.query.selectionOptions.withMulti(true) 94 | + var.query.selectionOptions.withIncludeAll(true) 95 | + var.query.generalOptions.withLabel('UUID from Version') 96 | + var.query.generalOptions.showOnDashboard.withNothing(), 97 | 98 | /* UUID lookup with the same filters as uuid_from_date minus the run date; hidden on the dashboard. */ uuid_bkp: 99 | var.query.new('uuid_bkp', '{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND sdnType.keyword: $sdn AND jobConfig.name.keyword: $job AND workerNodesCount: $workerNodesCount AND ocpMajorVersion.keyword: $ocpMajorVersion AND clusterType.keyword: $clusterType"}') 100 | + var.query.withDatasourceFromVariable(self.Datasource) 101 | + var.query.withRefresh(2) 102 | + var.query.selectionOptions.withMulti(true) 103 | + var.query.selectionOptions.withIncludeAll(false) 104 | + var.query.generalOptions.withLabel('UUID') 105 | + var.query.generalOptions.showOnDashboard.withNothing(), 106 | 107 | /* Custom (static list) variables use the var.custom option builders throughout. compare_by has the single value 'uuid' and is hidden on the dashboard. */ compare_by: 108 | var.custom.new('compare_by', ['uuid']) 109 | + var.custom.generalOptions.withLabel('Compare by') 110 | + var.custom.generalOptions.showOnDashboard.withNothing() 111 | + var.custom.selectionOptions.withIncludeAll(false) 112 | + var.custom.selectionOptions.withMulti(false), 113 | 114 | /* Multi-select list of ACS component names with an "All" option. */ acs_component: 115 | var.custom.new('acs_component', ['central', 'central-db', 'sensor', 'collector', 'indexer', 'matcher', 'db', 'admission-control']) 116 | + var.custom.generalOptions.withLabel('ACS Component') 117 | + var.custom.selectionOptions.withIncludeAll(true) 118 | + var.custom.selectionOptions.withMulti(true), 119 | 120 | /* Multi-select node role list with an "All" option; hidden on the dashboard. */ node_roles: 121 | var.custom.new('node_roles', ['masters', 'workers', 'infra']) 122 | + var.custom.generalOptions.withLabel('Node roles') 123 | + var.custom.generalOptions.showOnDashboard.withNothing() 124 | + 
var.custom.selectionOptions.withIncludeAll(true) 125 | + var.custom.selectionOptions.withMulti(true), 126 | 127 | } 128 | -------------------------------------------------------------------------------- /templates/CPT/kube-burner-report-mode.jsonnet: -------------------------------------------------------------------------------- 1 | local panels = import '../../assets/kube-burner-report-mode/panels.libsonnet'; 2 | local queries = import '../../assets/kube-burner-report-mode/queries.libsonnet'; 3 | local variables = import '../../assets/kube-burner-report-mode/variables.libsonnet'; 4 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 5 | 6 | /* 'Kube-burner Report Mode' dashboard: UTC timezone, fixed absolute default time range, not editable, shared crosshair. */ g.dashboard.new('Kube-burner Report Mode') 7 | + g.dashboard.withDescription(||| 8 | Dashboard for kube-burner Mode 9 | |||) 10 | + g.dashboard.withTags('kube-burner') 11 | + g.dashboard.time.withFrom('2024-01-28 00:00:00') 12 | + g.dashboard.time.withTo('2024-01-29 23:59:59') 13 | + g.dashboard.withTimezone('utc') 14 | + g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) 15 | + g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) 16 | + g.dashboard.withRefresh('') 17 | + g.dashboard.withEditable(false) 18 | + g.dashboard.graphTooltip.withSharedCrosshair() 19 | + g.dashboard.withVariables([ 20 | variables.Datasource, 21 | variables.platform, 22 | variables.sdn, 23 | variables.clusterType, 24 | variables.job, 25 | variables.workerNodesCount, 26 | variables.ocpMajorVersion, 27 | variables.uuid, 28 | variables.compare_by, 29 | variables.component, 30 | variables.node_roles, 31 | ]) 32 | + g.dashboard.withPanels([ 33 | /* Two full-width tables stacked at the top; x starts at 0 so that x + w stays within the 24-column grid. */ panels.table.withPlatformOverview('', '', queries.platformOverview.query(), { x: 0, y: 0, w: 24, h: 6 }), 34 | panels.table.withJobSummary('', '', queries.jobSummary.query(), { x: 0, y: 6, w: 24, h: 5 }), 35 | g.panel.row.new('Node Usage') 36 | + g.panel.row.withGridPos({ x: 0, 
y: 14, w: 24, h: 1 }) 37 | + g.panel.row.withCollapsed(true) 38 | + g.panel.row.withPanels([ 39 | /* Node CPU (cores) and memory (bytes) bar gauges, followed by two cluster usage-ratio bar charts. */ panels.barGauge.withnodeCPUUsage('$workerNodesCount nodes - CPU usage $node_roles', 'cores', queries.nodeCPUusage.query(), { x: 0, y: 12, w: 24, h: 4 }), 40 | panels.barGauge.withnodeCPUUsage('Maximum CPU usage $node_roles', 'cores', queries.maximumCPUusage.query(), { x: 4, y: 16, w: 8, h: 4 }), 41 | panels.barGauge.withnodeMemoryUsage('$workerNodesCount nodes - Memory usage $node_roles', 'bytes', queries.masterMemoryUsage.query(), { x: 0, y: 20, w: 8, h: 4 }), 42 | panels.barGauge.withnodeMemoryUsage('$workerNodesCount nodes - Maximum aggregated memory usage $node_roles', 'bytes', queries.maximumAggregatedMemory.query(), { x: 0, y: 24, w: 8, h: 4 }), 43 | panels.barChart.maxClusterCPUusageRatio('Max Cluster CPU usage ratio', '', queries.maxClusterCPUusageRatio.query(), { x: 0, y: 28, w: 7, h: 6 }), 44 | panels.barChart.maxClusterCPUusageRatio('Max Cluster memory usage ratio', '', queries.maxClusterMemoryUsageratio.query(), { x: 7, y: 28, w: 7, h: 6 }), 45 | ]), 46 | g.panel.row.new('Pod & Service ready latency') 47 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 48 | + g.panel.row.withCollapsed(true) 49 | + g.panel.row.withPanels([ 50 | /* NOTE(review): the two panels below use different units ('ms' vs 'ns') and different y values (13 vs 35); confirm both are intended. */ panels.barGauge.withP99PodReadyLatency('P99 Pod ready latency', 'ms', queries.P99PodReadyLatency.query(), { x: 0, y: 13, w: 10, h: 6 }), 51 | panels.barGauge.withP99PodReadyLatency('P99 Service ready latency', 'ns', queries.P99ServiceReadyLatency.query(), { x: 10, y: 35, w: 10, h: 6 }), 52 | ]), 53 | /* All panels in this row, including the mutating-request ones, are built with the ReadOnlyAPIrequestP99latency panel builder; each latency panel sits beside its "Maximum" counterpart. */ g.panel.row.new('API latency') 54 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 55 | + g.panel.row.withCollapsed(true) 56 | + g.panel.row.withPanels([ 57 | panels.barChart.ReadOnlyAPIrequestP99latency('Read Only API request P99 latency - resource scoped', 's', queries.ReadOnlyAPIRequestP99LatencyResourceScoped.query(), { x: 0, y: 14, w: 12, h: 6 }), 58 | panels.barChart.ReadOnlyAPIrequestP99latency('Maximum Read Only 
API request P99 latency - resource scoped', 's', queries.MaxReadOnlyAPIrequestP99ResourceScoped.query(), { x: 12, y: 14, w: 12, h: 6 }), 59 | panels.barChart.ReadOnlyAPIrequestP99latency('Read Only API request P99 latency - namespace scoped', 's', queries.ReadonlyAPIrequestP99LatencyNamespaceScoped.query(), { x: 0, y: 20, w: 12, h: 6 }), 60 | panels.barChart.ReadOnlyAPIrequestP99latency('Maximum Read Only API request P99 latency - namespace scoped', 's', queries.MaxReadOnlyAPIrequestP99LatencyNamespaceScoped.query(), { x: 12, y: 20, w: 12, h: 6 }), 61 | panels.barChart.ReadOnlyAPIrequestP99latency('Read Only API request P99 latency - cluster scoped', 's', queries.ReadOnlyAPIrequestP99LatencyClusterScoped.query(), { x: 0, y: 26, w: 12, h: 6 }), 62 | panels.barChart.ReadOnlyAPIrequestP99latency('Maximum Read Only API request P99 latency - cluster scoped', 's', queries.MaxReadonlyAPIrequestP99LatencyClusterScoped.query(), { x: 12, y: 26, w: 12, h: 6 }), 63 | panels.barChart.ReadOnlyAPIrequestP99latency('Mutating API request P99 latency', 's', queries.MutatingAPIrequestP99Latency.query(), { x: 0, y: 32, w: 12, h: 6 }), 64 | panels.barChart.ReadOnlyAPIrequestP99latency('Maximum Mutating API request P99 latency', 's', queries.MaxMutatingAPIrequestP99Latency.query(), { x: 12, y: 32, w: 12, h: 6 }), 65 | ]), 66 | /* etcd latency bar charts (seconds) followed by CPU and RSS bar gauges; the RSS gauges reuse the etcdCPUusage panel builder with unit 'bytes'. */ g.panel.row.new('ETCD') 67 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 68 | + g.panel.row.withCollapsed(true) 69 | + g.panel.row.withPanels([ 70 | panels.barChart.etcdScaleDistribution('99th WAL fsync', 's', queries.etcd99thWALfsync.query(), { x: 0, y: 15, w: 7, h: 5 }), 71 | panels.barChart.etcdScaleDistribution('Maximum 99th WAL fsync', 's', queries.Max99thWALfsync.query(), { x: 7, y: 15, w: 11, h: 5 }), 72 | panels.barChart.etcdroundtrip('99th Roundtrip', 's', queries.etcd99Roundtrip.query(), { x: 0, y: 20, w: 7, h: 5 }), 73 | panels.barChart.etcdroundtrip('Maximum 99th Roundtrip', 's', queries.Max99Roundtrip.query(), { x: 7, y: 20, w: 11, h: 5 
}), 74 | panels.barChart.etcdScaleDistribution('99th Backend I/O', 's', queries.etcd99BackendIandO.query(), { x: 0, y: 25, w: 7, h: 5 }), 75 | panels.barChart.etcdScaleDistribution('Maximum 99th Backend I/O', 's', queries.Max99thBackendIandO.query(), { x: 7, y: 25, w: 11, h: 5 }), 76 | panels.barGauge.etcdCPUusage('Etcd CPU usage', 'cores', queries.etcdCPUusage.query(), { x: 0, y: 30, w: 7, h: 6 }), 77 | panels.barGauge.etcdCPUusage('Maximum Etcd CPU usage', 'cores', queries.MaxetcdCPUusage.query(), { x: 7, y: 30, w: 7, h: 6 }), 78 | panels.barGauge.etcdCPUusage('Etcd RSS usage', 'bytes', queries.etcdRSSusage.query(), { x: 0, y: 36, w: 7, h: 6 }), 79 | panels.barGauge.etcdCPUusage('Etcd max RSS usage', 'bytes', queries.etcdMaxRSSusage.query(), { x: 7, y: 36, w: 7, h: 6 }), 80 | ]), 81 | /* Row titled with the $component variable and repeated via withRepeat('component'). */ g.panel.row.new('$component') 82 | + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) 83 | + g.panel.row.withCollapsed(true) 84 | + g.panel.row.withRepeat('component') 85 | + g.panel.row.withPanels([ 86 | panels.barChart.ComponentRepeatPanelsBlue('Average RSS Usage $component', 'bytes', queries.AvgRSSUsageComponet.query(), { x: 0, y: 43, w: 9, h: 10 }), 87 | panels.barChart.ComponentRepeatPanelsRed('Max Aggregated RSS Usage $component', 'bytes', queries.MaxAggregatedRSSUsageComponent.query(), { x: 9, y: 43, w: 8, h: 10 }), 88 | panels.barChart.ComponentRepeatPanelsRed('Max RSS Usage $component', 'bytes', queries.MaxRSSUsageComponent.query(), { x: 17, y: 43, w: 7, h: 10 }), 89 | panels.barChart.ComponentRepeatPanelsYellow('Average CPU Usage $component', 'cores', queries.AvgCPUUsageComponent.query(), { x: 0, y: 53, w: 11, h: 5 }), 90 | panels.barChart.ComponentRepeatPanelsYellow('Maximum CPU Usage $component', 'cores', queries.MaxCPUUsageComponent.query(), { x: 11, y: 53, w: 13, h: 5 }), 91 | ]), 92 | ]) 93 | -------------------------------------------------------------------------------- /assets/ovn-monitoring/queries.libsonnet: 
-------------------------------------------------------------------------------- 1 | local variables = import './variables.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | 4 | /* Wraps one PromQL expression in a single-element list holding a Prometheus target: datasource '$' + variables.Datasource.name, format 'time_series', interval factor 2, and `legend` as the legend format. */ local generateTimeSeriesQuery(query, legend) = [ 5 | local prometheusQuery = g.query.prometheus; 6 | prometheusQuery.new('$' + variables.Datasource.name, query) 7 | + prometheusQuery.withFormat('time_series') 8 | + prometheusQuery.withIntervalFactor(2) 9 | + prometheusQuery.withLegendFormat(legend), 10 | ]; 11 | 12 | /* Each field exposes query(), returning the target list for one panel. */ { 13 | ovnClusterManagerLeader: { 14 | query(): 15 | generateTimeSeriesQuery('ovnkube_clustermanager_leader > 0', '{{pod}}'), 16 | }, 17 | 18 | ovnNorthd: { 19 | query(): 20 | generateTimeSeriesQuery('ovn_northd_status', '{{pod}}'), 21 | }, 22 | 23 | numOnvController: { 24 | query(): 25 | generateTimeSeriesQuery('count(ovn_controller_monitor_all) by (namespace)', ''), 26 | }, 27 | 28 | ovnKubeControlPlaneCPU: { 29 | query(): 30 | generateTimeSeriesQuery('sum( irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100 ) by (pod, node)', '{{pod}} - {{node}}'), 31 | }, 32 | 33 | ovnKubeControlPlaneMem: { 34 | query(): 35 | generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}} - {{node}}'), 36 | }, 37 | 38 | /* NOTE(review): this CPU query matches pods with the ovnkube- prefix while the memory query below matches the ovnkube-node- prefix; confirm the difference is intended. */ topOvnControllerCPU: { 39 | query(): 40 | generateTimeSeriesQuery('topk(10, sum( irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100) by (pod,node) )', '{{pod}} - {{node}}'), 41 | }, 42 | topOvnControllerMem: { 43 | query(): 44 | generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'), 45 | }, 46 | 47 | 
topOvnkubenodePodCPU: { 48 | query(): 49 | generateTimeSeriesQuery( 50 | 'topk(10, (sum(irate(container_cpu_usage_seconds_total{name!="",container!~"POD|",namespace=~"openshift-ovn-kubernetes", node=~"$_worker_node"}[2m]) * 100) by (pod, namespace, node)) > 0)', 51 | '{{pod}} - {{node}}' 52 | ), 53 | }, 54 | 55 | topOvnkubenodePodMem: { 56 | query(): 57 | generateTimeSeriesQuery( 58 | 'topk(10, sum(container_memory_rss{name!="",container!~"POD|",namespace=~"openshift-ovn-kubernetes", node=~"$_worker_node"}) by (pod, namespace, node))', 59 | '{{pod}} - {{node}}' 60 | ), 61 | }, 62 | 63 | topNorthdCPU: { 64 | query(): 65 | generateTimeSeriesQuery( 66 | 'topk(10, sum(irate(container_cpu_usage_seconds_total{container="northd", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))', 67 | '{{pod}} - {{node}}' 68 | ), 69 | }, 70 | 71 | topNorthdMem: { 72 | query(): 73 | generateTimeSeriesQuery( 74 | 'topk(10, sum(container_memory_rss{container="northd", namespace="openshift-ovn-kubernetes"}) by (pod, node))', 75 | '{{pod}} - {{node}}' 76 | ), 77 | }, 78 | 79 | topSbdbCPU: { 80 | query(): 81 | generateTimeSeriesQuery( 82 | 'topk(10, sum(irate(container_cpu_usage_seconds_total{container="sbdb", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))', 83 | '{{pod}} - {{node}}' 84 | ), 85 | }, 86 | 87 | topSbdbMem: { 88 | query(): 89 | generateTimeSeriesQuery( 90 | 'topk(10, sum(container_memory_rss{container="sbdb", namespace="openshift-ovn-kubernetes"}) by (pod, node))', 91 | '{{pod}} - {{node}}' 92 | ), 93 | }, 94 | 95 | topNbdbCPU: { 96 | query(): 97 | generateTimeSeriesQuery( 98 | 'topk(10, sum(irate(container_cpu_usage_seconds_total{container="nbdb", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))', 99 | '{{pod}} - {{node}}' 100 | ), 101 | }, 102 | 103 | topNbdbMem: { 104 | query(): 105 | generateTimeSeriesQuery( 106 | 'topk(10, sum(container_memory_rss{container="nbdb", namespace="openshift-ovn-kubernetes"}) by (pod, node))', 107 | 
'{{pod}} - {{node}}' 108 | ), 109 | }, 110 | 111 | topOvnkubeControllerCPU: { 112 | query(): 113 | generateTimeSeriesQuery( 114 | 'topk(10, sum(irate(container_cpu_usage_seconds_total{container="ovnkube-controller", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))', 115 | '{{pod}} - {{node}}' 116 | ), 117 | }, 118 | 119 | topOvnkubeControllerMem: { 120 | query(): 121 | generateTimeSeriesQuery( 122 | 'topk(10, sum(container_memory_rss{container="ovnkube-controller", namespace="openshift-ovn-kubernetes"}) by (pod, node))', 123 | '{{pod}} - {{node}}' 124 | ), 125 | }, 126 | 127 | podSchedulingLatency: { 128 | query(): 129 | generateTimeSeriesQuery('histogram_quantile(0.99, rate(scheduler_pod_scheduling_sli_duration_seconds_bucket[5m])) > 0', '{{pod}}'), 130 | }, 131 | 132 | firstSeenToLSPCreated: { 133 | query(): 134 | generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_first_seen_lsp_created_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'), 135 | }, 136 | 137 | ovnAnnotationLatency: { 138 | query(): 139 | generateTimeSeriesQuery('histogram_quantile(0.99, sum by (pod, le) (rate(ovnkube_controller_pod_creation_latency_seconds_bucket[2m]))) > 0', '{{pod}}'), 140 | }, 141 | 142 | lspCreated: { 143 | query(): 144 | generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_lsp_created_port_binding_duration_seconds_bucket[2m])) by (pod,le)) > 0', '{{pod}}'), 145 | }, 146 | 147 | lspToChassis: { 148 | query(): 149 | generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_port_binding_port_binding_chassis_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'), 150 | }, 151 | 152 | portMarkedAsUp: { 153 | query(): 154 | generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_port_binding_chassis_port_binding_up_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'), 155 | }, 156 | 157 | ovnCNIAdd: { 158 | query(): 159 | 
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[2m])) by (pod,le)) > 0', '{{pod}}'), 160 | }, 161 | 162 | networkProgrammingComplete: { 163 | query(): 164 | generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_network_programming_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'), 165 | }, 166 | 167 | synclatency: { 168 | query(): 169 | generateTimeSeriesQuery('rate(ovnkube_master_sync_service_latency_seconds_sum[2m])', '{{pod}} - Sync service latency'), 170 | }, 171 | 172 | ovnkubeNodeReadyLatency: { 173 | query(): 174 | generateTimeSeriesQuery('ovnkube_node_ready_duration_seconds{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}}'), 175 | }, 176 | 177 | workQueue: { 178 | query(): 179 | generateTimeSeriesQuery('rate(ovnkube_master_workqueue_adds_total[2m])', '{{pod}} - Rate of handled adds'), 180 | }, 181 | 182 | workQueueDepth: { 183 | query(): 184 | generateTimeSeriesQuery('ovnkube_master_workqueue_depth', '{{pod}} - Depth of workqueue'), 185 | }, 186 | 187 | workQueueLatency: { 188 | query(): 189 | generateTimeSeriesQuery('ovnkube_master_workqueue_longest_running_processor_seconds', '{{pod}} - Longest processor duration'), 190 | }, 191 | 192 | workQueueUnfinishedLatency: { 193 | query(): 194 | generateTimeSeriesQuery('ovnkube_master_workqueue_unfinished_work_seconds', '{{pod}} - Unfinished work duration'), 195 | }, 196 | } 197 | -------------------------------------------------------------------------------- /assets/vegeta-wrapper/queries.libsonnet: -------------------------------------------------------------------------------- 1 | local variables = import './variables.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local elasticsearch = g.query.elasticsearch; 4 | 5 | { 6 | rps: { 7 | query(): 8 | elasticsearch.withAlias(null) 9 | + 
elasticsearch.withBucketAggs([
        // Time buckets over `timestamp`, interval chosen automatically.
        local histogram = elasticsearch.bucketAggs.DateHistogram;
        histogram.withField('timestamp')
        + histogram.withId('2')
        + histogram.withType('date_histogram')
        + histogram.settings.withInterval('auto')
        + histogram.settings.withMinDocCount(0)
        + histogram.settings.withTimeZone('utc')
        + histogram.settings.withTrimEdges(null),
      ])
      + elasticsearch.withMetrics([
        // Average of the `rps` field per bucket.
        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('rps')
        + average.withId('1')
        + average.withType('avg'),
      ])
      + elasticsearch.withQuery('uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"')
      + elasticsearch.withTimeField('timestamp'),
  },

  // Average `throughput` over time for the selected run.
  throughput: {
    query():
      elasticsearch.withAlias(null)
      + elasticsearch.withBucketAggs([
        local histogram = elasticsearch.bucketAggs.DateHistogram;
        histogram.withField('timestamp')
        + histogram.withId('2')
        + histogram.withType('date_histogram')
        + histogram.settings.withInterval('auto')
        + histogram.settings.withMinDocCount(0)
        + histogram.settings.withTimeZone('utc')
        + histogram.settings.withTrimEdges(null),
      ])
      + elasticsearch.withMetrics([
        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('throughput')
        + average.withId('1')
        + average.withType('avg'),
      ])
      + elasticsearch.withQuery('uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"')
      + elasticsearch.withTimeField('timestamp'),
  },

  // Two targets: average `req_latency` and average `p99_latency` over time.
  latency: {
    query():
      [
        // Target 1: req_latency
        elasticsearch.withAlias(null)
        + elasticsearch.withBucketAggs([
          local histogram = elasticsearch.bucketAggs.DateHistogram;
          histogram.withField('timestamp')
          + histogram.withId('2')
          + histogram.withType('date_histogram')
          + histogram.settings.withInterval('auto')
          + histogram.settings.withMinDocCount(0)
          + histogram.settings.withTimeZone('utc')
          + histogram.settings.withTrimEdges(null),
        ])
        + elasticsearch.withMetrics([
          local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
          average.withField('req_latency')
          + average.withId('1')
          + average.withType('avg'),
        ])
        + elasticsearch.withQuery('uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"')
        + elasticsearch.withTimeField('timestamp'),

        // Target 2: p99_latency
        elasticsearch.withAlias(null)
        + elasticsearch.withBucketAggs([
          local histogram = elasticsearch.bucketAggs.DateHistogram;
          histogram.withField('timestamp')
          + histogram.withId('2')
          + histogram.withType('date_histogram')
          + histogram.settings.withInterval('auto')
          + histogram.settings.withMinDocCount(0)
          + histogram.settings.withTimeZone('utc')
          + histogram.settings.withTrimEdges(null),
        ])
        + elasticsearch.withMetrics([
          local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
          average.withField('p99_latency')
          + average.withId('1')
          + average.withType('avg'),
        ])
        + elasticsearch.withQuery('uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"')
        + elasticsearch.withTimeField('timestamp'),

      ],
  },

  // Summary target: per-uuid / per-target averages of the run metrics.
  results: {
    query():
      elasticsearch.withAlias(null)
      + elasticsearch.withBucketAggs([
        // Group by uuid first ...
        local terms = elasticsearch.bucketAggs.Terms;
        terms.withField('uuid.keyword')
        + terms.withId('2')
        + terms.withType('terms')
        + terms.settings.withOrder('desc')
        + terms.settings.withOrderBy('_term')
        + terms.settings.withMinDocCount(1)
        + terms.settings.withSize('10'),
        // ... then by target.
        local terms = elasticsearch.bucketAggs.Terms;
        terms.withField('targets.keyword')
        + terms.withId('1')
        + terms.withType('terms')
        + terms.settings.withOrder('desc')
        + terms.settings.withOrderBy('_term')
        + terms.settings.withMinDocCount(1)
        + terms.settings.withSize('10'),
      ])
      + elasticsearch.withMetrics([
        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('rps')
        + average.withId('3')
        + average.withType('avg'),

        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('throughput')
        + average.withId('4')
        + average.withType('avg'),

        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('p99_latency')
        + average.withId('5')
        +
elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'),

        elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('req_latency')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('6')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'),

        elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('bytes_in')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('7')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'),

        elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('bytes_out')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('8')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'),
      ])
      + elasticsearch.withQuery('uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"')
      + elasticsearch.withTimeField('timestamp'),
  },
}
--------------------------------------------------------------------------------
/dittybopper/deploy.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Deploys (or deletes) the dittybopper Grafana instance on a cluster and
# optionally imports dashboards into it through the Grafana HTTP API.

set -e

# Prints the command-line help.
# NOTE(review): the heredoc opener and the <placeholder> argument names were
# missing from this copy of the script and have been restored; confirm the
# wording against the upstream file.
function _usage {
  cat <<END

Usage: $(basename "${0}") [-c <kubectl_cmd>] [-n <namespace>] [-p <grafana_pwd>]

       $(basename "${0}") [-i <dash_path>]

       $(basename "${0}") [-d] [-n <namespace>]

  -c <kubectl_cmd>  : The (c)ommand to use for k8s admin (defaults to 'oc' for now)

  -n <namespace>    : The (n)amespace in which to deploy the Grafana instance
                      (defaults to 'dittybopper')

  -p <grafana_pwd>  : The (p)assword to configure for the Grafana admin user
                      (defaults to 'admin')

  -i <dash_path>    : (I)mport dashboard from given path. Using this flag will
                      bypass the deployment process and only do the import to an
                      already-running Grafana pod. Can be a local path or a remote
                      URL beginning with http.

  -t <template>     : Use custom dittybopper template from local path, default will be templates/dittybopper.yaml.template

  -d                : (D)elete an existing deployment

  -h                : Help

END
}

# Set default template variables (consumed by envsubst when rendering the template)
export PROMETHEUS_USER=internal
export GRAFANA_ADMIN_PASSWORD=admin
export SYNCER_IMAGE=${SYNCER_IMAGE:-"quay.io/cloud-bulldozer/dittybopper-syncer:latest"} # Syncer image
export GRAFANA_IMAGE=${GRAFANA_IMAGE:-"quay.io/cloud-bulldozer/grafana:9.4.3"} # Grafana image
export GRAFANA_RENDERER_IMAGE=${GRAFANA_RENDERER_IMAGE:-"grafana/grafana-image-renderer:latest"} # Grafana renderer image

# Set defaults for command options
script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
k8s_cmd='oc'
namespace='dittybopper'
namespace_file="${script_dir}/templates/dittybopper_ns.yaml.template"
grafana_default_pass=True
dash_import=()
template=''
delete=''

# Capture and act on command options.
# NOTE(review): "m:" is accepted by the option string but has no handler, so -m
# is silently ignored; it is kept so existing invocations do not start failing.
while getopts ":c:m:n:p:i:t:dh" opt; do
  case ${opt} in
    c)
      k8s_cmd=${OPTARG}
      ;;
    n)
      namespace="${OPTARG}"
      ;;
    p)
      export GRAFANA_ADMIN_PASSWORD=${OPTARG}
      grafana_default_pass=False
      ;;
    i)
      # NOTE(review): left unquoted, so a value containing spaces is split into
      # several entries; kept for backward compatibility.
      dash_import+=(${OPTARG})
      ;;
    t)
      template=${OPTARG}
      ;;
    d)
      delete=True
      ;;
    h)
      # Asking for help is not an error.
      _usage
      exit 0
      ;;
    \?)
      echo -e "\033[31mERROR: Invalid option -${OPTARG}\033[0m" >&2
      _usage
      exit 1
      ;;
    :)
      echo -e "\033[31mERROR: Option -${OPTARG} requires an argument.\033[0m" >&2
      _usage
      exit 1
      ;;
  esac
done

# Built only after option parsing so that a password supplied with -p is the
# one that ends up in the URL handed to the template.
export GRAFANA_URL="http://admin:${GRAFANA_ADMIN_PASSWORD}@localhost:3000"

# A template given with -t wins over the bundled one.
deploy_template="${template:-${script_dir}/templates/dittybopper.yaml.template}"


echo "${dash_import[@]}"
# NOTE(review): the banner's internal spacing was reconstructed; verify it renders as intended.
echo -e "\033[32m
    ____  _ __  __        __
   / __ \(_) /_/ /___  __/ /_  ____  ____  ____  ___  _____
  / / / / / __/ __/ / / / __ \/ __ \/ __ \/ __ \/ _ \/ ___/
 / /_/ / / /_/ /_/ /_/ / /_/ / /_/ / /_/ / /_/ /  __/ /
/_____/_/\__/\__/\__, /_.___/\____/ .___/ .___/\___/_/
                /____/           /_/   /_/

\033[0m"
echo "Using k8s command: $k8s_cmd"
echo "Using namespace: $namespace"
# Compare against the literal value: a bare [[ $var ]] is true for "False" as well,
# which would print a custom password as if it were the default.
if [[ ${grafana_default_pass} == True ]]; then
  echo "Using default grafana password: ${GRAFANA_ADMIN_PASSWORD}"
else
  echo "Using custom grafana password."
fi


# Get environment values
#FIXME: This is OCP-Specific; needs updating to support k8s
echo ""
echo -e "\033[32mGetting environment vars...\033[0m"
export PROMETHEUS_URL="https://prometheus-k8s.openshift-monitoring.svc.cluster.local:9091"
# Try the three token commands in turn; the first one that succeeds wins.
export PROMETHEUS_BEARER=$($k8s_cmd create token -n openshift-monitoring prometheus-k8s --duration 240h || $k8s_cmd sa get-token -n openshift-monitoring prometheus-k8s || $k8s_cmd sa new-token -n openshift-monitoring prometheus-k8s)
echo "Prometheus URL is: ${PROMETHEUS_URL}"
if [[ -n ${PROMETHEUS_BEARER} ]]; then
  echo "Prometheus bearer token collected."
else
  echo "ERROR: Prometheus bearer token is not collected."
  exit 1
fi

# Identify Hypershift Management Cluster
if [ "$($k8s_cmd get crd hostedclusters.hypershift.openshift.io 2>/dev/null | wc -l)" -ne 0 ] ; then
  echo "Detected Hypershift Management Cluster"
  export HYPERSHIFT_MANAGEMENT_CLUSTER="yes"
  export OBO_URL="http://hypershift-monitoring-stack-prometheus.openshift-observability-operator.svc.cluster.local:9090"
fi

# Runs the given verb (create/delete) against the namespace manifest.
# NOTE(review): the manifest is applied as-is, without envsubst, so the
# namespace it names does not follow -n; confirm whether that is intended.
function namespace() {
  $k8s_cmd "$1" -f "$namespace_file"
}

# Renders the deployment template and runs the given verb (apply/delete) on it.
# Unless deleting, waits for the deployment to become available and dumps
# pods, deployments and logs before exiting non-zero when it does not.
function grafana() {
  envsubst < "${deploy_template}" | $k8s_cmd "$1" -n "$namespace" -f -
  if [[ ! $delete ]]; then
    echo ""
    echo -e "\033[32mWaiting for dittybopper deployment to be available...\033[0m"
    if $k8s_cmd wait --for=condition=available -n "$namespace" deployment/dittybopper --timeout=60s; then
      return 0
    else
      $k8s_cmd get pods -n "$namespace"
      $k8s_cmd get deploy -n "$namespace"
      $k8s_cmd logs -l app=dittybopper --max-log-requests=100 -n "$namespace" --all-containers=true
      exit 1
    fi
  fi
}

# Posts every dashboard collected with -i to the Grafana dashboards API.
# Remote entries (starting with "http") are downloaded to a temp file first.
# Exits non-zero on the first dashboard that is not accepted with HTTP 200.
function dash_import(){
  local dash dashfile dashboard dashboard_request response_code
  sleep 5
  echo -e "\033[32mImporting dashboards...\033[0m"
  for dash in "${dash_import[@]}"; do
    if [[ $dash =~ ^http ]]; then
      echo "Fetching remote dashboard $dash"
      dashfile="$(mktemp)"
      # -f: fail on HTTP errors instead of saving the error page as a dashboard.
      curl -fsS "$dash" -o "$dashfile"
    else
      echo "Using local dashboard ${dash}"
      dashfile=$dash
    fi
    dashboard=$(cat "${dashfile}")
    dashboard_request="{\"dashboard\": ${dashboard}, \"overwrite\": true}"
    # Credentials go through -u rather than the URL so that special characters
    # in the password cannot corrupt the request URL.
    response_code=$(curl -Ss -w "%{http_code}" -X POST -u "admin:${GRAFANA_ADMIN_PASSWORD}" -H "Content-Type: application/json" -H "Accept: application/json" -d "${dashboard_request}" \
      "http://${dittybopper_route}/api/dashboards/db" -o /tmp/resp.txt)
    if [[ $response_code != "200" ]]; then
      echo ""
      echo -e "\033[31mFailed to import dashboard ${dash}\033[0m"
      cat /tmp/resp.txt
      echo ""
      echo -e "\033[31mYou can find the above output in /tmp/resp.txt\033[0m"
      exit 1
    else
      echo -e "\033[32mImported dashboard ${dash}\033[0m"
    fi
  done
}

if [[ $delete ]]; then
  echo ""
  echo -e "\033[32mDeleting Grafana...\033[0m"
  grafana "delete"
  echo ""
  echo -e "\033[32mDeleting namespace...\033[0m"
  namespace "delete"
  echo ""
  echo -e "\033[32mDeployment deleted!\033[0m"
else
  # NOTE(review): the help text says -i bypasses deployment, but this branch
  # always redeploys before importing; behaviour left unchanged.
  echo ""
  echo -e "\033[32mCreating namespace...\033[0m"
  # delete the namespace if it already exists to make sure the latest version of the dashboards are deployed and also to support the case where user wants to redeploy dittybopper without having to delete the namespace manually
  # Ask the API for the exact name; grepping the listing also matched names
  # that merely contain it (for example "foo-dittybopper").
  if $k8s_cmd get namespace "$namespace" >/dev/null 2>&1; then
    echo "Looks like the namespace $namespace already exists, deleting it"
    namespace "delete"
  fi
  namespace "create"
  echo ""
  echo -e "\033[32mDeploying Grafana...\033[0m"
  grafana "apply"
  echo ""
  dittybopper_route=$($k8s_cmd -n "$namespace" get route dittybopper -o jsonpath="{.spec.host}")
  if (( ${#dash_import[@]} > 0 )); then
    dash_import
  fi
  echo "You can access the Grafana instance at http://${dittybopper_route}"
fi
--------------------------------------------------------------------------------
/assets/ycsb/queries.libsonnet:
--------------------------------------------------------------------------------
local variables = import './variables.libsonnet';
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local elasticsearch = g.query.elasticsearch;

{
  throughput_overtime: {
    query():
      elasticsearch.withAlias(null)
      + elasticsearch.withBucketAggs([
        elasticsearch.bucketAggs.Terms.withField('action.keyword')
        + elasticsearch.bucketAggs.Terms.withId('4')
        + elasticsearch.bucketAggs.Terms.withType('terms')
        + elasticsearch.bucketAggs.Terms.settings.withOrder('desc')
        + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term')
        + elasticsearch.bucketAggs.Terms.settings.withMinDocCount(1)
        + elasticsearch.bucketAggs.Terms.settings.withSize('10'),
        elasticsearch.bucketAggs.DateHistogram.withField('timestamp')
        + elasticsearch.bucketAggs.DateHistogram.withId('3')
        + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram')
        + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto')
        + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount(0)
        + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone('utc')
        + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(0),
      ])
      + elasticsearch.withMetrics([
        elasticsearch.metrics.MetricAggregationWithSettings.Average.withField('overall_rate')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId('1')
        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType('avg'),
      ])
      + elasticsearch.withQuery('(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user) AND
(action.keyword=$operation)')
      + elasticsearch.withTimeField('timestamp'),
  },

  // Average of `latency_90`, split by action and bucketed over time.
  phase_average_latency: {
    query():
      elasticsearch.withAlias('{{ocpMajorVersion.keyword}}')
      + elasticsearch.withBucketAggs([
        local terms = elasticsearch.bucketAggs.Terms;
        terms.withField('action.keyword')
        + terms.withId('3')
        + terms.withType('terms')
        + terms.settings.withOrder('desc')
        + terms.settings.withOrderBy('_term')
        + terms.settings.withMinDocCount(1)
        + terms.settings.withSize('10'),
        local histogram = elasticsearch.bucketAggs.DateHistogram;
        histogram.withField('timestamp')
        + histogram.withId('2')
        + histogram.withType('date_histogram')
        + histogram.settings.withInterval('auto')
        + histogram.settings.withMinDocCount(0)
        + histogram.settings.withTimeZone('utc')
        + histogram.settings.withTrimEdges(0),
      ])
      + elasticsearch.withMetrics([
        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('latency_90')
        + average.withId('1')
        + average.withType('avg'),
      ])
      + elasticsearch.withQuery('(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user) AND (action.keyword=$operation)')
      + elasticsearch.withTimeField('timestamp'),
  },

  // Average 95th-percentile latency of the selected $operation, split by workload type.
  latency_95: {
    query():
      elasticsearch.withAlias('{{ocpMajorVersion.keyword}}')
      + elasticsearch.withBucketAggs([
        local terms = elasticsearch.bucketAggs.Terms;
        terms.withField('workload_type.keyword')
        + terms.withId('5')
        + terms.withType('terms')
        + terms.settings.withOrder('desc')
        + terms.settings.withOrderBy('_term')
        + terms.settings.withMinDocCount(1)
        + terms.settings.withSize('10'),
        local histogram = elasticsearch.bucketAggs.DateHistogram;
        histogram.withField('timestamp')
        + histogram.withId('3')
        + histogram.withType('date_histogram')
        + histogram.settings.withInterval('auto')
        + histogram.settings.withMinDocCount(0)
        + histogram.settings.withTimeZone('utc')
        + histogram.settings.withTrimEdges(0),
      ])
      + elasticsearch.withMetrics([
        local average = elasticsearch.metrics.MetricAggregationWithSettings.Average;
        average.withField('data.$operation.95thPercentileLatency(us)')
        + average.withId('1')
        + average.withType('avg'),
      ])
      + elasticsearch.withQuery('(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user)')
      + elasticsearch.withTimeField('timestamp'),
  },

  // Sum of the overall throughput field, split by workload type and bucketed over time.
  overall_workload_throughput: {
    query():
      elasticsearch.withAlias('{{ocpMajorVersion.keyword}}')
      + elasticsearch.withBucketAggs([
        local terms = elasticsearch.bucketAggs.Terms;
        terms.withField('workload_type.keyword')
        + terms.withId('5')
        + terms.withType('terms')
        + terms.settings.withOrder('desc')
        + terms.settings.withOrderBy('_term')
        + terms.settings.withMinDocCount(1)
        + terms.settings.withSize('10'),
        local histogram = elasticsearch.bucketAggs.DateHistogram;
        histogram.withField('timestamp')
        + histogram.withId('3')
        + histogram.withType('date_histogram')
        + histogram.settings.withInterval('auto')
        + histogram.settings.withMinDocCount(0)
        + histogram.settings.withTimeZone('utc')
        + histogram.settings.withTrimEdges(0),
      ])
      + elasticsearch.withMetrics([
        local total = elasticsearch.metrics.MetricAggregationWithSettings.Sum;
        total.withField('data.OVERALL.Throughput(ops/sec)')
        + total.withId('1')
        + total.withType('sum'),
      ])
      + elasticsearch.withQuery('(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user)')
      + elasticsearch.withTimeField('timestamp'),
  },

  // Sum of the operation count for the selected $operation, per workload type (no time buckets).
  aggregate_operation_sum: {
    query():
      elasticsearch.withAlias('$operation - Operations')
      + elasticsearch.withBucketAggs([
        local terms = elasticsearch.bucketAggs.Terms;
        terms.withField('workload_type.keyword')
        + terms.withId('3')
        + terms.withType('terms')
        + terms.settings.withOrder('desc')
        + terms.settings.withOrderBy('_term')
        + terms.settings.withMinDocCount(1)
        + terms.settings.withSize('10'),
      ])
      + elasticsearch.withMetrics([
        local total = elasticsearch.metrics.MetricAggregationWithSettings.Sum;
        total.withField('data.$operation.Operations')
        + total.withId('1')
        + total.withType('sum'),
      ])
      + elasticsearch.withQuery('(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user)')
      + elasticsearch.withTimeField('timestamp'),
  },
}
--------------------------------------------------------------------------------