├── .github └── workflows │ ├── benchmarks.yaml │ └── teardown.yaml ├── LICENSE ├── README.md ├── benchmarks └── ramp_up.js ├── dev ├── docker-compose.yml ├── prometheus-config.yml └── run-benchmark ├── environments └── aws │ ├── .gitignore │ ├── Pulumi.main.yaml │ ├── Pulumi.yaml │ ├── README.md │ ├── index.ts │ ├── package-lock.json │ └── package.json ├── k8s ├── benchmark │ ├── kustomization.yaml │ ├── monitoring-service.yaml │ ├── service-monitor.yaml │ ├── soak-test-deployment.yaml │ └── workers-deployment.yaml ├── monitoring │ ├── dashboards │ │ ├── cloudwatch.yaml │ │ ├── fetch-dashboards.sh │ │ ├── folder.yaml │ │ ├── patch.yaml │ │ ├── soak-test-frontend.json │ │ ├── soak-test-history.json │ │ ├── soak-test-matching.json │ │ ├── soak-test-persistence.json │ │ ├── soak-test-pods.json │ │ ├── soak-test-polling.json │ │ ├── soak-test-services.json │ │ ├── soak-test-slo.json │ │ ├── soak-test-summary.json │ │ └── soak-test-worker.json │ ├── grafana.ini │ ├── kustomization.yaml │ ├── prometheus-rbacTemporal.yaml │ └── temporal-rules.yaml └── temporal │ ├── frontend-deployment.yaml │ ├── frontend-service.yaml │ ├── history-deployment.yaml │ ├── kustomization.yaml │ ├── matching-deployment.yaml │ ├── monitoring-service.yaml │ ├── service-monitor.yaml │ └── worker-deployment.yaml └── stacks ├── .gitignore ├── Pulumi.eks-cassandra-medium.yaml ├── Pulumi.eks-rds-aurora-mysql-medium.yaml ├── Pulumi.eks-rds-aurora-postgres-medium.yaml ├── Pulumi.eks-rds-mysql-scaling-series.yaml ├── Pulumi.eks-rds-postgres-medium.yaml ├── Pulumi.eks-rds-postgres-micro.yaml ├── Pulumi.yaml ├── README.md ├── fetch-kubeconfig ├── grafana-tunnel ├── index.ts ├── package-lock.json ├── package.json ├── port-forward-grafana └── tsconfig.json
/.github/workflows/benchmarks.yaml:
--------------------------------------------------------------------------------
 1 | name: Run Benchmarks
 2 | on:
 3 |   - workflow_dispatch
 4 | jobs:
 5 |   benchmarks:
 6 |     name: Run benchmarks
 7 |     strategy:
 8 |       fail-fast: false
 9 |       matrix:
10 |         stack: [eks-rds-postgres-m6i-large,eks-rds-postgres-m6i-2xlarge,eks-rds-aurora-postgres-r5-2xlarge]
11 |     runs-on: ubuntu-latest
12 |     permissions:
13 |       id-token: write
14 |       contents: read
15 |     steps:
16 |       - uses: actions/checkout@v3
17 |       - uses: actions/setup-node@v3
18 |         with:
19 |           node-version: 16
20 |       - run: npm install
21 |         working-directory: stacks
22 |       - name: Configure AWS Credentials
23 |         uses: aws-actions/configure-aws-credentials@v1
24 |         with:
25 |           aws-region: ${{ secrets.AWS_REGION }}
26 |           role-to-assume: ${{ secrets.AWS_ROLE }}
27 |       # Bring up the Pulumi stack for this matrix entry.
28 |       - uses: pulumi/actions@v3
29 |         id: pulumi-up
30 |         with:
31 |           work-dir: stacks
32 |           command: up
33 |           stack-name: ${{ matrix.stack }}
34 |         env:
35 |           PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_TOKEN }}
36 |       # Step outputs are handed to the shell through env vars rather than being
37 |       # expanded into the script text, so quotes, `$` or backticks in the output
38 |       # cannot break (or inject commands into) the script.
39 |       - env:
40 |           CLUSTER_SUMMARY: ${{ steps.pulumi-up.outputs.clusterSummary }}
41 |         run: |
42 |           echo "### Benchmark: ${{ matrix.stack }}" >> "$GITHUB_STEP_SUMMARY"
43 |           echo "$CLUSTER_SUMMARY" >> "$GITHUB_STEP_SUMMARY"
44 |       # Run the k6 script (fed on stdin) via `kubectl run` with --restart Never.
45 |       - uses: ianbelcher/eks-kubectl-action@master
46 |         id: run-benchmark
47 |         with:
48 |           cluster_name: ${{ steps.pulumi-up.outputs.clusterName }}
49 |           stdin: benchmarks/ramp_up.js
50 |           args: >
51 |             run k6-${{ github.run_attempt }} -i --restart Never
52 |             --image ghcr.io/temporalio/xk6-temporal:v0.1.0
53 |             --env TEMPORAL_GRPC_ENDPOINT=temporal-frontend:7233
54 |             --env K6_OUT=output-prometheus-remote
55 |             --env K6_PROMETHEUS_REMOTE_URL=http://prometheus-k8s.monitoring.svc.cluster.local:9090/api/v1/write
56 |             --env PROMETHEUS_ENDPOINT=http://prometheus-k8s.monitoring.svc.cluster.local:9090/
57 |             --
58 |             k6 run --quiet --no-color -
59 |       - env:
60 |           KUBECTL_OUT: ${{ steps.run-benchmark.outputs.kubectl-out }}
61 |         run: |
62 |           echo "$KUBECTL_OUT" >> "$GITHUB_STEP_SUMMARY"
63 |       # NOTE(review): with the default step condition this step is skipped when an
64 |       # earlier step fails, so the stack is left running in that case.
65 |       - uses: pulumi/actions@v3
66 |         with:
67 |           work-dir: stacks
68 |           command: destroy
69 |           stack-name: ${{ matrix.stack }}
70 |         env:
71 |           PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_TOKEN }}
72 |
--------------------------------------------------------------------------------
/.github/workflows/teardown.yaml:
--------------------------------------------------------------------------------
 1 | name: Teardown clusters
 2 | on:
 3 |   - workflow_dispatch
 4 | jobs:
 5 |   benchmarks:
 6 |     name: Teardown clusters
 7 |     strategy:
 8 |       fail-fast: false
 9 |       matrix:
10 |         # Keep in sync with the stack matrix in benchmarks.yaml so that every stack
11 |         # the benchmark workflow brings up can also be destroyed from here.
12 |         stack: [eks-rds-postgres-m6i-large,eks-rds-postgres-m6i-2xlarge,eks-rds-aurora-postgres-r5-2xlarge]
13 |     runs-on: ubuntu-latest
14 |     permissions:
15 |       id-token: write
16 |       contents: read
17 |     steps:
18 |       - uses: actions/checkout@v3
19 |       - uses: actions/setup-node@v3
20 |         with:
21 |           node-version: 16
22 |       - run: npm install
23 |         working-directory: stacks
24 |       - name: Configure AWS Credentials
25 |         uses: aws-actions/configure-aws-credentials@v1
26 |         with:
27 |           aws-region: ${{ secrets.AWS_REGION }}
28 |           role-to-assume: ${{ secrets.AWS_ROLE }}
29 |       - uses: pulumi/actions@v3
30 |         id: pulumi-down
31 |         with:
32 |           work-dir: stacks
33 |           command: destroy
34 |           stack-name: ${{ matrix.stack }}
35 |         env:
36 |           PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_TOKEN }}
37 |
--------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 temporal.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Temporal Benchmark Matrix 2 | 3 | The Benchmark Matrix is designed to give users a guide to what kind of performance they can expect out of various cluster and persistence configurations. 4 | 5 | Please note that this project is in its extremely early stages: clusters are not being tuned, and resource limits are not set or enforced. Any performance numbers recorded should be ignored for now. 6 | 7 | Once the matrix is able to run benchmarks across multiple providers and persistence backends, we will apply constraints and tuning so that we can get consistent and meaningful benchmark results. 8 | 9 | # Benchmark matrix 10 | 11 | These are the platforms and persistence types that the benchmark matrix currently tests: 12 | 13 | | Provider | Platform | Persistence Type | Persistence Size | 14 | |---|---|---|---| 15 | |AWS|EKS|Postgres|m6i-large| 16 | |AWS|EKS|Postgres|m6i-2xlarge| 17 | 18 | # Running the benchmarks 19 | 20 | TODO :) 21 | 22 | # Contributing 23 | 24 | Currently, we only run on AWS EKS with Postgres RDS instances. We would love to support Azure, GCP and more persistence systems. All contributions are welcome!
-------------------------------------------------------------------------------- /benchmarks/ramp_up.js: -------------------------------------------------------------------------------- 1 | import temporal from 'k6/x/temporal'; 2 | import promclient from 'k6/x/prometheus-client'; 3 | import { scenario } from 'k6/execution'; 4 | import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js'; 5 | 6 | export const options = { 7 | scenarios: { 8 | ramp_up: { 9 | executor: 'ramping-vus', 10 | startVUs: 500, 11 | stages: [ 12 | { duration: '2m', target: 500 }, 13 | { duration: '10s', target: 600 }, 14 | { duration: '2m', target: 600 }, 15 | { duration: '10s', target: 700 }, 16 | { duration: '2m', target: 700 }, 17 | { duration: '10s', target: 800 }, 18 | { duration: '2m', target: 800 }, 19 | { duration: '10s', target: 900 }, 20 | { duration: '2m', target: 900 }, 21 | { duration: '10s', target: 1000 }, 22 | { duration: '2m', target: 1000 }, 23 | ], 24 | }, 25 | }, 26 | }; 27 | 28 | const startWorkflow = (client) => { 29 | while(true) { 30 | try { 31 | const workflow = client.startWorkflow( 32 | { 33 | task_queue: 'benchmark', 34 | id: 'echo-' + scenario.iterationInTest, 35 | }, 36 | 'ExecuteActivity', 37 | { "Count": 1, "Activity": "Echo", "Input": { "Message": "test" } }, 38 | ) 39 | 40 | return workflow; 41 | } catch (err) { console.log("Retrying...", err); } 42 | } 43 | } 44 | 45 | const waitForWorkflowCompletion = (workflow) => { 46 | while(true) { 47 | try { 48 | workflow.result() 49 | return 50 | } catch (err) { console.log("Retrying...", err); } 51 | } 52 | } 53 | 54 | export default () => { 55 | const client = temporal.newClient() 56 | 57 | const workflow = startWorkflow(client); 58 | waitForWorkflowCompletion(workflow) 59 | 60 | client.close() 61 | }; 62 | 63 | const queryProm = (query) => { 64 | const prom = promclient.newClient(__ENV.PROMETHEUS_ENDPOINT) 65 | 66 | const [result, warnings] = prom.query(query, new Date()); 67 | 68 | if 
(warnings.length) { 69 | console.warn("Prometheus warnings:", warnings) 70 | } 71 | 72 | return result 73 | } 74 | 75 | export function handleSummary(data) { 76 | delete(data.metrics.data_sent); 77 | delete(data.metrics.data_received); 78 | 79 | data.metrics.actions = { 80 | "type": "counter", 81 | "values": { 82 | "count": queryProm('sum(action{namespace="default"})')[0].value + 0, 83 | "rate": queryProm('max_over_time(sum(rate(action{namespace="default"}[1m]))[15m:30s])')[0].value + 0, 84 | } 85 | } 86 | 87 | return { 88 | 'stdout': textSummary(data, { enableColors: false }) 89 | }; 90 | }; -------------------------------------------------------------------------------- /dev/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | services: 3 | postgresql: 4 | container_name: temporal-postgresql 5 | environment: 6 | POSTGRES_PASSWORD: temporal 7 | POSTGRES_USER: temporal 8 | image: postgres:13 9 | networks: 10 | - temporal-network 11 | temporal: 12 | container_name: temporal 13 | depends_on: 14 | - postgresql 15 | environment: 16 | - DB=postgresql 17 | - DB_PORT=5432 18 | - POSTGRES_USER=temporal 19 | - POSTGRES_PWD=temporal 20 | - POSTGRES_SEEDS=postgresql 21 | - PROMETHEUS_ENDPOINT=0.0.0.0:8000 22 | image: temporalio/auto-setup:1.20.0 23 | networks: 24 | - temporal-network 25 | ports: 26 | - 7233:7233 27 | benchmark-workers: 28 | container_name: benchmark-workers 29 | depends_on: 30 | - temporal 31 | environment: 32 | - TEMPORAL_GRPC_ENDPOINT=temporal:7233 33 | - TEMPORAL_TASK_QUEUE=benchmark 34 | - PROMETHEUS_ENDPOINT=0.0.0.0:8000 35 | image: ghcr.io/temporalio/benchmark-workers:main 36 | networks: 37 | - temporal-network 38 | temporal-admin-tools: 39 | container_name: temporal-admin-tools 40 | depends_on: 41 | - temporal 42 | environment: 43 | - TEMPORAL_CLI_ADDRESS=temporal:7233 44 | image: temporalio/admin-tools:1.20.0 45 | networks: 46 | - temporal-network 47 | stdin_open: true 48 | tty: 
true 49 | prometheus: 50 | container_name: prometheus 51 | image: prom/prometheus:v2.39.1 52 | command: 53 | - --config.file=/etc/prometheus/prometheus.yml 54 | - --storage.tsdb.path=/prometheus 55 | - --web.console.libraries=/usr/share/prometheus/console_libraries 56 | - --web.console.templates=/usr/share/prometheus/consoles 57 | - --web.enable-remote-write-receiver 58 | ports: 59 | - 9090:9090 60 | volumes: 61 | - type: bind 62 | source: ./prometheus-config.yml 63 | target: /etc/prometheus/prometheus.yml 64 | networks: 65 | - temporal-network 66 | networks: 67 | temporal-network: 68 | driver: bridge 69 | name: temporal-network 70 | -------------------------------------------------------------------------------- /dev/prometheus-config.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 10s 3 | scrape_configs: 4 | - job_name: 'temporal' 5 | metrics_path: /metrics 6 | scheme: http 7 | static_configs: 8 | - targets: 9 | - 'temporal:8000' 10 | - 'benchmark-workers:8000' 11 | -------------------------------------------------------------------------------- /dev/run-benchmark: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run --network temporal-network -i \ 4 | --env TEMPORAL_GRPC_ENDPOINT=temporal:7233 \ 5 | --env K6_OUT=output-prometheus-remote \ 6 | --env K6_PROMETHEUS_REMOTE_URL=http://prometheus:9090/api/v1/write \ 7 | --env PROMETHEUS_ENDPOINT=http://prometheus:9090/ \ 8 | xk6-temporal:main \ 9 | k6 run --tag testid=$(uuidgen) - < $1 10 | -------------------------------------------------------------------------------- /environments/aws/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | -------------------------------------------------------------------------------- /environments/aws/Pulumi.main.yaml: 
-------------------------------------------------------------------------------- 1 | encryptionsalt: v1:JS9jJGyKuKQ=:v1:BKBLkuaWAjVYIS1R:MvrTtSCLoQ3fMHNu/YpzSYFGYoZbeA== 2 | -------------------------------------------------------------------------------- /environments/aws/Pulumi.yaml: -------------------------------------------------------------------------------- 1 | name: temporal-benchmarks-aws-environment 2 | runtime: nodejs 3 | description: AWS Environment for Temporal benchmarks 4 | -------------------------------------------------------------------------------- /environments/aws/README.md: -------------------------------------------------------------------------------- 1 | [![Deploy](https://get.pulumi.com/new/button.svg)](https://app.pulumi.com/new?template=https://github.com/temporalio/benchmark-matrix/tree/master/environments/aws) 2 | 3 | # AWS Environment for Temporal Benchmark Clusters 4 | 5 | This [Pulumi](https://pulumi.com) app creates a VPC, subnets, and an RDS subnet group to hold clusters created for the Temporal Benchmark Matrix. 6 | 7 | ## Deploying 8 | 9 | You can create this application in Pulumi using the button above. Alternatively, if you'd prefer to use local state storage to experiment with the Temporal Benchmark Matrix, follow these steps: 10 | 11 | 1. Configure Pulumi to use local state: 12 | 13 | ```shell 14 | $ pulumi login --local 15 | ``` 16 | 17 | 2.
Bring up a stack: 18 | 19 | ```shell 20 | $ pulumi -s dev up 21 | ``` 22 | 23 | For more information on Pulumi state storage, please see [their docs](https://www.pulumi.com/docs/intro/concepts/state/)
--------------------------------------------------------------------------------
/environments/aws/index.ts:
--------------------------------------------------------------------------------
 1 | import * as aws from "@pulumi/aws";
 2 | import * as awsx from "@pulumi/awsx";
 3 |
 4 | // Number of availability zones requested for the benchmark VPC.
 5 | const azCount = 3;
 6 |
 7 | // Names of the first `azCount` zones reported with state "available".
 8 | export const AvailabilityZones = aws
 9 |     .getAvailabilityZones({ state: "available" })
10 |     .then((available) => available.names.slice(0, azCount));
11 |
12 | // VPC for the benchmark clusters, requested across the zones selected above.
13 | const benchmarkVpc = new awsx.ec2.Vpc("temporal-benchmark", {
14 |     requestedAvailabilityZones: AvailabilityZones,
15 | });
16 |
17 | // NOTE(review): the RDS subnet group is built from the VPC's *public* subnets;
18 | // confirm this is intended rather than privateSubnetIds.
19 | const dbSubnetGroup = new aws.rds.SubnetGroup("temporal-benchmark-rds", {
20 |     subnetIds: benchmarkVpc.publicSubnetIds,
21 | });
22 |
23 | // Service-linked role for opensearchservice.amazonaws.com.
24 | new aws.iam.ServiceLinkedRole("opensearch", {
25 |     awsServiceName: "opensearchservice.amazonaws.com",
26 | });
27 |
28 | // Stack outputs.
29 | export const VpcId = benchmarkVpc.id;
30 | export const PrivateSubnetIds = benchmarkVpc.privateSubnetIds;
31 | export const PublicSubnetIds = benchmarkVpc.publicSubnetIds;
32 | export const RdsSubnetGroupName = dbSubnetGroup.name;
33 | export const Role = "BenchmarkClusterAdmin";
--------------------------------------------------------------------------------
/environments/aws/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "foo", 3 | "main": "index.ts", 4 | "devDependencies": { 5 | "@types/node": "^14" 6 | }, 7 | "dependencies": { 8 | "@pulumi/aws": "^5.0.0", 9 | "@pulumi/awsx": "^0.40.0", 10 | "@pulumi/eks": "^0.42.5", 11 | "@pulumi/pulumi": "^3.0.0" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /k8s/benchmark/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - workers-deployment.yaml 3 | - soak-test-deployment.yaml 4 | -
monitoring-service.yaml 5 | - service-monitor.yaml 6 | -------------------------------------------------------------------------------- /k8s/benchmark/monitoring-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: benchmark 6 | app.kubernetes.io/component: monitoring 7 | name: benchmark-monitoring 8 | spec: 9 | type: ClusterIP 10 | clusterIP: None 11 | internalTrafficPolicy: Cluster 12 | publishNotReadyAddresses: true 13 | ports: 14 | - name: metrics 15 | port: 9090 16 | targetPort: metrics 17 | protocol: TCP 18 | selector: 19 | app.kubernetes.io/name: benchmark 20 | -------------------------------------------------------------------------------- /k8s/benchmark/service-monitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: benchmark-monitor 5 | labels: 6 | app.kubernetes.io/name: benchmark 7 | app.kubernetes.io/component: monitoring 8 | spec: 9 | endpoints: 10 | - port: metrics 11 | interval: 30s 12 | namespaceSelector: 13 | matchNames: 14 | - default 15 | selector: 16 | matchLabels: 17 | app.kubernetes.io/name: benchmark 18 | app.kubernetes.io/component: monitoring 19 | -------------------------------------------------------------------------------- /k8s/benchmark/soak-test-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: benchmark 6 | app.kubernetes.io/component: soak-test 7 | name: benchmark-soak-test 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app.kubernetes.io/name: benchmark 13 | app.kubernetes.io/component: soak-test 14 | template: 15 | metadata: 16 | labels: 17 | app.kubernetes.io/name: benchmark 18 | app.kubernetes.io/component: 
soak-test 19 | spec: 20 | initContainers: 21 | - name: create-namespace 22 | image: temporalio/admin-tools:1.20.0 23 | env: 24 | - name: TEMPORAL_CLI_ADDRESS 25 | value: "temporal-frontend.temporal.svc.cluster.local:7233" 26 | command: ["bash", "-c"] 27 | args: ["tctl --namespace default namespace register || tctl --namespace default namespace describe"] 28 | containers: 29 | - image: ghcr.io/temporalio/benchmark-workers:main 30 | imagePullPolicy: Always 31 | name: benchmark-soak-test 32 | command: ["runner", "-w", "-c", "$(CONCURRENT_WORKFLOWS)", "-t", "ExecuteActivity", '{ "Count": 3, "Activity": "Echo", "Input": { "Message": "test" } }'] 33 | env: 34 | - name: TEMPORAL_GRPC_ENDPOINT 35 | value: "dns:///temporal-frontend.temporal.svc.cluster.local:7233" 36 | - name: PROMETHEUS_ENDPOINT 37 | value: 0.0.0.0:8000 38 | - name: TEMPORAL_NAMESPACE 39 | value: "default" 40 | - name: TEMPORAL_TASK_QUEUE 41 | value: "benchmark" 42 | envFrom: 43 | - configMapRef: 44 | name: benchmark-soaktest-env 45 | ports: 46 | - name: metrics 47 | containerPort: 8000 48 | protocol: TCP 49 | resources: 50 | requests: 51 | cpu: 100m 52 | memory: 32Mi 53 | limits: 54 | cpu: 200m 55 | memory: 64Mi 56 | restartPolicy: Always -------------------------------------------------------------------------------- /k8s/benchmark/workers-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: benchmark 6 | app.kubernetes.io/component: workers 7 | name: benchmark-workers 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app.kubernetes.io/name: benchmark 13 | app.kubernetes.io/component: workers 14 | template: 15 | metadata: 16 | labels: 17 | app.kubernetes.io/name: benchmark 18 | app.kubernetes.io/component: workers 19 | spec: 20 | initContainers: 21 | - name: create-namespace 22 | image: temporalio/admin-tools:1.20.0 23 | env: 24 | - name: 
TEMPORAL_CLI_ADDRESS 25 | value: "temporal-frontend.temporal.svc.cluster.local:7233" 26 | command: ["bash", "-c"] 27 | args: ["tctl --namespace default namespace register || tctl --namespace default namespace describe"] 28 | containers: 29 | - image: ghcr.io/temporalio/benchmark-workers:main 30 | imagePullPolicy: Always 31 | name: benchmark-workers 32 | env: 33 | - name: TEMPORAL_GRPC_ENDPOINT 34 | value: "dns:///temporal-frontend.temporal.svc.cluster.local:7233" 35 | - name: PROMETHEUS_ENDPOINT 36 | value: 0.0.0.0:8000 37 | - name: TEMPORAL_NAMESPACE 38 | value: "default" 39 | - name: TEMPORAL_TASK_QUEUE 40 | value: "benchmark" 41 | envFrom: 42 | - configMapRef: 43 | name: benchmark-worker-env 44 | ports: 45 | - name: metrics 46 | containerPort: 8000 47 | protocol: TCP 48 | resources: 49 | requests: 50 | cpu: 300m 51 | memory: 32Mi 52 | limits: 53 | cpu: 500m 54 | memory: 64Mi 55 | restartPolicy: Always -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/cloudwatch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: grafana-datasources 5 | namespace: monitoring 6 | stringData: 7 | cloudwatch.yaml: |- 8 | { 9 | "apiVersion": 1, 10 | "datasources": [ 11 | { 12 | "access": "proxy", 13 | "editable": false, 14 | "name": "cloudwatch", 15 | "orgId": 1, 16 | "type": "cloudwatch", 17 | "jsonData": { 18 | "authType": "default", 19 | "defaultRegion": "us-west-2", 20 | }, 21 | "version": 1 22 | } 23 | ] 24 | } 25 | type: Opaque 26 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/fetch-dashboards.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | host="https://temporal-benchmark.ngrok.io" 4 | 5 | for uid in $(curl -s "${host}/api/search?tag=temporal" | jq -r '.[] | .uid'); do 6 | dashboard="$(curl -s 
"${host}/api/dashboards/uid/${uid}")" 7 | name=$(echo "${dashboard}" | jq -r '.meta.slug') 8 | echo "${dashboard}" | jq '.dashboard' > ${name}.json 9 | done -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/folder.yaml: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": 1, 3 | "providers": [ 4 | { 5 | "folder": "Temporal", 6 | "folderUid": "", 7 | "name": "temporal", 8 | "options": { 9 | "path": "/grafana-dashboard-definitions/temporal" 10 | }, 11 | "orgId": 1, 12 | "type": "file", 13 | "allowUiUpdates": true 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: grafana 5 | namespace: monitoring 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: grafana 11 | volumeMounts: 12 | - mountPath: /grafana-dashboard-definitions/temporal 13 | name: grafana-temporal-dashboard-definitions 14 | volumes: 15 | - name: grafana-temporal-dashboard-definitions 16 | configMap: 17 | defaultMode: 420 18 | name: grafana-temporal-dashboard-definitions 19 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-frontend.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | 
"graphTooltip": 0, 27 | "id": 33, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "datasource": { 34 | "type": "datasource", 35 | "uid": "grafana" 36 | }, 37 | "gridPos": { 38 | "h": 1, 39 | "w": 24, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 10, 44 | "panels": [], 45 | "title": "Service", 46 | "type": "row" 47 | }, 48 | { 49 | "datasource": { 50 | "type": "prometheus", 51 | "uid": "P1809F7CD0C75ACF3" 52 | }, 53 | "fieldConfig": { 54 | "defaults": { 55 | "color": { 56 | "mode": "palette-classic" 57 | }, 58 | "custom": { 59 | "axisCenteredZero": false, 60 | "axisColorMode": "text", 61 | "axisLabel": "", 62 | "axisPlacement": "auto", 63 | "barAlignment": 0, 64 | "drawStyle": "line", 65 | "fillOpacity": 0, 66 | "gradientMode": "none", 67 | "hideFrom": { 68 | "legend": false, 69 | "tooltip": false, 70 | "viz": false 71 | }, 72 | "lineInterpolation": "linear", 73 | "lineWidth": 1, 74 | "pointSize": 5, 75 | "scaleDistribution": { 76 | "type": "linear" 77 | }, 78 | "showPoints": "never", 79 | "spanNulls": false, 80 | "stacking": { 81 | "group": "A", 82 | "mode": "none" 83 | }, 84 | "thresholdsStyle": { 85 | "mode": "off" 86 | } 87 | }, 88 | "mappings": [], 89 | "thresholds": { 90 | "mode": "absolute", 91 | "steps": [ 92 | { 93 | "color": "green", 94 | "value": null 95 | }, 96 | { 97 | "color": "red", 98 | "value": 80 99 | } 100 | ] 101 | }, 102 | "unit": "s" 103 | }, 104 | "overrides": [] 105 | }, 106 | "gridPos": { 107 | "h": 8, 108 | "w": 12, 109 | "x": 0, 110 | "y": 1 111 | }, 112 | "id": 19, 113 | "options": { 114 | "legend": { 115 | "calcs": [], 116 | "displayMode": "list", 117 | "placement": "bottom", 118 | "showLegend": true 119 | }, 120 | "tooltip": { 121 | "mode": "single", 122 | "sort": "none" 123 | } 124 | }, 125 | "targets": [ 126 | { 127 | "datasource": { 128 | "type": "prometheus", 129 | "uid": "prometheus" 130 | }, 131 | "editorMode": "code", 132 | "exemplar": true, 133 | "expr": "histogram_quantile(0.95, sum 
by (le, operation) (rate(service_latency_bucket{service_name=\"frontend\",operation!~\"Poll.*\"}[$__rate_interval]))) > 0", 134 | "interval": "", 135 | "legendFormat": "__auto", 136 | "range": true, 137 | "refId": "A" 138 | } 139 | ], 140 | "title": "Frontend Request Latency p95", 141 | "type": "timeseries" 142 | }, 143 | { 144 | "datasource": { 145 | "type": "prometheus", 146 | "uid": "P1809F7CD0C75ACF3" 147 | }, 148 | "fieldConfig": { 149 | "defaults": { 150 | "color": { 151 | "mode": "palette-classic" 152 | }, 153 | "custom": { 154 | "axisCenteredZero": false, 155 | "axisColorMode": "text", 156 | "axisLabel": "", 157 | "axisPlacement": "auto", 158 | "barAlignment": 0, 159 | "drawStyle": "line", 160 | "fillOpacity": 0, 161 | "gradientMode": "none", 162 | "hideFrom": { 163 | "legend": false, 164 | "tooltip": false, 165 | "viz": false 166 | }, 167 | "lineInterpolation": "linear", 168 | "lineWidth": 1, 169 | "pointSize": 5, 170 | "scaleDistribution": { 171 | "type": "linear" 172 | }, 173 | "showPoints": "never", 174 | "spanNulls": false, 175 | "stacking": { 176 | "group": "A", 177 | "mode": "none" 178 | }, 179 | "thresholdsStyle": { 180 | "mode": "off" 181 | } 182 | }, 183 | "mappings": [], 184 | "thresholds": { 185 | "mode": "absolute", 186 | "steps": [ 187 | { 188 | "color": "green", 189 | "value": null 190 | }, 191 | { 192 | "color": "red", 193 | "value": 80 194 | } 195 | ] 196 | }, 197 | "unit": "short" 198 | }, 199 | "overrides": [] 200 | }, 201 | "gridPos": { 202 | "h": 8, 203 | "w": 12, 204 | "x": 12, 205 | "y": 1 206 | }, 207 | "id": 15, 208 | "options": { 209 | "legend": { 210 | "calcs": [], 211 | "displayMode": "list", 212 | "placement": "bottom", 213 | "showLegend": true 214 | }, 215 | "tooltip": { 216 | "mode": "single", 217 | "sort": "none" 218 | } 219 | }, 220 | "targets": [ 221 | { 222 | "datasource": { 223 | "type": "prometheus", 224 | "uid": "prometheus" 225 | }, 226 | "editorMode": "code", 227 | "exemplar": true, 228 | "expr": "sum by (error_type) 
(rate(service_error_with_type{service_name=\"frontend\"}[$__rate_interval])) > 0", 229 | "interval": "", 230 | "legendFormat": "{{error_type}}", 231 | "range": true, 232 | "refId": "A" 233 | } 234 | ], 235 | "title": "Frontend Errors", 236 | "type": "timeseries" 237 | }, 238 | { 239 | "collapsed": false, 240 | "gridPos": { 241 | "h": 1, 242 | "w": 24, 243 | "x": 0, 244 | "y": 9 245 | }, 246 | "id": 17, 247 | "panels": [], 248 | "title": "Pods", 249 | "type": "row" 250 | }, 251 | { 252 | "datasource": { 253 | "type": "prometheus", 254 | "uid": "P1809F7CD0C75ACF3" 255 | }, 256 | "fieldConfig": { 257 | "defaults": { 258 | "color": { 259 | "mode": "palette-classic" 260 | }, 261 | "custom": { 262 | "axisCenteredZero": false, 263 | "axisColorMode": "text", 264 | "axisLabel": "", 265 | "axisPlacement": "auto", 266 | "barAlignment": 0, 267 | "drawStyle": "line", 268 | "fillOpacity": 0, 269 | "gradientMode": "none", 270 | "hideFrom": { 271 | "legend": false, 272 | "tooltip": false, 273 | "viz": false 274 | }, 275 | "lineInterpolation": "linear", 276 | "lineWidth": 1, 277 | "pointSize": 5, 278 | "scaleDistribution": { 279 | "type": "linear" 280 | }, 281 | "showPoints": "never", 282 | "spanNulls": false, 283 | "stacking": { 284 | "group": "A", 285 | "mode": "none" 286 | }, 287 | "thresholdsStyle": { 288 | "mode": "off" 289 | } 290 | }, 291 | "mappings": [], 292 | "thresholds": { 293 | "mode": "absolute", 294 | "steps": [ 295 | { 296 | "color": "green", 297 | "value": null 298 | }, 299 | { 300 | "color": "red", 301 | "value": 80 302 | } 303 | ] 304 | }, 305 | "unit": "percentunit" 306 | }, 307 | "overrides": [] 308 | }, 309 | "gridPos": { 310 | "h": 8, 311 | "w": 12, 312 | "x": 0, 313 | "y": 10 314 | }, 315 | "id": 12, 316 | "options": { 317 | "legend": { 318 | "calcs": [], 319 | "displayMode": "list", 320 | "placement": "bottom", 321 | "showLegend": true 322 | }, 323 | "tooltip": { 324 | "mode": "single", 325 | "sort": "none" 326 | } 327 | }, 328 | "targets": [ 329 | { 330 | 
"datasource": { 331 | "type": "prometheus", 332 | "uid": "prometheus" 333 | }, 334 | "editorMode": "code", 335 | "exemplar": true, 336 | "expr": "sum(\n rate(container_cpu_usage_seconds_total{container=\"temporal\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-frontend\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-frontend\"}\n) by (pod)\n", 337 | "interval": "", 338 | "legendFormat": "{{pod}}", 339 | "range": true, 340 | "refId": "A" 341 | } 342 | ], 343 | "title": "Frontend CPU%", 344 | "type": "timeseries" 345 | }, 346 | { 347 | "datasource": { 348 | "type": "prometheus", 349 | "uid": "P1809F7CD0C75ACF3" 350 | }, 351 | "fieldConfig": { 352 | "defaults": { 353 | "color": { 354 | "mode": "palette-classic" 355 | }, 356 | "custom": { 357 | "axisCenteredZero": false, 358 | "axisColorMode": "text", 359 | "axisLabel": "", 360 | "axisPlacement": "auto", 361 | "barAlignment": 0, 362 | "drawStyle": "line", 363 | "fillOpacity": 0, 364 | "gradientMode": "none", 365 | "hideFrom": { 366 | "legend": false, 367 | "tooltip": false, 368 | "viz": false 369 | }, 370 | "lineInterpolation": "linear", 371 | "lineWidth": 1, 372 | "pointSize": 5, 373 | "scaleDistribution": { 374 | "type": "linear" 375 | }, 376 | "showPoints": "never", 377 | "spanNulls": false, 378 | "stacking": { 379 | "group": "A", 380 | "mode": "none" 381 | }, 382 | "thresholdsStyle": { 383 | "mode": "off" 384 | } 385 | }, 386 | "mappings": [], 387 | "thresholds": { 388 | "mode": "absolute", 389 | "steps": [ 390 | { 391 | "color": "green", 392 | "value": null 393 | }, 394 | { 395 | "color": "red", 396 | "value": 80 397 | } 398 | ] 399 | }, 400 | "unit": "percentunit" 
401 | }, 402 | "overrides": [] 403 | }, 404 | "gridPos": { 405 | "h": 8, 406 | "w": 12, 407 | "x": 12, 408 | "y": 10 409 | }, 410 | "id": 13, 411 | "options": { 412 | "legend": { 413 | "calcs": [], 414 | "displayMode": "list", 415 | "placement": "bottom", 416 | "showLegend": true 417 | }, 418 | "tooltip": { 419 | "mode": "single", 420 | "sort": "none" 421 | } 422 | }, 423 | "targets": [ 424 | { 425 | "datasource": { 426 | "type": "prometheus", 427 | "uid": "prometheus" 428 | }, 429 | "editorMode": "code", 430 | "exemplar": true, 431 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"temporal\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-frontend\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-frontend\"}\n) by (pod)\n", 432 | "interval": "", 433 | "legendFormat": "{{pod}}", 434 | "range": true, 435 | "refId": "A" 436 | } 437 | ], 438 | "title": "Frontend Memory%", 439 | "type": "timeseries" 440 | }, 441 | { 442 | "collapsed": false, 443 | "gridPos": { 444 | "h": 1, 445 | "w": 24, 446 | "x": 0, 447 | "y": 18 448 | }, 449 | "id": 22, 450 | "panels": [], 451 | "title": "Balance", 452 | "type": "row" 453 | }, 454 | { 455 | "datasource": { 456 | "type": "prometheus", 457 | "uid": "P1809F7CD0C75ACF3" 458 | }, 459 | "fieldConfig": { 460 | "defaults": { 461 | "color": { 462 | "mode": "palette-classic" 463 | }, 464 | "custom": { 465 | "axisCenteredZero": false, 466 | "axisColorMode": "text", 467 | "axisLabel": "", 468 | "axisPlacement": "auto", 469 | "barAlignment": 0, 470 | "drawStyle": "line", 471 | "fillOpacity": 0, 472 | "gradientMode": "none", 473 | "hideFrom": { 474 | "legend": false, 475 
| "tooltip": false, 476 | "viz": false 477 | }, 478 | "lineInterpolation": "linear", 479 | "lineWidth": 1, 480 | "pointSize": 5, 481 | "scaleDistribution": { 482 | "type": "linear" 483 | }, 484 | "showPoints": "never", 485 | "spanNulls": false, 486 | "stacking": { 487 | "group": "A", 488 | "mode": "none" 489 | }, 490 | "thresholdsStyle": { 491 | "mode": "off" 492 | } 493 | }, 494 | "mappings": [], 495 | "thresholds": { 496 | "mode": "absolute", 497 | "steps": [ 498 | { 499 | "color": "green", 500 | "value": null 501 | }, 502 | { 503 | "color": "red", 504 | "value": 80 505 | } 506 | ] 507 | }, 508 | "unit": "short" 509 | }, 510 | "overrides": [] 511 | }, 512 | "gridPos": { 513 | "h": 8, 514 | "w": 12, 515 | "x": 0, 516 | "y": 19 517 | }, 518 | "id": 20, 519 | "options": { 520 | "legend": { 521 | "calcs": [], 522 | "displayMode": "list", 523 | "placement": "bottom", 524 | "showLegend": true 525 | }, 526 | "tooltip": { 527 | "mode": "single", 528 | "sort": "none" 529 | } 530 | }, 531 | "targets": [ 532 | { 533 | "datasource": { 534 | "type": "prometheus", 535 | "uid": "prometheus" 536 | }, 537 | "editorMode": "code", 538 | "exemplar": true, 539 | "expr": "sum by (pod) (rate(service_latency_count{service_name=\"frontend\"}[$__rate_interval]))", 540 | "interval": "", 541 | "legendFormat": "__auto", 542 | "range": true, 543 | "refId": "A" 544 | } 545 | ], 546 | "title": "RPS", 547 | "type": "timeseries" 548 | } 549 | ], 550 | "refresh": "30s", 551 | "schemaVersion": 37, 552 | "style": "dark", 553 | "tags": [ 554 | "frontend", 555 | "temporal" 556 | ], 557 | "templating": { 558 | "list": [] 559 | }, 560 | "time": { 561 | "from": "now-15m", 562 | "to": "now" 563 | }, 564 | "timepicker": {}, 565 | "timezone": "browser", 566 | "title": "Soak Test - Frontend", 567 | "uid": "7e2f4673-fd2a-44a5-8b16-3b20b0427a30", 568 | "version": 2, 569 | "weekStart": "" 570 | } 571 | -------------------------------------------------------------------------------- 
/k8s/monitoring/dashboards/soak-test-history.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 28, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "datasource": { 34 | "type": "datasource", 35 | "uid": "grafana" 36 | }, 37 | "gridPos": { 38 | "h": 1, 39 | "w": 24, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 10, 44 | "panels": [], 45 | "title": "Service", 46 | "type": "row" 47 | }, 48 | { 49 | "datasource": { 50 | "type": "prometheus", 51 | "uid": "P1809F7CD0C75ACF3" 52 | }, 53 | "fieldConfig": { 54 | "defaults": { 55 | "color": { 56 | "mode": "palette-classic" 57 | }, 58 | "custom": { 59 | "axisCenteredZero": false, 60 | "axisColorMode": "text", 61 | "axisLabel": "", 62 | "axisPlacement": "auto", 63 | "barAlignment": 0, 64 | "drawStyle": "line", 65 | "fillOpacity": 0, 66 | "gradientMode": "none", 67 | "hideFrom": { 68 | "legend": false, 69 | "tooltip": false, 70 | "viz": false 71 | }, 72 | "lineInterpolation": "linear", 73 | "lineWidth": 1, 74 | "pointSize": 5, 75 | "scaleDistribution": { 76 | "type": "linear" 77 | }, 78 | "showPoints": "never", 79 | "spanNulls": false, 80 | "stacking": { 81 | "group": "A", 82 | "mode": "none" 83 | }, 84 | "thresholdsStyle": { 85 | "mode": "off" 86 | } 87 | }, 88 | "mappings": [], 89 | "thresholds": { 90 | "mode": "absolute", 91 | "steps": [ 92 | { 93 | "color": "green", 94 | "value": null 95 | }, 96 | { 97 | "color": "red", 98 | "value": 80 99 | } 100 | 
] 101 | }, 102 | "unit": "s" 103 | }, 104 | "overrides": [] 105 | }, 106 | "gridPos": { 107 | "h": 8, 108 | "w": 12, 109 | "x": 0, 110 | "y": 1 111 | }, 112 | "id": 18, 113 | "options": { 114 | "legend": { 115 | "calcs": [], 116 | "displayMode": "list", 117 | "placement": "bottom", 118 | "showLegend": true 119 | }, 120 | "tooltip": { 121 | "mode": "single", 122 | "sort": "none" 123 | } 124 | }, 125 | "targets": [ 126 | { 127 | "datasource": { 128 | "type": "prometheus", 129 | "uid": "prometheus" 130 | }, 131 | "editorMode": "code", 132 | "exemplar": true, 133 | "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(service_latency_bucket{service_name=\"history\",operation!~\"Poll.*\"}[$__rate_interval])))", 134 | "interval": "", 135 | "legendFormat": "__auto", 136 | "range": true, 137 | "refId": "A" 138 | } 139 | ], 140 | "title": "History Request Latency p95", 141 | "type": "timeseries" 142 | }, 143 | { 144 | "datasource": { 145 | "type": "prometheus", 146 | "uid": "P1809F7CD0C75ACF3" 147 | }, 148 | "fieldConfig": { 149 | "defaults": { 150 | "color": { 151 | "mode": "palette-classic" 152 | }, 153 | "custom": { 154 | "axisCenteredZero": false, 155 | "axisColorMode": "text", 156 | "axisLabel": "", 157 | "axisPlacement": "auto", 158 | "barAlignment": 0, 159 | "drawStyle": "line", 160 | "fillOpacity": 0, 161 | "gradientMode": "none", 162 | "hideFrom": { 163 | "legend": false, 164 | "tooltip": false, 165 | "viz": false 166 | }, 167 | "lineInterpolation": "linear", 168 | "lineWidth": 1, 169 | "pointSize": 5, 170 | "scaleDistribution": { 171 | "type": "linear" 172 | }, 173 | "showPoints": "never", 174 | "spanNulls": false, 175 | "stacking": { 176 | "group": "A", 177 | "mode": "none" 178 | }, 179 | "thresholdsStyle": { 180 | "mode": "off" 181 | } 182 | }, 183 | "mappings": [], 184 | "thresholds": { 185 | "mode": "absolute", 186 | "steps": [ 187 | { 188 | "color": "green", 189 | "value": null 190 | }, 191 | { 192 | "color": "red", 193 | "value": 80 194 | } 195 | ] 
196 | }, 197 | "unit": "short" 198 | }, 199 | "overrides": [] 200 | }, 201 | "gridPos": { 202 | "h": 8, 203 | "w": 12, 204 | "x": 12, 205 | "y": 1 206 | }, 207 | "id": 15, 208 | "options": { 209 | "legend": { 210 | "calcs": [], 211 | "displayMode": "list", 212 | "placement": "bottom", 213 | "showLegend": true 214 | }, 215 | "tooltip": { 216 | "mode": "single", 217 | "sort": "none" 218 | } 219 | }, 220 | "targets": [ 221 | { 222 | "datasource": { 223 | "type": "prometheus", 224 | "uid": "prometheus" 225 | }, 226 | "editorMode": "code", 227 | "exemplar": true, 228 | "expr": "sum by (error_type) (rate(service_error_with_type{service_name=\"history\"}[$__rate_interval])) > 0", 229 | "interval": "", 230 | "legendFormat": "{{error_type}}", 231 | "range": true, 232 | "refId": "A" 233 | } 234 | ], 235 | "title": "History Error Rate", 236 | "type": "timeseries" 237 | }, 238 | { 239 | "collapsed": false, 240 | "gridPos": { 241 | "h": 1, 242 | "w": 24, 243 | "x": 0, 244 | "y": 9 245 | }, 246 | "id": 17, 247 | "panels": [], 248 | "title": "Pods", 249 | "type": "row" 250 | }, 251 | { 252 | "datasource": { 253 | "type": "prometheus", 254 | "uid": "P1809F7CD0C75ACF3" 255 | }, 256 | "fieldConfig": { 257 | "defaults": { 258 | "color": { 259 | "mode": "palette-classic" 260 | }, 261 | "custom": { 262 | "axisCenteredZero": false, 263 | "axisColorMode": "text", 264 | "axisLabel": "", 265 | "axisPlacement": "auto", 266 | "barAlignment": 0, 267 | "drawStyle": "line", 268 | "fillOpacity": 0, 269 | "gradientMode": "none", 270 | "hideFrom": { 271 | "legend": false, 272 | "tooltip": false, 273 | "viz": false 274 | }, 275 | "lineInterpolation": "linear", 276 | "lineWidth": 1, 277 | "pointSize": 5, 278 | "scaleDistribution": { 279 | "type": "linear" 280 | }, 281 | "showPoints": "never", 282 | "spanNulls": false, 283 | "stacking": { 284 | "group": "A", 285 | "mode": "none" 286 | }, 287 | "thresholdsStyle": { 288 | "mode": "off" 289 | } 290 | }, 291 | "mappings": [], 292 | "thresholds": { 293 | 
"mode": "absolute", 294 | "steps": [ 295 | { 296 | "color": "green", 297 | "value": null 298 | }, 299 | { 300 | "color": "red", 301 | "value": 80 302 | } 303 | ] 304 | }, 305 | "unit": "percentunit" 306 | }, 307 | "overrides": [ 308 | { 309 | "matcher": { 310 | "id": "byFrameRefID", 311 | "options": "B" 312 | }, 313 | "properties": [ 314 | { 315 | "id": "color", 316 | "value": { 317 | "fixedColor": "dark-red", 318 | "mode": "fixed" 319 | } 320 | }, 321 | { 322 | "id": "custom.fillOpacity", 323 | "value": 50 324 | } 325 | ] 326 | } 327 | ] 328 | }, 329 | "gridPos": { 330 | "h": 8, 331 | "w": 12, 332 | "x": 0, 333 | "y": 10 334 | }, 335 | "id": 12, 336 | "options": { 337 | "legend": { 338 | "calcs": [], 339 | "displayMode": "list", 340 | "placement": "bottom", 341 | "showLegend": true 342 | }, 343 | "tooltip": { 344 | "mode": "single", 345 | "sort": "none" 346 | } 347 | }, 348 | "targets": [ 349 | { 350 | "datasource": { 351 | "type": "prometheus", 352 | "uid": "prometheus" 353 | }, 354 | "editorMode": "code", 355 | "exemplar": true, 356 | "expr": "sum(\n rate(container_cpu_usage_seconds_total{container=\"temporal\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-history\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-history\"}\n) by (pod)\n", 357 | "interval": "", 358 | "legendFormat": "__auto", 359 | "range": true, 360 | "refId": "A" 361 | }, 362 | { 363 | "datasource": { 364 | "type": "prometheus", 365 | "uid": "P1809F7CD0C75ACF3" 366 | }, 367 | "editorMode": "code", 368 | "expr": "sum(\n increase(container_cpu_cfs_throttled_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", 
container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-history\"}\n)\n/\nsum(\n increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-history\"}\n) > 0\n", 369 | "hide": false, 370 | "legendFormat": "throttle", 371 | "range": true, 372 | "refId": "B" 373 | } 374 | ], 375 | "title": "History CPU Request % Used", 376 | "type": "timeseries" 377 | }, 378 | { 379 | "datasource": { 380 | "type": "prometheus", 381 | "uid": "P1809F7CD0C75ACF3" 382 | }, 383 | "fieldConfig": { 384 | "defaults": { 385 | "color": { 386 | "mode": "palette-classic" 387 | }, 388 | "custom": { 389 | "axisCenteredZero": false, 390 | "axisColorMode": "text", 391 | "axisLabel": "", 392 | "axisPlacement": "auto", 393 | "barAlignment": 0, 394 | "drawStyle": "line", 395 | "fillOpacity": 0, 396 | "gradientMode": "none", 397 | "hideFrom": { 398 | "legend": false, 399 | "tooltip": false, 400 | "viz": false 401 | }, 402 | "lineInterpolation": "linear", 403 | "lineWidth": 1, 404 | "pointSize": 5, 405 | "scaleDistribution": { 406 | "type": "linear" 407 | }, 408 | "showPoints": "never", 409 | "spanNulls": false, 410 | "stacking": { 411 | "group": "A", 412 | "mode": "none" 413 | }, 414 | "thresholdsStyle": { 415 | "mode": "off" 416 | } 417 | }, 418 | "mappings": [], 419 | "thresholds": { 420 | "mode": "absolute", 421 | "steps": [ 422 | { 423 | "color": "green", 424 | "value": null 425 | }, 426 | { 427 | "color": "red", 428 | "value": 80 429 | } 430 | ] 431 | }, 432 | "unit": "percentunit" 433 | }, 434 | "overrides": [] 435 | }, 436 | "gridPos": { 437 | "h": 8, 438 | "w": 12, 439 | "x": 12, 440 | "y": 10 441 | }, 442 | "id": 13, 443 | "options": { 444 | 
"legend": { 445 | "calcs": [], 446 | "displayMode": "list", 447 | "placement": "bottom", 448 | "showLegend": true 449 | }, 450 | "tooltip": { 451 | "mode": "single", 452 | "sort": "none" 453 | } 454 | }, 455 | "targets": [ 456 | { 457 | "datasource": { 458 | "type": "prometheus", 459 | "uid": "prometheus" 460 | }, 461 | "editorMode": "code", 462 | "exemplar": true, 463 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"temporal\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-history\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-history\"}\n) by (pod)\n", 464 | "interval": "", 465 | "legendFormat": "{{pod}}", 466 | "range": true, 467 | "refId": "A" 468 | } 469 | ], 470 | "title": "History Memory Request % Used", 471 | "type": "timeseries" 472 | }, 473 | { 474 | "collapsed": false, 475 | "gridPos": { 476 | "h": 1, 477 | "w": 24, 478 | "x": 0, 479 | "y": 18 480 | }, 481 | "id": 29, 482 | "panels": [], 483 | "title": "Tuning", 484 | "type": "row" 485 | }, 486 | { 487 | "datasource": { 488 | "type": "prometheus", 489 | "uid": "P1809F7CD0C75ACF3" 490 | }, 491 | "fieldConfig": { 492 | "defaults": { 493 | "color": { 494 | "mode": "palette-classic" 495 | }, 496 | "custom": { 497 | "axisCenteredZero": false, 498 | "axisColorMode": "text", 499 | "axisLabel": "", 500 | "axisPlacement": "auto", 501 | "barAlignment": 0, 502 | "drawStyle": "line", 503 | "fillOpacity": 0, 504 | "gradientMode": "none", 505 | "hideFrom": { 506 | "legend": false, 507 | "tooltip": false, 508 | "viz": false 509 | }, 510 | "lineInterpolation": "linear", 511 | "lineWidth": 1, 512 | "pointSize": 5, 513 | "scaleDistribution": { 
514 | "type": "linear" 515 | }, 516 | "showPoints": "never", 517 | "spanNulls": false, 518 | "stacking": { 519 | "group": "A", 520 | "mode": "none" 521 | }, 522 | "thresholdsStyle": { 523 | "mode": "dashed" 524 | } 525 | }, 526 | "mappings": [], 527 | "thresholds": { 528 | "mode": "absolute", 529 | "steps": [ 530 | { 531 | "color": "green", 532 | "value": null 533 | } 534 | ] 535 | }, 536 | "unit": "s" 537 | }, 538 | "overrides": [] 539 | }, 540 | "gridPos": { 541 | "h": 8, 542 | "w": 12, 543 | "x": 0, 544 | "y": 19 545 | }, 546 | "id": 25, 547 | "options": { 548 | "legend": { 549 | "calcs": [], 550 | "displayMode": "list", 551 | "placement": "bottom", 552 | "showLegend": true 553 | }, 554 | "tooltip": { 555 | "mode": "single", 556 | "sort": "none" 557 | } 558 | }, 559 | "targets": [ 560 | { 561 | "datasource": { 562 | "type": "prometheus", 563 | "uid": "prometheus" 564 | }, 565 | "editorMode": "code", 566 | "expr": "histogram_quantile(0.95, sum by (le) (rate(lock_latency_bucket[$__rate_interval])))", 567 | "legendFormat": "Shard", 568 | "range": true, 569 | "refId": "A" 570 | }, 571 | { 572 | "datasource": { 573 | "type": "prometheus", 574 | "uid": "P1809F7CD0C75ACF3" 575 | }, 576 | "editorMode": "code", 577 | "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(cache_latency_bucket{operation=\"HistoryCacheGetOrCreate\"}[$__rate_interval])))", 578 | "hide": false, 579 | "legendFormat": "Workflow", 580 | "range": true, 581 | "refId": "B" 582 | } 583 | ], 584 | "title": "Lock Latency p95", 585 | "type": "timeseries" 586 | }, 587 | { 588 | "datasource": { 589 | "type": "prometheus", 590 | "uid": "P1809F7CD0C75ACF3" 591 | }, 592 | "fieldConfig": { 593 | "defaults": { 594 | "color": { 595 | "mode": "palette-classic" 596 | }, 597 | "custom": { 598 | "axisCenteredZero": false, 599 | "axisColorMode": "text", 600 | "axisLabel": "", 601 | "axisPlacement": "auto", 602 | "barAlignment": 0, 603 | "drawStyle": "line", 604 | "fillOpacity": 0, 605 | "gradientMode": 
"none", 606 | "hideFrom": { 607 | "legend": false, 608 | "tooltip": false, 609 | "viz": false 610 | }, 611 | "lineInterpolation": "linear", 612 | "lineWidth": 1, 613 | "pointSize": 5, 614 | "scaleDistribution": { 615 | "type": "linear" 616 | }, 617 | "showPoints": "never", 618 | "spanNulls": false, 619 | "stacking": { 620 | "group": "A", 621 | "mode": "none" 622 | }, 623 | "thresholdsStyle": { 624 | "mode": "dashed" 625 | } 626 | }, 627 | "mappings": [], 628 | "thresholds": { 629 | "mode": "absolute", 630 | "steps": [ 631 | { 632 | "color": "green", 633 | "value": null 634 | } 635 | ] 636 | }, 637 | "unit": "s" 638 | }, 639 | "overrides": [] 640 | }, 641 | "gridPos": { 642 | "h": 8, 643 | "w": 12, 644 | "x": 12, 645 | "y": 19 646 | }, 647 | "id": 32, 648 | "options": { 649 | "legend": { 650 | "calcs": [], 651 | "displayMode": "list", 652 | "placement": "bottom", 653 | "showLegend": true 654 | }, 655 | "tooltip": { 656 | "mode": "single", 657 | "sort": "none" 658 | } 659 | }, 660 | "targets": [ 661 | { 662 | "datasource": { 663 | "type": "prometheus", 664 | "uid": "prometheus" 665 | }, 666 | "editorMode": "code", 667 | "expr": "histogram_quantile(0.50, sum by (le) (rate(task_latency_processing_bucket[$__rate_interval])))", 668 | "legendFormat": "p50", 669 | "range": true, 670 | "refId": "A" 671 | }, 672 | { 673 | "datasource": { 674 | "type": "prometheus", 675 | "uid": "P1809F7CD0C75ACF3" 676 | }, 677 | "editorMode": "code", 678 | "expr": "histogram_quantile(0.95, sum by (le) (rate(task_latency_processing_bucket[$__rate_interval])))", 679 | "hide": false, 680 | "legendFormat": "p95", 681 | "range": true, 682 | "refId": "B" 683 | } 684 | ], 685 | "title": "Task Latency", 686 | "type": "timeseries" 687 | }, 688 | { 689 | "collapsed": true, 690 | "gridPos": { 691 | "h": 1, 692 | "w": 24, 693 | "x": 0, 694 | "y": 27 695 | }, 696 | "id": 31, 697 | "panels": [ 698 | { 699 | "datasource": { 700 | "type": "prometheus", 701 | "uid": "P1809F7CD0C75ACF3" 702 | }, 703 | 
"fieldConfig": { 704 | "defaults": { 705 | "color": { 706 | "mode": "palette-classic" 707 | }, 708 | "custom": { 709 | "axisCenteredZero": false, 710 | "axisColorMode": "text", 711 | "axisLabel": "", 712 | "axisPlacement": "auto", 713 | "barAlignment": 0, 714 | "drawStyle": "line", 715 | "fillOpacity": 0, 716 | "gradientMode": "none", 717 | "hideFrom": { 718 | "legend": false, 719 | "tooltip": false, 720 | "viz": false 721 | }, 722 | "lineInterpolation": "linear", 723 | "lineWidth": 1, 724 | "pointSize": 5, 725 | "scaleDistribution": { 726 | "type": "linear" 727 | }, 728 | "showPoints": "never", 729 | "spanNulls": false, 730 | "stacking": { 731 | "group": "A", 732 | "mode": "none" 733 | }, 734 | "thresholdsStyle": { 735 | "mode": "off" 736 | } 737 | }, 738 | "mappings": [], 739 | "min": 0, 740 | "thresholds": { 741 | "mode": "absolute", 742 | "steps": [ 743 | { 744 | "color": "green", 745 | "value": null 746 | }, 747 | { 748 | "color": "red", 749 | "value": 80 750 | } 751 | ] 752 | }, 753 | "unit": "short" 754 | }, 755 | "overrides": [] 756 | }, 757 | "gridPos": { 758 | "h": 8, 759 | "w": 12, 760 | "x": 0, 761 | "y": 28 762 | }, 763 | "id": 19, 764 | "options": { 765 | "legend": { 766 | "calcs": [], 767 | "displayMode": "list", 768 | "placement": "bottom", 769 | "showLegend": true 770 | }, 771 | "tooltip": { 772 | "mode": "single", 773 | "sort": "none" 774 | } 775 | }, 776 | "targets": [ 777 | { 778 | "datasource": { 779 | "type": "prometheus", 780 | "uid": "prometheus" 781 | }, 782 | "editorMode": "code", 783 | "exemplar": true, 784 | "expr": "sum by (pod) (numshards_gauge)", 785 | "interval": "", 786 | "legendFormat": "{{error_type}}", 787 | "range": true, 788 | "refId": "A" 789 | }, 790 | { 791 | "datasource": { 792 | "type": "prometheus", 793 | "uid": "prometheus" 794 | }, 795 | "exemplar": true, 796 | "expr": "", 797 | "hide": false, 798 | "interval": "", 799 | "legendFormat": "", 800 | "refId": "B" 801 | } 802 | ], 803 | "title": "History Shard Balance", 804 
| "type": "timeseries" 805 | } 806 | ], 807 | "title": "Misc", 808 | "type": "row" 809 | } 810 | ], 811 | "refresh": "30s", 812 | "schemaVersion": 37, 813 | "style": "dark", 814 | "tags": [ 815 | "history", 816 | "temporal" 817 | ], 818 | "templating": { 819 | "list": [] 820 | }, 821 | "time": { 822 | "from": "now-15m", 823 | "to": "now" 824 | }, 825 | "timepicker": {}, 826 | "timezone": "", 827 | "title": "Soak Test - History", 828 | "uid": "82d00f2f-2548-496b-a627-0108bf7cb990", 829 | "version": 3, 830 | "weekStart": "" 831 | } 832 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-matching.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 29, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "datasource": { 34 | "type": "datasource", 35 | "uid": "grafana" 36 | }, 37 | "gridPos": { 38 | "h": 1, 39 | "w": 24, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 10, 44 | "panels": [], 45 | "title": "Service", 46 | "type": "row" 47 | }, 48 | { 49 | "datasource": { 50 | "type": "prometheus", 51 | "uid": "P1809F7CD0C75ACF3" 52 | }, 53 | "fieldConfig": { 54 | "defaults": { 55 | "color": { 56 | "mode": "palette-classic" 57 | }, 58 | "custom": { 59 | "axisCenteredZero": false, 60 | "axisColorMode": "text", 61 | "axisLabel": "", 62 | "axisPlacement": "auto", 63 | "barAlignment": 0, 64 | "drawStyle": "line", 65 | "fillOpacity": 0, 66 | 
"gradientMode": "none", 67 | "hideFrom": { 68 | "legend": false, 69 | "tooltip": false, 70 | "viz": false 71 | }, 72 | "lineInterpolation": "linear", 73 | "lineWidth": 1, 74 | "pointSize": 5, 75 | "scaleDistribution": { 76 | "type": "linear" 77 | }, 78 | "showPoints": "never", 79 | "spanNulls": false, 80 | "stacking": { 81 | "group": "A", 82 | "mode": "none" 83 | }, 84 | "thresholdsStyle": { 85 | "mode": "off" 86 | } 87 | }, 88 | "mappings": [], 89 | "thresholds": { 90 | "mode": "absolute", 91 | "steps": [ 92 | { 93 | "color": "green", 94 | "value": null 95 | }, 96 | { 97 | "color": "red", 98 | "value": 80 99 | } 100 | ] 101 | }, 102 | "unit": "s" 103 | }, 104 | "overrides": [] 105 | }, 106 | "gridPos": { 107 | "h": 8, 108 | "w": 12, 109 | "x": 0, 110 | "y": 1 111 | }, 112 | "id": 24, 113 | "options": { 114 | "legend": { 115 | "calcs": [], 116 | "displayMode": "list", 117 | "placement": "bottom", 118 | "showLegend": true 119 | }, 120 | "tooltip": { 121 | "mode": "single", 122 | "sort": "none" 123 | } 124 | }, 125 | "targets": [ 126 | { 127 | "datasource": { 128 | "type": "prometheus", 129 | "uid": "prometheus" 130 | }, 131 | "editorMode": "code", 132 | "exemplar": true, 133 | "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(service_latency_bucket{service_name=\"matching\",operation!~\"Poll.*\"}[$__rate_interval])))", 134 | "interval": "", 135 | "legendFormat": "__auto", 136 | "range": true, 137 | "refId": "A" 138 | } 139 | ], 140 | "title": "Matching Request Latency p95", 141 | "type": "timeseries" 142 | }, 143 | { 144 | "datasource": { 145 | "type": "prometheus", 146 | "uid": "P1809F7CD0C75ACF3" 147 | }, 148 | "fieldConfig": { 149 | "defaults": { 150 | "color": { 151 | "mode": "palette-classic" 152 | }, 153 | "custom": { 154 | "axisCenteredZero": false, 155 | "axisColorMode": "text", 156 | "axisLabel": "", 157 | "axisPlacement": "auto", 158 | "barAlignment": 0, 159 | "drawStyle": "line", 160 | "fillOpacity": 0, 161 | "gradientMode": "none", 162 | 
"hideFrom": { 163 | "legend": false, 164 | "tooltip": false, 165 | "viz": false 166 | }, 167 | "lineInterpolation": "linear", 168 | "lineWidth": 1, 169 | "pointSize": 5, 170 | "scaleDistribution": { 171 | "type": "linear" 172 | }, 173 | "showPoints": "never", 174 | "spanNulls": false, 175 | "stacking": { 176 | "group": "A", 177 | "mode": "none" 178 | }, 179 | "thresholdsStyle": { 180 | "mode": "off" 181 | } 182 | }, 183 | "mappings": [], 184 | "thresholds": { 185 | "mode": "absolute", 186 | "steps": [ 187 | { 188 | "color": "green", 189 | "value": null 190 | }, 191 | { 192 | "color": "red", 193 | "value": 80 194 | } 195 | ] 196 | }, 197 | "unit": "short" 198 | }, 199 | "overrides": [] 200 | }, 201 | "gridPos": { 202 | "h": 8, 203 | "w": 12, 204 | "x": 12, 205 | "y": 1 206 | }, 207 | "id": 26, 208 | "options": { 209 | "legend": { 210 | "calcs": [], 211 | "displayMode": "list", 212 | "placement": "bottom", 213 | "showLegend": true 214 | }, 215 | "tooltip": { 216 | "mode": "single", 217 | "sort": "none" 218 | } 219 | }, 220 | "targets": [ 221 | { 222 | "datasource": { 223 | "type": "prometheus", 224 | "uid": "prometheus" 225 | }, 226 | "editorMode": "code", 227 | "exemplar": true, 228 | "expr": "sum by (error_type) (rate(service_error_with_type{service_name=\"matching\"}[$__rate_interval])) > 0", 229 | "interval": "", 230 | "legendFormat": "{{error_type}}", 231 | "range": true, 232 | "refId": "A" 233 | } 234 | ], 235 | "title": "Matching Error Rate", 236 | "type": "timeseries" 237 | }, 238 | { 239 | "collapsed": false, 240 | "gridPos": { 241 | "h": 1, 242 | "w": 24, 243 | "x": 0, 244 | "y": 9 245 | }, 246 | "id": 17, 247 | "panels": [], 248 | "title": "Pods", 249 | "type": "row" 250 | }, 251 | { 252 | "datasource": { 253 | "type": "prometheus", 254 | "uid": "P1809F7CD0C75ACF3" 255 | }, 256 | "fieldConfig": { 257 | "defaults": { 258 | "color": { 259 | "mode": "palette-classic" 260 | }, 261 | "custom": { 262 | "axisCenteredZero": false, 263 | "axisColorMode": "text", 
264 | "axisLabel": "", 265 | "axisPlacement": "auto", 266 | "barAlignment": 0, 267 | "drawStyle": "line", 268 | "fillOpacity": 0, 269 | "gradientMode": "none", 270 | "hideFrom": { 271 | "legend": false, 272 | "tooltip": false, 273 | "viz": false 274 | }, 275 | "lineInterpolation": "linear", 276 | "lineWidth": 1, 277 | "pointSize": 5, 278 | "scaleDistribution": { 279 | "type": "linear" 280 | }, 281 | "showPoints": "never", 282 | "spanNulls": false, 283 | "stacking": { 284 | "group": "A", 285 | "mode": "none" 286 | }, 287 | "thresholdsStyle": { 288 | "mode": "off" 289 | } 290 | }, 291 | "mappings": [], 292 | "thresholds": { 293 | "mode": "absolute", 294 | "steps": [ 295 | { 296 | "color": "green", 297 | "value": null 298 | }, 299 | { 300 | "color": "red", 301 | "value": 80 302 | } 303 | ] 304 | }, 305 | "unit": "percentunit" 306 | }, 307 | "overrides": [ 308 | { 309 | "matcher": { 310 | "id": "byFrameRefID", 311 | "options": "B" 312 | }, 313 | "properties": [ 314 | { 315 | "id": "color", 316 | "value": { 317 | "fixedColor": "dark-red", 318 | "mode": "fixed" 319 | } 320 | }, 321 | { 322 | "id": "custom.fillOpacity", 323 | "value": 50 324 | } 325 | ] 326 | } 327 | ] 328 | }, 329 | "gridPos": { 330 | "h": 8, 331 | "w": 12, 332 | "x": 0, 333 | "y": 10 334 | }, 335 | "id": 28, 336 | "options": { 337 | "legend": { 338 | "calcs": [], 339 | "displayMode": "list", 340 | "placement": "bottom", 341 | "showLegend": true 342 | }, 343 | "tooltip": { 344 | "mode": "single", 345 | "sort": "none" 346 | } 347 | }, 348 | "targets": [ 349 | { 350 | "datasource": { 351 | "type": "prometheus", 352 | "uid": "prometheus" 353 | }, 354 | "editorMode": "code", 355 | "exemplar": true, 356 | "expr": "sum(\n rate(container_cpu_usage_seconds_total{container=\"temporal\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-matching\"}\n) by (pod)\n/sum(\n 
kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-matching\"}\n) by (pod)\n", 357 | "interval": "", 358 | "legendFormat": "__auto", 359 | "range": true, 360 | "refId": "A" 361 | }, 362 | { 363 | "datasource": { 364 | "type": "prometheus", 365 | "uid": "P1809F7CD0C75ACF3" 366 | }, 367 | "editorMode": "code", 368 | "expr": "sum(\n increase(container_cpu_cfs_throttled_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-matching\"}\n)\n/\nsum(\n increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-matching\"}\n) > 0\n", 369 | "hide": false, 370 | "legendFormat": "throttle", 371 | "range": true, 372 | "refId": "B" 373 | } 374 | ], 375 | "title": "Matching CPU Request % Used", 376 | "type": "timeseries" 377 | }, 378 | { 379 | "datasource": { 380 | "type": "prometheus", 381 | "uid": "P1809F7CD0C75ACF3" 382 | }, 383 | "fieldConfig": { 384 | "defaults": { 385 | "color": { 386 | "mode": "palette-classic" 387 | }, 388 | "custom": { 389 | "axisCenteredZero": false, 390 | "axisColorMode": "text", 391 | "axisLabel": "", 392 | "axisPlacement": "auto", 393 | "barAlignment": 0, 394 | "drawStyle": "line", 395 | "fillOpacity": 0, 396 | "gradientMode": "none", 397 | "hideFrom": { 398 | "legend": false, 399 | "tooltip": false, 400 | "viz": false 401 | }, 402 | "lineInterpolation": "linear", 403 | "lineWidth": 1, 404 | "pointSize": 5, 405 | "scaleDistribution": { 406
| "type": "linear" 407 | }, 408 | "showPoints": "never", 409 | "spanNulls": false, 410 | "stacking": { 411 | "group": "A", 412 | "mode": "none" 413 | }, 414 | "thresholdsStyle": { 415 | "mode": "off" 416 | } 417 | }, 418 | "mappings": [], 419 | "thresholds": { 420 | "mode": "absolute", 421 | "steps": [ 422 | { 423 | "color": "green", 424 | "value": null 425 | }, 426 | { 427 | "color": "red", 428 | "value": 80 429 | } 430 | ] 431 | }, 432 | "unit": "percentunit" 433 | }, 434 | "overrides": [] 435 | }, 436 | "gridPos": { 437 | "h": 8, 438 | "w": 12, 439 | "x": 12, 440 | "y": 10 441 | }, 442 | "id": 30, 443 | "options": { 444 | "legend": { 445 | "calcs": [], 446 | "displayMode": "list", 447 | "placement": "bottom", 448 | "showLegend": true 449 | }, 450 | "tooltip": { 451 | "mode": "single", 452 | "sort": "none" 453 | } 454 | }, 455 | "targets": [ 456 | { 457 | "datasource": { 458 | "type": "prometheus", 459 | "uid": "prometheus" 460 | }, 461 | "editorMode": "code", 462 | "exemplar": true, 463 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"temporal\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-matching\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-matching\"}\n) by (pod)\n", 464 | "interval": "", 465 | "legendFormat": "{{pod}}", 466 | "range": true, 467 | "refId": "A" 468 | } 469 | ], 470 | "title": "Matching Memory Request % Used", 471 | "type": "timeseries" 472 | }, 473 | { 474 | "collapsed": false, 475 | "gridPos": { 476 | "h": 1, 477 | "w": 24, 478 | "x": 0, 479 | "y": 18 480 | }, 481 | "id": 22, 482 | "panels": [], 483 | "title": "Misc", 484 | "type": "row" 485 | }, 486 | { 487 | 
"datasource": { 488 | "type": "prometheus", 489 | "uid": "P1809F7CD0C75ACF3" 490 | }, 491 | "fieldConfig": { 492 | "defaults": { 493 | "color": { 494 | "mode": "palette-classic" 495 | }, 496 | "custom": { 497 | "axisCenteredZero": false, 498 | "axisColorMode": "text", 499 | "axisLabel": "", 500 | "axisPlacement": "auto", 501 | "barAlignment": 0, 502 | "drawStyle": "line", 503 | "fillOpacity": 0, 504 | "gradientMode": "none", 505 | "hideFrom": { 506 | "legend": false, 507 | "tooltip": false, 508 | "viz": false 509 | }, 510 | "lineInterpolation": "linear", 511 | "lineWidth": 1, 512 | "pointSize": 5, 513 | "scaleDistribution": { 514 | "type": "linear" 515 | }, 516 | "showPoints": "never", 517 | "spanNulls": false, 518 | "stacking": { 519 | "group": "A", 520 | "mode": "none" 521 | }, 522 | "thresholdsStyle": { 523 | "mode": "off" 524 | } 525 | }, 526 | "mappings": [], 527 | "thresholds": { 528 | "mode": "absolute", 529 | "steps": [ 530 | { 531 | "color": "green", 532 | "value": null 533 | }, 534 | { 535 | "color": "red", 536 | "value": 80 537 | } 538 | ] 539 | }, 540 | "unit": "short" 541 | }, 542 | "overrides": [] 543 | }, 544 | "gridPos": { 545 | "h": 8, 546 | "w": 12, 547 | "x": 0, 548 | "y": 19 549 | }, 550 | "id": 20, 551 | "options": { 552 | "legend": { 553 | "calcs": [], 554 | "displayMode": "list", 555 | "placement": "bottom", 556 | "showLegend": true 557 | }, 558 | "tooltip": { 559 | "mode": "single", 560 | "sort": "none" 561 | } 562 | }, 563 | "targets": [ 564 | { 565 | "datasource": { 566 | "type": "prometheus", 567 | "uid": "prometheus" 568 | }, 569 | "exemplar": true, 570 | "expr": "sum by (pod) (loaded_task_queue_count{exported_namespace=\"default\"})", 571 | "interval": "", 572 | "legendFormat": "{{pod}}", 573 | "refId": "A" 574 | }, 575 | { 576 | "datasource": { 577 | "type": "prometheus", 578 | "uid": "prometheus" 579 | }, 580 | "exemplar": true, 581 | "expr": "", 582 | "hide": false, 583 | "interval": "", 584 | "legendFormat": "", 585 | 
"refId": "B" 586 | } 587 | ], 588 | "title": "Matching Partitions", 589 | "type": "timeseries" 590 | } 591 | ], 592 | "refresh": "30s", 593 | "schemaVersion": 37, 594 | "style": "dark", 595 | "tags": [ 596 | "matching", 597 | "temporal" 598 | ], 599 | "templating": { 600 | "list": [] 601 | }, 602 | "time": { 603 | "from": "now-15m", 604 | "to": "now" 605 | }, 606 | "timepicker": {}, 607 | "timezone": "", 608 | "title": "Soak Test - Matching", 609 | "uid": "d9bfbe59-a99f-4c89-951e-4ec0167ecfb0", 610 | "version": 2, 611 | "weekStart": "" 612 | } 613 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-persistence.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 34, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "datasource": { 34 | "type": "datasource", 35 | "uid": "grafana" 36 | }, 37 | "gridPos": { 38 | "h": 1, 39 | "w": 24, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 6, 44 | "panels": [], 45 | "title": "Service", 46 | "type": "row" 47 | }, 48 | { 49 | "datasource": { 50 | "type": "prometheus", 51 | "uid": "P1809F7CD0C75ACF3" 52 | }, 53 | "fieldConfig": { 54 | "defaults": { 55 | "color": { 56 | "mode": "palette-classic" 57 | }, 58 | "custom": { 59 | "axisCenteredZero": false, 60 | "axisColorMode": "text", 61 | "axisLabel": "", 62 | "axisPlacement": "auto", 63 | "barAlignment": 0, 64 | "drawStyle": "line", 65 | 
"fillOpacity": 0, 66 | "gradientMode": "none", 67 | "hideFrom": { 68 | "legend": false, 69 | "tooltip": false, 70 | "viz": false 71 | }, 72 | "lineInterpolation": "linear", 73 | "lineWidth": 1, 74 | "pointSize": 5, 75 | "scaleDistribution": { 76 | "type": "linear" 77 | }, 78 | "showPoints": "never", 79 | "spanNulls": false, 80 | "stacking": { 81 | "group": "A", 82 | "mode": "none" 83 | }, 84 | "thresholdsStyle": { 85 | "mode": "off" 86 | } 87 | }, 88 | "mappings": [], 89 | "thresholds": { 90 | "mode": "absolute", 91 | "steps": [ 92 | { 93 | "color": "green", 94 | "value": null 95 | }, 96 | { 97 | "color": "red", 98 | "value": 80 99 | } 100 | ] 101 | }, 102 | "unit": "ops" 103 | }, 104 | "overrides": [] 105 | }, 106 | "gridPos": { 107 | "h": 8, 108 | "w": 12, 109 | "x": 0, 110 | "y": 1 111 | }, 112 | "id": 8, 113 | "options": { 114 | "legend": { 115 | "calcs": [], 116 | "displayMode": "list", 117 | "placement": "bottom", 118 | "showLegend": true 119 | }, 120 | "tooltip": { 121 | "mode": "single", 122 | "sort": "none" 123 | } 124 | }, 125 | "targets": [ 126 | { 127 | "datasource": { 128 | "type": "prometheus", 129 | "uid": "prometheus" 130 | }, 131 | "editorMode": "code", 132 | "exemplar": false, 133 | "expr": "sum by (operation) (rate(persistence_requests[$__rate_interval])) > 0", 134 | "interval": "", 135 | "legendFormat": "{{operation}}", 136 | "range": true, 137 | "refId": "A" 138 | } 139 | ], 140 | "title": "RPS", 141 | "type": "timeseries" 142 | }, 143 | { 144 | "datasource": { 145 | "type": "prometheus", 146 | "uid": "P1809F7CD0C75ACF3" 147 | }, 148 | "fieldConfig": { 149 | "defaults": { 150 | "color": { 151 | "mode": "palette-classic" 152 | }, 153 | "custom": { 154 | "axisCenteredZero": false, 155 | "axisColorMode": "text", 156 | "axisLabel": "", 157 | "axisPlacement": "auto", 158 | "barAlignment": 0, 159 | "drawStyle": "line", 160 | "fillOpacity": 0, 161 | "gradientMode": "none", 162 | "hideFrom": { 163 | "legend": false, 164 | "tooltip": false, 165 | "viz": 
false 166 | }, 167 | "lineInterpolation": "linear", 168 | "lineWidth": 1, 169 | "pointSize": 5, 170 | "scaleDistribution": { 171 | "type": "linear" 172 | }, 173 | "showPoints": "never", 174 | "spanNulls": false, 175 | "stacking": { 176 | "group": "A", 177 | "mode": "none" 178 | }, 179 | "thresholdsStyle": { 180 | "mode": "off" 181 | } 182 | }, 183 | "mappings": [], 184 | "thresholds": { 185 | "mode": "absolute", 186 | "steps": [ 187 | { 188 | "color": "green", 189 | "value": null 190 | }, 191 | { 192 | "color": "red", 193 | "value": 80 194 | } 195 | ] 196 | }, 197 | "unit": "s" 198 | }, 199 | "overrides": [] 200 | }, 201 | "gridPos": { 202 | "h": 8, 203 | "w": 12, 204 | "x": 12, 205 | "y": 1 206 | }, 207 | "id": 4, 208 | "options": { 209 | "legend": { 210 | "calcs": [], 211 | "displayMode": "list", 212 | "placement": "bottom", 213 | "showLegend": true 214 | }, 215 | "tooltip": { 216 | "mode": "single", 217 | "sort": "none" 218 | } 219 | }, 220 | "targets": [ 221 | { 222 | "datasource": { 223 | "type": "prometheus", 224 | "uid": "prometheus" 225 | }, 226 | "editorMode": "code", 227 | "exemplar": true, 228 | "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(persistence_latency_bucket[$__rate_interval]))) > 0", 229 | "interval": "", 230 | "legendFormat": "{{operation}}", 231 | "range": true, 232 | "refId": "A" 233 | } 234 | ], 235 | "title": "Request Latency", 236 | "type": "timeseries" 237 | }, 238 | { 239 | "datasource": { 240 | "type": "prometheus", 241 | "uid": "P1809F7CD0C75ACF3" 242 | }, 243 | "fieldConfig": { 244 | "defaults": { 245 | "color": { 246 | "mode": "palette-classic" 247 | }, 248 | "custom": { 249 | "axisCenteredZero": false, 250 | "axisColorMode": "text", 251 | "axisLabel": "", 252 | "axisPlacement": "auto", 253 | "barAlignment": 0, 254 | "drawStyle": "line", 255 | "fillOpacity": 0, 256 | "gradientMode": "none", 257 | "hideFrom": { 258 | "legend": false, 259 | "tooltip": false, 260 | "viz": false 261 | }, 262 | "lineInterpolation": 
"linear", 263 | "lineWidth": 1, 264 | "pointSize": 5, 265 | "scaleDistribution": { 266 | "type": "linear" 267 | }, 268 | "showPoints": "never", 269 | "spanNulls": false, 270 | "stacking": { 271 | "group": "A", 272 | "mode": "none" 273 | }, 274 | "thresholdsStyle": { 275 | "mode": "off" 276 | } 277 | }, 278 | "mappings": [], 279 | "thresholds": { 280 | "mode": "absolute", 281 | "steps": [ 282 | { 283 | "color": "green", 284 | "value": null 285 | }, 286 | { 287 | "color": "red", 288 | "value": 80 289 | } 290 | ] 291 | }, 292 | "unit": "reqps" 293 | }, 294 | "overrides": [] 295 | }, 296 | "gridPos": { 297 | "h": 8, 298 | "w": 12, 299 | "x": 0, 300 | "y": 9 301 | }, 302 | "id": 16, 303 | "options": { 304 | "legend": { 305 | "calcs": [], 306 | "displayMode": "list", 307 | "placement": "bottom", 308 | "showLegend": true 309 | }, 310 | "tooltip": { 311 | "mode": "single", 312 | "sort": "none" 313 | } 314 | }, 315 | "targets": [ 316 | { 317 | "datasource": { 318 | "type": "prometheus", 319 | "uid": "prometheus" 320 | }, 321 | "editorMode": "code", 322 | "exemplar": true, 323 | "expr": "sum by (error_type) (rate(persistence_error_with_type[$__rate_interval])) > 0", 324 | "interval": "", 325 | "legendFormat": "{{error_type}}", 326 | "range": true, 327 | "refId": "A" 328 | } 329 | ], 330 | "title": "Request Errors", 331 | "type": "timeseries" 332 | }, 333 | { 334 | "datasource": { 335 | "type": "cloudwatch", 336 | "uid": "P561CB7FAE9DC47C2" 337 | }, 338 | "fieldConfig": { 339 | "defaults": { 340 | "color": { 341 | "mode": "palette-classic" 342 | }, 343 | "custom": { 344 | "axisCenteredZero": false, 345 | "axisColorMode": "text", 346 | "axisLabel": "", 347 | "axisPlacement": "auto", 348 | "barAlignment": 0, 349 | "drawStyle": "line", 350 | "fillOpacity": 0, 351 | "gradientMode": "none", 352 | "hideFrom": { 353 | "legend": false, 354 | "tooltip": false, 355 | "viz": false 356 | }, 357 | "lineInterpolation": "linear", 358 | "lineWidth": 1, 359 | "pointSize": 5, 360 | 
"scaleDistribution": { 361 | "type": "linear" 362 | }, 363 | "showPoints": "never", 364 | "spanNulls": false, 365 | "stacking": { 366 | "group": "A", 367 | "mode": "none" 368 | }, 369 | "thresholdsStyle": { 370 | "mode": "off" 371 | } 372 | }, 373 | "mappings": [], 374 | "thresholds": { 375 | "mode": "absolute", 376 | "steps": [ 377 | { 378 | "color": "green", 379 | "value": null 380 | }, 381 | { 382 | "color": "red", 383 | "value": 80 384 | } 385 | ] 386 | } 387 | }, 388 | "overrides": [] 389 | }, 390 | "gridPos": { 391 | "h": 8, 392 | "w": 12, 393 | "x": 0, 394 | "y": 17 395 | }, 396 | "id": 18, 397 | "options": { 398 | "legend": { 399 | "calcs": [], 400 | "displayMode": "list", 401 | "placement": "bottom", 402 | "showLegend": true 403 | }, 404 | "tooltip": { 405 | "mode": "single", 406 | "sort": "none" 407 | } 408 | }, 409 | "targets": [ 410 | { 411 | "datasource": { 412 | "type": "cloudwatch", 413 | "uid": "P561CB7FAE9DC47C2" 414 | }, 415 | "dimensions": { 416 | "DBInstanceIdentifier": "$dbidentifier" 417 | }, 418 | "expression": "", 419 | "id": "", 420 | "label": "", 421 | "matchExact": true, 422 | "metricEditorMode": 0, 423 | "metricName": "CPUUtilization", 424 | "metricQueryType": 0, 425 | "namespace": "AWS/RDS", 426 | "period": "", 427 | "queryMode": "Metrics", 428 | "refId": "A", 429 | "region": "default", 430 | "sqlExpression": "", 431 | "statistic": "Average" 432 | } 433 | ], 434 | "title": "RDS CPU", 435 | "type": "timeseries" 436 | } 437 | ], 438 | "refresh": "30s", 439 | "schemaVersion": 37, 440 | "style": "dark", 441 | "tags": [ 442 | "persistence", 443 | "temporal" 444 | ], 445 | "templating": { 446 | "list": [ 447 | { 448 | "hide": 2, 449 | "name": "dbidentifier", 450 | "query": "eks-rds-mysql-scaling-series20230504100109469600000005", 451 | "skipUrlSync": false, 452 | "type": "constant" 453 | } 454 | ] 455 | }, 456 | "time": { 457 | "from": "now-15m", 458 | "to": "now" 459 | }, 460 | "timepicker": {}, 461 | "timezone": "", 462 | "title": "Soak 
Test - Persistence", 463 | "uid": "d2bab180-bba7-4cdd-a946-75543f8c512e", 464 | "version": 2, 465 | "weekStart": "" 466 | } 467 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-pods.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 30, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "gridPos": { 34 | "h": 1, 35 | "w": 24, 36 | "x": 0, 37 | "y": 0 38 | }, 39 | "id": 4, 40 | "panels": [], 41 | "repeat": "services", 42 | "repeatDirection": "h", 43 | "title": "$services", 44 | "type": "row" 45 | }, 46 | { 47 | "datasource": { 48 | "type": "prometheus", 49 | "uid": "P1809F7CD0C75ACF3" 50 | }, 51 | "fieldConfig": { 52 | "defaults": { 53 | "color": { 54 | "mode": "palette-classic" 55 | }, 56 | "custom": { 57 | "axisCenteredZero": false, 58 | "axisColorMode": "text", 59 | "axisLabel": "", 60 | "axisPlacement": "auto", 61 | "barAlignment": 0, 62 | "drawStyle": "line", 63 | "fillOpacity": 0, 64 | "gradientMode": "none", 65 | "hideFrom": { 66 | "legend": false, 67 | "tooltip": false, 68 | "viz": false 69 | }, 70 | "lineInterpolation": "linear", 71 | "lineWidth": 1, 72 | "pointSize": 5, 73 | "scaleDistribution": { 74 | "type": "linear" 75 | }, 76 | "showPoints": "never", 77 | "spanNulls": false, 78 | "stacking": { 79 | "group": "A", 80 | "mode": "none" 81 | }, 82 | "thresholdsStyle": { 83 | "mode": "off" 84 | } 85 | }, 86 | 
"mappings": [], 87 | "thresholds": { 88 | "mode": "absolute", 89 | "steps": [ 90 | { 91 | "color": "green", 92 | "value": null 93 | }, 94 | { 95 | "color": "red", 96 | "value": 80 97 | } 98 | ] 99 | }, 100 | "unit": "percentunit" 101 | }, 102 | "overrides": [ 103 | { 104 | "matcher": { 105 | "id": "byFrameRefID", 106 | "options": "B" 107 | }, 108 | "properties": [ 109 | { 110 | "id": "color", 111 | "value": { 112 | "fixedColor": "dark-red", 113 | "mode": "fixed" 114 | } 115 | }, 116 | { 117 | "id": "custom.fillOpacity", 118 | "value": 50 119 | } 120 | ] 121 | } 122 | ] 123 | }, 124 | "gridPos": { 125 | "h": 8, 126 | "w": 12, 127 | "x": 0, 128 | "y": 1 129 | }, 130 | "id": 16, 131 | "options": { 132 | "legend": { 133 | "calcs": [], 134 | "displayMode": "list", 135 | "placement": "bottom", 136 | "showLegend": true 137 | }, 138 | "tooltip": { 139 | "mode": "single", 140 | "sort": "none" 141 | } 142 | }, 143 | "targets": [ 144 | { 145 | "datasource": { 146 | "type": "prometheus", 147 | "uid": "prometheus" 148 | }, 149 | "editorMode": "code", 150 | "exemplar": true, 151 | "expr": "sum(\n rate(container_cpu_usage_seconds_total{container=\"temporal\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) by (pod)\n", 152 | "interval": "", 153 | "legendFormat": "__auto", 154 | "range": true, 155 | "refId": "A" 156 | }, 157 | { 158 | "datasource": { 159 | "type": "prometheus", 160 | "uid": "P1809F7CD0C75ACF3" 161 | }, 162 | "editorMode": "code", 163 | "expr": "sum(\n increase(container_cpu_cfs_throttled_periods_total{job=\"kubelet\", 
metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n)\n/\nsum(\n increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) > 0\n", 164 | "hide": false, 165 | "legendFormat": "throttle", 166 | "range": true, 167 | "refId": "B" 168 | } 169 | ], 170 | "title": "$services CPU Request % Used", 171 | "type": "timeseries" 172 | }, 173 | { 174 | "datasource": { 175 | "type": "prometheus", 176 | "uid": "P1809F7CD0C75ACF3" 177 | }, 178 | "fieldConfig": { 179 | "defaults": { 180 | "color": { 181 | "mode": "palette-classic" 182 | }, 183 | "custom": { 184 | "axisCenteredZero": false, 185 | "axisColorMode": "text", 186 | "axisLabel": "", 187 | "axisPlacement": "auto", 188 | "barAlignment": 0, 189 | "drawStyle": "line", 190 | "fillOpacity": 0, 191 | "gradientMode": "none", 192 | "hideFrom": { 193 | "legend": false, 194 | "tooltip": false, 195 | "viz": false 196 | }, 197 | "lineInterpolation": "linear", 198 | "lineWidth": 1, 199 | "pointSize": 5, 200 | "scaleDistribution": { 201 | "type": "linear" 202 | }, 203 | "showPoints": "never", 204 | "spanNulls": false, 205 | "stacking": { 206 | "group": "A", 207 | "mode": "none" 208 | }, 209 | "thresholdsStyle": { 210 | "mode": "off" 211 | } 212 | }, 213 | "mappings": [], 214 | "thresholds": { 215 | "mode": "absolute", 216 | "steps": [ 217 | { 218 | "color": "green", 219 | "value": null 220 | }, 221 | { 222 | "color": "red", 223 | "value": 80 224 | } 225 | ] 226 | }, 227 | "unit": "percentunit" 228 | }, 229 | "overrides": [] 230 | }, 231 | "gridPos": { 232 | "h": 8, 233 | "w": 12, 234 | "x": 12, 235 | "y": 1 236 | }, 237 | 
"id": 17, 238 | "options": { 239 | "legend": { 240 | "calcs": [], 241 | "displayMode": "list", 242 | "placement": "bottom", 243 | "showLegend": true 244 | }, 245 | "tooltip": { 246 | "mode": "single", 247 | "sort": "none" 248 | } 249 | }, 250 | "targets": [ 251 | { 252 | "datasource": { 253 | "type": "prometheus", 254 | "uid": "prometheus" 255 | }, 256 | "editorMode": "code", 257 | "exemplar": true, 258 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"temporal\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) by (pod)\n", 259 | "interval": "", 260 | "legendFormat": "{{pod}}", 261 | "range": true, 262 | "refId": "A" 263 | } 264 | ], 265 | "title": "$services Memory Request % Used", 266 | "type": "timeseries" 267 | }, 268 | { 269 | "datasource": { 270 | "type": "prometheus", 271 | "uid": "P1809F7CD0C75ACF3" 272 | }, 273 | "fieldConfig": { 274 | "defaults": { 275 | "color": { 276 | "mode": "palette-classic" 277 | }, 278 | "custom": { 279 | "axisCenteredZero": false, 280 | "axisColorMode": "text", 281 | "axisLabel": "", 282 | "axisPlacement": "auto", 283 | "barAlignment": 0, 284 | "drawStyle": "line", 285 | "fillOpacity": 0, 286 | "gradientMode": "none", 287 | "hideFrom": { 288 | "legend": false, 289 | "tooltip": false, 290 | "viz": false 291 | }, 292 | "lineInterpolation": "linear", 293 | "lineWidth": 1, 294 | "pointSize": 5, 295 | "scaleDistribution": { 296 | "type": "linear" 297 | }, 298 | "showPoints": "never", 299 | "spanNulls": false, 300 | "stacking": { 301 | "group": "A", 302 | "mode": "none" 303 | }, 304 | 
"thresholdsStyle": { 305 | "mode": "off" 306 | } 307 | }, 308 | "mappings": [], 309 | "thresholds": { 310 | "mode": "absolute", 311 | "steps": [ 312 | { 313 | "color": "green", 314 | "value": null 315 | }, 316 | { 317 | "color": "red", 318 | "value": 80 319 | } 320 | ] 321 | }, 322 | "unit": "short" 323 | }, 324 | "overrides": [ 325 | { 326 | "matcher": { 327 | "id": "byFrameRefID", 328 | "options": "B" 329 | }, 330 | "properties": [ 331 | { 332 | "id": "color", 333 | "value": { 334 | "fixedColor": "dark-orange", 335 | "mode": "fixed" 336 | } 337 | }, 338 | { 339 | "id": "custom.lineStyle", 340 | "value": { 341 | "dash": [ 342 | 10, 343 | 10 344 | ], 345 | "fill": "dash" 346 | } 347 | } 348 | ] 349 | }, 350 | { 351 | "matcher": { 352 | "id": "byFrameRefID", 353 | "options": "C" 354 | }, 355 | "properties": [ 356 | { 357 | "id": "color", 358 | "value": { 359 | "fixedColor": "dark-red", 360 | "mode": "fixed" 361 | } 362 | }, 363 | { 364 | "id": "custom.lineStyle", 365 | "value": { 366 | "dash": [ 367 | 10, 368 | 10 369 | ], 370 | "fill": "dash" 371 | } 372 | } 373 | ] 374 | } 375 | ] 376 | }, 377 | "gridPos": { 378 | "h": 8, 379 | "w": 12, 380 | "x": 0, 381 | "y": 9 382 | }, 383 | "id": 18, 384 | "options": { 385 | "legend": { 386 | "calcs": [], 387 | "displayMode": "list", 388 | "placement": "bottom", 389 | "showLegend": true 390 | }, 391 | "tooltip": { 392 | "mode": "single", 393 | "sort": "none" 394 | } 395 | }, 396 | "targets": [ 397 | { 398 | "datasource": { 399 | "type": "prometheus", 400 | "uid": "prometheus" 401 | }, 402 | "editorMode": "code", 403 | "exemplar": true, 404 | "expr": "sum(\n rate(container_cpu_usage_seconds_total{container=\"temporal\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) by (pod)", 405 | "interval": "", 406 | "legendFormat": "__auto", 407 | "range": true, 408 | "refId": "A" 409 | }, 410 | { 411 | 
"datasource": { 412 | "type": "prometheus", 413 | "uid": "P1809F7CD0C75ACF3" 414 | }, 415 | "editorMode": "code", 416 | "expr": "avg(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"})", 417 | "hide": false, 418 | "legendFormat": "container request", 419 | "range": true, 420 | "refId": "B" 421 | }, 422 | { 423 | "datasource": { 424 | "type": "prometheus", 425 | "uid": "P1809F7CD0C75ACF3" 426 | }, 427 | "editorMode": "code", 428 | "expr": "avg(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"})", 429 | "hide": false, 430 | "legendFormat": "container limit", 431 | "range": true, 432 | "refId": "C" 433 | } 434 | ], 435 | "title": "$services CPU", 436 | "type": "timeseries" 437 | }, 438 | { 439 | "datasource": { 440 | "type": "prometheus", 441 | "uid": "P1809F7CD0C75ACF3" 442 | }, 443 | "fieldConfig": { 444 | "defaults": { 445 | "color": { 446 | "mode": "palette-classic" 447 | }, 448 | "custom": { 449 | "axisCenteredZero": false, 450 | "axisColorMode": "text", 451 | "axisLabel": "", 452 | "axisPlacement": "auto", 453 | "barAlignment": 0, 454 | "drawStyle": "line", 455 | "fillOpacity": 0, 456 | "gradientMode": "none", 457 | "hideFrom": { 458 | "legend": false, 459 | "tooltip": false, 460 | "viz": false 461 | }, 462 | "lineInterpolation": "linear", 463 | "lineWidth": 1, 464 | "pointSize": 5, 465 | "scaleDistribution": { 466 | "type": "linear" 467 | }, 468 | "showPoints": "never", 469 | "spanNulls": false, 470 | "stacking": { 471 | "group": "A", 472 | "mode": "none" 473 | }, 474 | "thresholdsStyle": { 475 | "mode": "off" 476 | } 477 | }, 478 
| "mappings": [], 479 | "thresholds": { 480 | "mode": "absolute", 481 | "steps": [ 482 | { 483 | "color": "green", 484 | "value": null 485 | }, 486 | { 487 | "color": "red", 488 | "value": 80 489 | } 490 | ] 491 | }, 492 | "unit": "bytes" 493 | }, 494 | "overrides": [ 495 | { 496 | "matcher": { 497 | "id": "byFrameRefID", 498 | "options": "B" 499 | }, 500 | "properties": [ 501 | { 502 | "id": "color", 503 | "value": { 504 | "fixedColor": "dark-orange", 505 | "mode": "fixed" 506 | } 507 | }, 508 | { 509 | "id": "custom.lineStyle", 510 | "value": { 511 | "dash": [ 512 | 10, 513 | 10 514 | ], 515 | "fill": "dash" 516 | } 517 | } 518 | ] 519 | }, 520 | { 521 | "matcher": { 522 | "id": "byFrameRefID", 523 | "options": "C" 524 | }, 525 | "properties": [ 526 | { 527 | "id": "color", 528 | "value": { 529 | "fixedColor": "dark-red", 530 | "mode": "fixed" 531 | } 532 | }, 533 | { 534 | "id": "custom.lineStyle", 535 | "value": { 536 | "dash": [ 537 | 10, 538 | 10 539 | ], 540 | "fill": "dash" 541 | } 542 | } 543 | ] 544 | } 545 | ] 546 | }, 547 | "gridPos": { 548 | "h": 8, 549 | "w": 12, 550 | "x": 12, 551 | "y": 9 552 | }, 553 | "id": 19, 554 | "options": { 555 | "legend": { 556 | "calcs": [], 557 | "displayMode": "list", 558 | "placement": "bottom", 559 | "showLegend": true 560 | }, 561 | "tooltip": { 562 | "mode": "single", 563 | "sort": "none" 564 | } 565 | }, 566 | "targets": [ 567 | { 568 | "datasource": { 569 | "type": "prometheus", 570 | "uid": "prometheus" 571 | }, 572 | "editorMode": "code", 573 | "exemplar": true, 574 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"temporal\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"}\n) by (pod)", 575 | "interval": "", 576 | "legendFormat": "{{pod}}", 577 | "range": true, 578 | "refId": "A" 579 | }, 580 | { 581 | "datasource": { 582 | "type": "prometheus", 583 | 
"uid": "P1809F7CD0C75ACF3" 584 | }, 585 | "editorMode": "code", 586 | "expr": "avg(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"})", 587 | "hide": false, 588 | "legendFormat": "container request", 589 | "range": true, 590 | "refId": "B" 591 | }, 592 | { 593 | "datasource": { 594 | "type": "prometheus", 595 | "uid": "P1809F7CD0C75ACF3" 596 | }, 597 | "editorMode": "code", 598 | "expr": "avg(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-$services\"})", 599 | "hide": false, 600 | "legendFormat": "container limit", 601 | "range": true, 602 | "refId": "C" 603 | } 604 | ], 605 | "title": "$services memory", 606 | "type": "timeseries" 607 | } 608 | ], 609 | "refresh": "30s", 610 | "schemaVersion": 37, 611 | "style": "dark", 612 | "tags": [ 613 | "temporal", 614 | "pods" 615 | ], 616 | "templating": { 617 | "list": [ 618 | { 619 | "current": { 620 | "selected": true, 621 | "text": [ 622 | "history" 623 | ], 624 | "value": [ 625 | "history" 626 | ] 627 | }, 628 | "hide": 0, 629 | "includeAll": true, 630 | "multi": true, 631 | "name": "services", 632 | "options": [ 633 | { 634 | "selected": false, 635 | "text": "All", 636 | "value": "$__all" 637 | }, 638 | { 639 | "selected": false, 640 | "text": "frontend", 641 | "value": "frontend" 642 | }, 643 | { 644 | "selected": true, 645 | "text": "history", 646 | "value": "history" 647 | }, 648 | { 649 | "selected": false, 650 | "text": "matching", 651 | "value": "matching" 652 | }, 653 | { 654 | "selected": false, 655 | "text": "worker", 656 | "value": "worker" 657 | } 658 | ], 659 | "query": 
"frontend,history,matching,worker", 660 | "queryValue": "", 661 | "skipUrlSync": false, 662 | "type": "custom" 663 | } 664 | ] 665 | }, 666 | "time": { 667 | "from": "now-15m", 668 | "to": "now" 669 | }, 670 | "timepicker": {}, 671 | "timezone": "", 672 | "title": "Soak Test - Pods", 673 | "uid": "JjZ8XAsVk", 674 | "version": 3, 675 | "weekStart": "" 676 | } 677 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-polling.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 35, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "datasource": { 33 | "type": "prometheus", 34 | "uid": "P1809F7CD0C75ACF3" 35 | }, 36 | "fieldConfig": { 37 | "defaults": { 38 | "color": { 39 | "mode": "palette-classic" 40 | }, 41 | "custom": { 42 | "axisCenteredZero": false, 43 | "axisColorMode": "text", 44 | "axisLabel": "", 45 | "axisPlacement": "auto", 46 | "barAlignment": 0, 47 | "drawStyle": "line", 48 | "fillOpacity": 0, 49 | "gradientMode": "none", 50 | "hideFrom": { 51 | "legend": false, 52 | "tooltip": false, 53 | "viz": false 54 | }, 55 | "lineInterpolation": "linear", 56 | "lineWidth": 1, 57 | "pointSize": 5, 58 | "scaleDistribution": { 59 | "type": "linear" 60 | }, 61 | "showPoints": "never", 62 | "spanNulls": false, 63 | "stacking": { 64 | "group": "A", 65 | "mode": "none" 66 | }, 67 | "thresholdsStyle": { 68 | "mode": "off" 69 | } 70 | }, 71 | "mappings": [], 72 
| "thresholds": { 73 | "mode": "absolute", 74 | "steps": [ 75 | { 76 | "color": "green", 77 | "value": null 78 | }, 79 | { 80 | "color": "red", 81 | "value": 80 82 | } 83 | ] 84 | }, 85 | "unit": "percentunit" 86 | }, 87 | "overrides": [] 88 | }, 89 | "gridPos": { 90 | "h": 8, 91 | "w": 12, 92 | "x": 0, 93 | "y": 0 94 | }, 95 | "id": 28, 96 | "options": { 97 | "legend": { 98 | "calcs": [], 99 | "displayMode": "list", 100 | "placement": "bottom", 101 | "showLegend": true 102 | }, 103 | "tooltip": { 104 | "mode": "single", 105 | "sort": "none" 106 | } 107 | }, 108 | "targets": [ 109 | { 110 | "datasource": { 111 | "type": "prometheus", 112 | "uid": "P1809F7CD0C75ACF3" 113 | }, 114 | "editorMode": "code", 115 | "expr": "sum (rate(poll_success_sync{task_type=\"Workflow\"}[$__rate_interval])) / sum (rate(poll_success{task_type=\"Workflow\"}[$__rate_interval]))", 116 | "legendFormat": "Workflow", 117 | "range": true, 118 | "refId": "A" 119 | }, 120 | { 121 | "datasource": { 122 | "type": "prometheus", 123 | "uid": "P1809F7CD0C75ACF3" 124 | }, 125 | "editorMode": "code", 126 | "expr": "sum (rate(poll_success_sync{task_type=\"Activity\"}[$__rate_interval])) / sum (rate(poll_success{task_type=\"Activity\"}[$__rate_interval]))", 127 | "hide": false, 128 | "legendFormat": "Activity", 129 | "range": true, 130 | "refId": "B" 131 | } 132 | ], 133 | "title": "Poll Sync Rate", 134 | "type": "timeseries" 135 | }, 136 | { 137 | "datasource": { 138 | "type": "prometheus", 139 | "uid": "P1809F7CD0C75ACF3" 140 | }, 141 | "fieldConfig": { 142 | "defaults": { 143 | "color": { 144 | "mode": "palette-classic" 145 | }, 146 | "custom": { 147 | "axisCenteredZero": false, 148 | "axisColorMode": "text", 149 | "axisLabel": "", 150 | "axisPlacement": "auto", 151 | "barAlignment": 0, 152 | "drawStyle": "line", 153 | "fillOpacity": 0, 154 | "gradientMode": "none", 155 | "hideFrom": { 156 | "legend": false, 157 | "tooltip": false, 158 | "viz": false 159 | }, 160 | "lineInterpolation": "linear", 161 
| "lineWidth": 1, 162 | "pointSize": 5, 163 | "scaleDistribution": { 164 | "type": "linear" 165 | }, 166 | "showPoints": "never", 167 | "spanNulls": false, 168 | "stacking": { 169 | "group": "A", 170 | "mode": "none" 171 | }, 172 | "thresholdsStyle": { 173 | "mode": "off" 174 | } 175 | }, 176 | "mappings": [], 177 | "thresholds": { 178 | "mode": "absolute", 179 | "steps": [ 180 | { 181 | "color": "green", 182 | "value": null 183 | }, 184 | { 185 | "color": "red", 186 | "value": 80 187 | } 188 | ] 189 | }, 190 | "unit": "percentunit" 191 | }, 192 | "overrides": [] 193 | }, 194 | "gridPos": { 195 | "h": 8, 196 | "w": 12, 197 | "x": 12, 198 | "y": 0 199 | }, 200 | "id": 31, 201 | "options": { 202 | "legend": { 203 | "calcs": [], 204 | "displayMode": "list", 205 | "placement": "bottom", 206 | "showLegend": true 207 | }, 208 | "tooltip": { 209 | "mode": "single", 210 | "sort": "none" 211 | } 212 | }, 213 | "targets": [ 214 | { 215 | "datasource": { 216 | "type": "prometheus", 217 | "uid": "P1809F7CD0C75ACF3" 218 | }, 219 | "editorMode": "code", 220 | "expr": "sum (rate(poll_timeouts{task_type=\"Workflow\"}[$__rate_interval])) / sum (rate(poll_success{task_type=\"Workflow\"}[$__rate_interval]) + rate(poll_success_sync{task_type=\"Workflow\"}[$__rate_interval]) + rate(poll_timeouts{task_type=\"Workflow\"}[$__rate_interval]))", 221 | "legendFormat": "Workflow", 222 | "range": true, 223 | "refId": "A" 224 | }, 225 | { 226 | "datasource": { 227 | "type": "prometheus", 228 | "uid": "P1809F7CD0C75ACF3" 229 | }, 230 | "editorMode": "code", 231 | "expr": "sum (rate(poll_timeouts{task_type=\"Activity\"}[$__rate_interval])) / sum (rate(poll_success{task_type=\"Activity\"}[$__rate_interval]) + rate(poll_success_sync{task_type=\"Activity\"}[$__rate_interval]) + rate(poll_timeouts{task_type=\"Activity\"}[$__rate_interval]))", 232 | "hide": false, 233 | "legendFormat": "Activity", 234 | "range": true, 235 | "refId": "B" 236 | } 237 | ], 238 | "title": "Poll Timeout Rate", 239 | 
"type": "timeseries" 240 | }, 241 | { 242 | "datasource": { 243 | "type": "prometheus", 244 | "uid": "P1809F7CD0C75ACF3" 245 | }, 246 | "fieldConfig": { 247 | "defaults": { 248 | "color": { 249 | "mode": "palette-classic" 250 | }, 251 | "custom": { 252 | "axisCenteredZero": false, 253 | "axisColorMode": "text", 254 | "axisLabel": "", 255 | "axisPlacement": "auto", 256 | "barAlignment": 0, 257 | "drawStyle": "line", 258 | "fillOpacity": 0, 259 | "gradientMode": "none", 260 | "hideFrom": { 261 | "legend": false, 262 | "tooltip": false, 263 | "viz": false 264 | }, 265 | "lineInterpolation": "linear", 266 | "lineWidth": 1, 267 | "pointSize": 5, 268 | "scaleDistribution": { 269 | "type": "linear" 270 | }, 271 | "showPoints": "never", 272 | "spanNulls": false, 273 | "stacking": { 274 | "group": "A", 275 | "mode": "none" 276 | }, 277 | "thresholdsStyle": { 278 | "mode": "dashed" 279 | } 280 | }, 281 | "mappings": [], 282 | "thresholds": { 283 | "mode": "absolute", 284 | "steps": [ 285 | { 286 | "color": "green", 287 | "value": null 288 | }, 289 | { 290 | "color": "red", 291 | "value": 0.15 292 | } 293 | ] 294 | }, 295 | "unit": "s" 296 | }, 297 | "overrides": [] 298 | }, 299 | "gridPos": { 300 | "h": 8, 301 | "w": 12, 302 | "x": 12, 303 | "y": 8 304 | }, 305 | "id": 30, 306 | "options": { 307 | "legend": { 308 | "calcs": [], 309 | "displayMode": "list", 310 | "placement": "bottom", 311 | "showLegend": true 312 | }, 313 | "tooltip": { 314 | "mode": "single", 315 | "sort": "none" 316 | } 317 | }, 318 | "targets": [ 319 | { 320 | "datasource": { 321 | "type": "prometheus", 322 | "uid": "prometheus" 323 | }, 324 | "editorMode": "code", 325 | "exemplar": true, 326 | "expr": "histogram_quantile(0.95, sum by(le) (rate(temporal_workflow_task_schedule_to_start_latency_bucket{namespace=\"default\"}[$__rate_interval])))", 327 | "interval": "", 328 | "legendFormat": "Workflow", 329 | "range": true, 330 | "refId": "A" 331 | }, 332 | { 333 | "datasource": { 334 | "type": "prometheus", 
335 | "uid": "P1809F7CD0C75ACF3" 336 | }, 337 | "editorMode": "code", 338 | "expr": "histogram_quantile(0.95, sum by(le) (rate(temporal_activity_schedule_to_start_latency_bucket{namespace=\"default\"}[$__rate_interval])))", 339 | "hide": false, 340 | "legendFormat": "Activity", 341 | "range": true, 342 | "refId": "B" 343 | } 344 | ], 345 | "title": "Schedule to Start Latency p95", 346 | "type": "timeseries" 347 | }, 348 | { 349 | "collapsed": false, 350 | "gridPos": { 351 | "h": 1, 352 | "w": 24, 353 | "x": 0, 354 | "y": 16 355 | }, 356 | "id": 17, 357 | "panels": [], 358 | "title": "Worker Pods", 359 | "type": "row" 360 | }, 361 | { 362 | "datasource": { 363 | "type": "prometheus", 364 | "uid": "P1809F7CD0C75ACF3" 365 | }, 366 | "fieldConfig": { 367 | "defaults": { 368 | "color": { 369 | "mode": "palette-classic" 370 | }, 371 | "custom": { 372 | "axisCenteredZero": false, 373 | "axisColorMode": "text", 374 | "axisLabel": "", 375 | "axisPlacement": "auto", 376 | "barAlignment": 0, 377 | "drawStyle": "line", 378 | "fillOpacity": 0, 379 | "gradientMode": "none", 380 | "hideFrom": { 381 | "legend": false, 382 | "tooltip": false, 383 | "viz": false 384 | }, 385 | "lineInterpolation": "linear", 386 | "lineWidth": 1, 387 | "pointSize": 5, 388 | "scaleDistribution": { 389 | "type": "linear" 390 | }, 391 | "showPoints": "never", 392 | "spanNulls": false, 393 | "stacking": { 394 | "group": "A", 395 | "mode": "none" 396 | }, 397 | "thresholdsStyle": { 398 | "mode": "off" 399 | } 400 | }, 401 | "mappings": [], 402 | "thresholds": { 403 | "mode": "absolute", 404 | "steps": [ 405 | { 406 | "color": "green", 407 | "value": null 408 | }, 409 | { 410 | "color": "red", 411 | "value": 80 412 | } 413 | ] 414 | }, 415 | "unit": "percentunit" 416 | }, 417 | "overrides": [ 418 | { 419 | "matcher": { 420 | "id": "byFrameRefID", 421 | "options": "B" 422 | }, 423 | "properties": [ 424 | { 425 | "id": "color", 426 | "value": { 427 | "fixedColor": "dark-red", 428 | "mode": "fixed" 429 | } 
430 | }, 431 | { 432 | "id": "custom.fillOpacity", 433 | "value": 50 434 | } 435 | ] 436 | } 437 | ] 438 | }, 439 | "gridPos": { 440 | "h": 8, 441 | "w": 12, 442 | "x": 0, 443 | "y": 17 444 | }, 445 | "id": 24, 446 | "options": { 447 | "legend": { 448 | "calcs": [], 449 | "displayMode": "list", 450 | "placement": "bottom", 451 | "showLegend": true 452 | }, 453 | "tooltip": { 454 | "mode": "single", 455 | "sort": "none" 456 | } 457 | }, 458 | "targets": [ 459 | { 460 | "datasource": { 461 | "type": "prometheus", 462 | "uid": "prometheus" 463 | }, 464 | "editorMode": "code", 465 | "exemplar": true, 466 | "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"default\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"default\", workload=\"benchmark-workers\", workload_type=\"statefulset\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"default\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"default\", workload=\"benchmark-workers\", workload_type=\"statefulset\"}\n) by (pod)\n", 467 | "interval": "", 468 | "legendFormat": "__auto", 469 | "range": true, 470 | "refId": "A" 471 | }, 472 | { 473 | "datasource": { 474 | "type": "prometheus", 475 | "uid": "P1809F7CD0C75ACF3" 476 | }, 477 | "editorMode": "code", 478 | "expr": "sum(\n increase(container_cpu_cfs_throttled_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"default\", workload=\"benchmark-workers\", workload_type=\"statefulset\"}\n)\n/\nsum(\n increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}[$__rate_interval])\n * on(namespace,pod)\n 
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"default\", workload=\"benchmark-workers\", workload_type=\"statefulset\"}\n) > 0\n", 479 | "hide": false, 480 | "legendFormat": "throttle", 481 | "range": true, 482 | "refId": "B" 483 | } 484 | ], 485 | "title": "Benchmark Workers CPU Request % Used", 486 | "type": "timeseries" 487 | }, 488 | { 489 | "datasource": { 490 | "type": "prometheus", 491 | "uid": "P1809F7CD0C75ACF3" 492 | }, 493 | "fieldConfig": { 494 | "defaults": { 495 | "color": { 496 | "mode": "palette-classic" 497 | }, 498 | "custom": { 499 | "axisCenteredZero": false, 500 | "axisColorMode": "text", 501 | "axisLabel": "", 502 | "axisPlacement": "auto", 503 | "barAlignment": 0, 504 | "drawStyle": "line", 505 | "fillOpacity": 0, 506 | "gradientMode": "none", 507 | "hideFrom": { 508 | "legend": false, 509 | "tooltip": false, 510 | "viz": false 511 | }, 512 | "lineInterpolation": "linear", 513 | "lineWidth": 1, 514 | "pointSize": 5, 515 | "scaleDistribution": { 516 | "type": "linear" 517 | }, 518 | "showPoints": "never", 519 | "spanNulls": false, 520 | "stacking": { 521 | "group": "A", 522 | "mode": "none" 523 | }, 524 | "thresholdsStyle": { 525 | "mode": "off" 526 | } 527 | }, 528 | "mappings": [], 529 | "thresholds": { 530 | "mode": "absolute", 531 | "steps": [ 532 | { 533 | "color": "green", 534 | "value": null 535 | }, 536 | { 537 | "color": "red", 538 | "value": 80 539 | } 540 | ] 541 | }, 542 | "unit": "percentunit" 543 | }, 544 | "overrides": [] 545 | }, 546 | "gridPos": { 547 | "h": 8, 548 | "w": 12, 549 | "x": 12, 550 | "y": 17 551 | }, 552 | "id": 26, 553 | "options": { 554 | "legend": { 555 | "calcs": [], 556 | "displayMode": "list", 557 | "placement": "bottom", 558 | "showLegend": true 559 | }, 560 | "tooltip": { 561 | "mode": "single", 562 | "sort": "none" 563 | } 564 | }, 565 | "targets": [ 566 | { 567 | "datasource": { 568 | "type": "prometheus", 569 | "uid": "prometheus" 570 | }, 571 | 
"editorMode": "code", 572 | "exemplar": true, 573 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"default\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"default\", workload=\"benchmark-workers\", workload_type=\"statefulset\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"default\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"default\", workload=\"benchmark-workers\", workload_type=\"statefulset\"}\n) by (pod)\n", 574 | "interval": "", 575 | "legendFormat": "{{pod}}", 576 | "range": true, 577 | "refId": "A" 578 | } 579 | ], 580 | "title": "Benchmark Workers Memory Request % Used", 581 | "type": "timeseries" 582 | } 583 | ], 584 | "refresh": "30s", 585 | "schemaVersion": 37, 586 | "style": "dark", 587 | "tags": [ 588 | "temporal", 589 | "polling" 590 | ], 591 | "templating": { 592 | "list": [] 593 | }, 594 | "time": { 595 | "from": "now-15m", 596 | "to": "now" 597 | }, 598 | "timepicker": {}, 599 | "timezone": "", 600 | "title": "Soak Test - Polling", 601 | "uid": "trsgBasVk", 602 | "version": 3, 603 | "weekStart": "" 604 | } 605 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-services.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 
0, 26 | "graphTooltip": 0, 27 | "id": 31, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "gridPos": { 34 | "h": 1, 35 | "w": 24, 36 | "x": 0, 37 | "y": 0 38 | }, 39 | "id": 2, 40 | "panels": [], 41 | "title": "Services", 42 | "type": "row" 43 | }, 44 | { 45 | "datasource": { 46 | "type": "prometheus", 47 | "uid": "P1809F7CD0C75ACF3" 48 | }, 49 | "fieldConfig": { 50 | "defaults": { 51 | "color": { 52 | "mode": "palette-classic" 53 | }, 54 | "custom": { 55 | "axisCenteredZero": false, 56 | "axisColorMode": "text", 57 | "axisLabel": "", 58 | "axisPlacement": "auto", 59 | "barAlignment": 0, 60 | "drawStyle": "line", 61 | "fillOpacity": 0, 62 | "gradientMode": "none", 63 | "hideFrom": { 64 | "legend": false, 65 | "tooltip": false, 66 | "viz": false 67 | }, 68 | "lineInterpolation": "linear", 69 | "lineWidth": 1, 70 | "pointSize": 5, 71 | "scaleDistribution": { 72 | "type": "linear" 73 | }, 74 | "showPoints": "never", 75 | "spanNulls": false, 76 | "stacking": { 77 | "group": "A", 78 | "mode": "none" 79 | }, 80 | "thresholdsStyle": { 81 | "mode": "off" 82 | } 83 | }, 84 | "mappings": [], 85 | "thresholds": { 86 | "mode": "absolute", 87 | "steps": [ 88 | { 89 | "color": "green", 90 | "value": null 91 | }, 92 | { 93 | "color": "red", 94 | "value": 80 95 | } 96 | ] 97 | }, 98 | "unit": "short" 99 | }, 100 | "overrides": [] 101 | }, 102 | "gridPos": { 103 | "h": 8, 104 | "w": 12, 105 | "x": 0, 106 | "y": 1 107 | }, 108 | "id": 6, 109 | "options": { 110 | "legend": { 111 | "calcs": [], 112 | "displayMode": "list", 113 | "placement": "bottom", 114 | "showLegend": true 115 | }, 116 | "tooltip": { 117 | "mode": "single", 118 | "sort": "none" 119 | } 120 | }, 121 | "targets": [ 122 | { 123 | "datasource": { 124 | "type": "prometheus", 125 | "uid": "prometheus" 126 | }, 127 | "editorMode": "code", 128 | "exemplar": true, 129 | "expr": "sum by (service_name) (rate(service_requests[$__rate_interval]))", 130 | "interval": "", 131 | 
"legendFormat": "{{service_name}}", 132 | "range": true, 133 | "refId": "A" 134 | } 135 | ], 136 | "title": "RPS", 137 | "type": "timeseries" 138 | }, 139 | { 140 | "datasource": { 141 | "type": "prometheus", 142 | "uid": "P1809F7CD0C75ACF3" 143 | }, 144 | "fieldConfig": { 145 | "defaults": { 146 | "color": { 147 | "mode": "palette-classic" 148 | }, 149 | "custom": { 150 | "axisCenteredZero": false, 151 | "axisColorMode": "text", 152 | "axisLabel": "", 153 | "axisPlacement": "auto", 154 | "barAlignment": 0, 155 | "drawStyle": "line", 156 | "fillOpacity": 0, 157 | "gradientMode": "none", 158 | "hideFrom": { 159 | "legend": false, 160 | "tooltip": false, 161 | "viz": false 162 | }, 163 | "lineInterpolation": "linear", 164 | "lineWidth": 1, 165 | "pointSize": 5, 166 | "scaleDistribution": { 167 | "type": "linear" 168 | }, 169 | "showPoints": "never", 170 | "spanNulls": false, 171 | "stacking": { 172 | "group": "A", 173 | "mode": "none" 174 | }, 175 | "thresholdsStyle": { 176 | "mode": "off" 177 | } 178 | }, 179 | "mappings": [], 180 | "thresholds": { 181 | "mode": "absolute", 182 | "steps": [ 183 | { 184 | "color": "green", 185 | "value": null 186 | }, 187 | { 188 | "color": "red", 189 | "value": 80 190 | } 191 | ] 192 | }, 193 | "unit": "s" 194 | }, 195 | "overrides": [] 196 | }, 197 | "gridPos": { 198 | "h": 8, 199 | "w": 12, 200 | "x": 12, 201 | "y": 1 202 | }, 203 | "id": 8, 204 | "options": { 205 | "legend": { 206 | "calcs": [], 207 | "displayMode": "list", 208 | "placement": "bottom", 209 | "showLegend": true 210 | }, 211 | "tooltip": { 212 | "mode": "single", 213 | "sort": "none" 214 | } 215 | }, 216 | "targets": [ 217 | { 218 | "datasource": { 219 | "type": "prometheus", 220 | "uid": "prometheus" 221 | }, 222 | "editorMode": "code", 223 | "exemplar": true, 224 | "expr": "histogram_quantile(0.95, sum by (le, service_name) (rate(service_latency_bucket{operation!~\"Poll.*\"}[$__rate_interval])))", 225 | "interval": "", 226 | "legendFormat": "__auto", 227 | "range": true, 
228 | "refId": "A" 229 | } 230 | ], 231 | "title": "Latency", 232 | "type": "timeseries" 233 | }, 234 | { 235 | "datasource": { 236 | "type": "prometheus", 237 | "uid": "P1809F7CD0C75ACF3" 238 | }, 239 | "fieldConfig": { 240 | "defaults": { 241 | "color": { 242 | "mode": "palette-classic" 243 | }, 244 | "custom": { 245 | "axisCenteredZero": false, 246 | "axisColorMode": "text", 247 | "axisLabel": "", 248 | "axisPlacement": "auto", 249 | "barAlignment": 0, 250 | "drawStyle": "line", 251 | "fillOpacity": 0, 252 | "gradientMode": "none", 253 | "hideFrom": { 254 | "legend": false, 255 | "tooltip": false, 256 | "viz": false 257 | }, 258 | "lineInterpolation": "linear", 259 | "lineWidth": 1, 260 | "pointSize": 5, 261 | "scaleDistribution": { 262 | "type": "linear" 263 | }, 264 | "showPoints": "never", 265 | "spanNulls": false, 266 | "stacking": { 267 | "group": "A", 268 | "mode": "none" 269 | }, 270 | "thresholdsStyle": { 271 | "mode": "off" 272 | } 273 | }, 274 | "mappings": [], 275 | "thresholds": { 276 | "mode": "absolute", 277 | "steps": [ 278 | { 279 | "color": "green", 280 | "value": null 281 | }, 282 | { 283 | "color": "red", 284 | "value": 80 285 | } 286 | ] 287 | }, 288 | "unit": "short" 289 | }, 290 | "overrides": [] 291 | }, 292 | "gridPos": { 293 | "h": 8, 294 | "w": 12, 295 | "x": 0, 296 | "y": 9 297 | }, 298 | "id": 12, 299 | "options": { 300 | "legend": { 301 | "calcs": [], 302 | "displayMode": "list", 303 | "placement": "bottom", 304 | "showLegend": true 305 | }, 306 | "tooltip": { 307 | "mode": "single", 308 | "sort": "none" 309 | } 310 | }, 311 | "targets": [ 312 | { 313 | "datasource": { 314 | "type": "prometheus", 315 | "uid": "prometheus" 316 | }, 317 | "editorMode": "code", 318 | "exemplar": true, 319 | "expr": "sum by (error_type) (rate(service_error_with_type[$__rate_interval])) > 0", 320 | "interval": "", 321 | "legendFormat": "__auto", 322 | "range": true, 323 | "refId": "A" 324 | } 325 | ], 326 | "title": "Errors", 327 | "type": "timeseries" 
328 | }, 329 | { 330 | "datasource": { 331 | "type": "prometheus", 332 | "uid": "P1809F7CD0C75ACF3" 333 | }, 334 | "fieldConfig": { 335 | "defaults": { 336 | "color": { 337 | "mode": "palette-classic" 338 | }, 339 | "custom": { 340 | "axisCenteredZero": false, 341 | "axisColorMode": "text", 342 | "axisLabel": "", 343 | "axisPlacement": "auto", 344 | "barAlignment": 0, 345 | "drawStyle": "line", 346 | "fillOpacity": 0, 347 | "gradientMode": "none", 348 | "hideFrom": { 349 | "legend": false, 350 | "tooltip": false, 351 | "viz": false 352 | }, 353 | "lineInterpolation": "linear", 354 | "lineWidth": 1, 355 | "pointSize": 5, 356 | "scaleDistribution": { 357 | "type": "linear" 358 | }, 359 | "showPoints": "never", 360 | "spanNulls": false, 361 | "stacking": { 362 | "group": "A", 363 | "mode": "none" 364 | }, 365 | "thresholdsStyle": { 366 | "mode": "off" 367 | } 368 | }, 369 | "mappings": [], 370 | "thresholds": { 371 | "mode": "absolute", 372 | "steps": [ 373 | { 374 | "color": "green", 375 | "value": null 376 | }, 377 | { 378 | "color": "red", 379 | "value": 80 380 | } 381 | ] 382 | }, 383 | "unit": "short" 384 | }, 385 | "overrides": [] 386 | }, 387 | "gridPos": { 388 | "h": 8, 389 | "w": 12, 390 | "x": 12, 391 | "y": 9 392 | }, 393 | "id": 10, 394 | "options": { 395 | "legend": { 396 | "calcs": [], 397 | "displayMode": "list", 398 | "placement": "bottom", 399 | "showLegend": true 400 | }, 401 | "tooltip": { 402 | "mode": "single", 403 | "sort": "none" 404 | } 405 | }, 406 | "targets": [ 407 | { 408 | "datasource": { 409 | "type": "prometheus", 410 | "uid": "prometheus" 411 | }, 412 | "editorMode": "code", 413 | "exemplar": true, 414 | "expr": "sum by (operation, resource_exhausted_cause) (rate(service_errors_resource_exhausted[$__rate_interval])) > 0", 415 | "interval": "", 416 | "legendFormat": "{{operation}}: {{resource_exhausted_cause}}", 417 | "range": true, 418 | "refId": "A" 419 | } 420 | ], 421 | "title": "Resource Exhausted", 422 | "type": "timeseries" 423 | 
} 424 | ], 425 | "refresh": "30s", 426 | "schemaVersion": 37, 427 | "style": "dark", 428 | "tags": [ 429 | "temporal", 430 | "services" 431 | ], 432 | "templating": { 433 | "list": [] 434 | }, 435 | "time": { 436 | "from": "now-15m", 437 | "to": "now" 438 | }, 439 | "timepicker": {}, 440 | "timezone": "", 441 | "title": "Soak Test - Services", 442 | "uid": "BwQ6UzyVk", 443 | "version": 2, 444 | "weekStart": "" 445 | } 446 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-slo.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 36, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "datasource": { 34 | "type": "datasource", 35 | "uid": "grafana" 36 | }, 37 | "gridPos": { 38 | "h": 1, 39 | "w": 24, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 6, 44 | "panels": [], 45 | "title": "Overview", 46 | "type": "row" 47 | }, 48 | { 49 | "datasource": { 50 | "type": "prometheus", 51 | "uid": "prometheus" 52 | }, 53 | "fieldConfig": { 54 | "defaults": { 55 | "color": { 56 | "mode": "palette-classic" 57 | }, 58 | "custom": { 59 | "axisCenteredZero": false, 60 | "axisColorMode": "text", 61 | "axisLabel": "", 62 | "axisPlacement": "auto", 63 | "barAlignment": 0, 64 | "drawStyle": "line", 65 | "fillOpacity": 0, 66 | "gradientMode": "none", 67 | "hideFrom": { 68 | "legend": false, 69 | "tooltip": false, 70 | "viz": false 71 | }, 72 | 
"lineInterpolation": "linear", 73 | "lineWidth": 1, 74 | "pointSize": 5, 75 | "scaleDistribution": { 76 | "type": "linear" 77 | }, 78 | "showPoints": "never", 79 | "spanNulls": false, 80 | "stacking": { 81 | "group": "A", 82 | "mode": "none" 83 | }, 84 | "thresholdsStyle": { 85 | "mode": "off" 86 | } 87 | }, 88 | "mappings": [], 89 | "thresholds": { 90 | "mode": "absolute", 91 | "steps": [ 92 | { 93 | "color": "green", 94 | "value": null 95 | }, 96 | { 97 | "color": "red", 98 | "value": 80 99 | } 100 | ] 101 | }, 102 | "unit": "short" 103 | }, 104 | "overrides": [] 105 | }, 106 | "gridPos": { 107 | "h": 8, 108 | "w": 12, 109 | "x": 0, 110 | "y": 1 111 | }, 112 | "id": 8, 113 | "options": { 114 | "legend": { 115 | "calcs": [], 116 | "displayMode": "list", 117 | "placement": "bottom", 118 | "showLegend": false 119 | }, 120 | "tooltip": { 121 | "mode": "single", 122 | "sort": "none" 123 | } 124 | }, 125 | "targets": [ 126 | { 127 | "datasource": { 128 | "type": "prometheus", 129 | "uid": "prometheus" 130 | }, 131 | "exemplar": false, 132 | "expr": "sum(rate(state_transition_count_count{exported_namespace=\"default\"}[$__rate_interval]))", 133 | "interval": "", 134 | "legendFormat": "", 135 | "refId": "A" 136 | } 137 | ], 138 | "title": "State Transitions Per Second", 139 | "type": "timeseries" 140 | }, 141 | { 142 | "collapsed": false, 143 | "datasource": { 144 | "type": "datasource", 145 | "uid": "grafana" 146 | }, 147 | "gridPos": { 148 | "h": 1, 149 | "w": 24, 150 | "x": 0, 151 | "y": 9 152 | }, 153 | "id": 2, 154 | "panels": [], 155 | "title": "SLO", 156 | "type": "row" 157 | }, 158 | { 159 | "datasource": { 160 | "type": "prometheus", 161 | "uid": "prometheus" 162 | }, 163 | "fieldConfig": { 164 | "defaults": { 165 | "color": { 166 | "mode": "palette-classic" 167 | }, 168 | "custom": { 169 | "axisCenteredZero": false, 170 | "axisColorMode": "text", 171 | "axisLabel": "", 172 | "axisPlacement": "auto", 173 | "barAlignment": 0, 174 | "drawStyle": "line", 175 | 
"fillOpacity": 0, 176 | "gradientMode": "none", 177 | "hideFrom": { 178 | "legend": false, 179 | "tooltip": false, 180 | "viz": false 181 | }, 182 | "lineInterpolation": "linear", 183 | "lineWidth": 1, 184 | "pointSize": 5, 185 | "scaleDistribution": { 186 | "type": "linear" 187 | }, 188 | "showPoints": "never", 189 | "spanNulls": false, 190 | "stacking": { 191 | "group": "A", 192 | "mode": "none" 193 | }, 194 | "thresholdsStyle": { 195 | "mode": "dashed" 196 | } 197 | }, 198 | "mappings": [], 199 | "min": 0, 200 | "thresholds": { 201 | "mode": "absolute", 202 | "steps": [ 203 | { 204 | "color": "green", 205 | "value": null 206 | }, 207 | { 208 | "color": "red", 209 | "value": 0.15 210 | } 211 | ] 212 | }, 213 | "unit": "s" 214 | }, 215 | "overrides": [] 216 | }, 217 | "gridPos": { 218 | "h": 8, 219 | "w": 12, 220 | "x": 0, 221 | "y": 10 222 | }, 223 | "id": 4, 224 | "options": { 225 | "legend": { 226 | "calcs": [], 227 | "displayMode": "list", 228 | "placement": "bottom", 229 | "showLegend": true 230 | }, 231 | "tooltip": { 232 | "mode": "single", 233 | "sort": "none" 234 | } 235 | }, 236 | "targets": [ 237 | { 238 | "datasource": { 239 | "type": "prometheus", 240 | "uid": "prometheus" 241 | }, 242 | "exemplar": true, 243 | "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(temporal_request_latency_attempt_bucket{job=\"benchmark-monitoring\",operation=\"StartWorkflowExecution\"}[$__rate_interval])))", 244 | "interval": "", 245 | "legendFormat": "{{operation}}", 246 | "refId": "A" 247 | } 248 | ], 249 | "title": "Request Latency p95", 250 | "type": "timeseries" 251 | }, 252 | { 253 | "datasource": { 254 | "type": "datasource", 255 | "uid": "grafana" 256 | }, 257 | "gridPos": { 258 | "h": 8, 259 | "w": 12, 260 | "x": 12, 261 | "y": 10 262 | }, 263 | "id": 20, 264 | "options": { 265 | "alertInstanceLabelFilter": "{namespace=\"temporal\"}", 266 | "alertName": "", 267 | "dashboardAlerts": false, 268 | "groupBy": [], 269 | "groupMode": "default", 270 | 
"maxItems": 20, 271 | "sortOrder": 1, 272 | "stateFilter": { 273 | "error": true, 274 | "firing": true, 275 | "noData": false, 276 | "normal": false, 277 | "pending": true 278 | }, 279 | "viewMode": "list" 280 | }, 281 | "title": "Temporal Alerts", 282 | "type": "alertlist" 283 | } 284 | ], 285 | "refresh": "30s", 286 | "schemaVersion": 37, 287 | "style": "dark", 288 | "tags": [ 289 | "slo", 290 | "temporal" 291 | ], 292 | "templating": { 293 | "list": [] 294 | }, 295 | "time": { 296 | "from": "now-15m", 297 | "to": "now" 298 | }, 299 | "timepicker": {}, 300 | "timezone": "", 301 | "title": "Soak Test - SLO", 302 | "uid": "3f8dd4d1-9c3b-4ead-955b-f94000cc2273", 303 | "version": 2, 304 | "weekStart": "" 305 | } 306 | -------------------------------------------------------------------------------- /k8s/monitoring/dashboards/soak-test-worker.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "datasource", 8 | "uid": "grafana" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 32, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "collapsed": false, 33 | "gridPos": { 34 | "h": 1, 35 | "w": 24, 36 | "x": 0, 37 | "y": 0 38 | }, 39 | "id": 17, 40 | "panels": [], 41 | "title": "Pods", 42 | "type": "row" 43 | }, 44 | { 45 | "datasource": { 46 | "type": "prometheus", 47 | "uid": "P1809F7CD0C75ACF3" 48 | }, 49 | "fieldConfig": { 50 | "defaults": { 51 | "color": { 52 | "mode": "palette-classic" 53 | }, 54 | "custom": { 55 | "axisCenteredZero": false, 56 | "axisColorMode": "text", 57 | "axisLabel": "", 58 | 
"axisPlacement": "auto", 59 | "barAlignment": 0, 60 | "drawStyle": "line", 61 | "fillOpacity": 0, 62 | "gradientMode": "none", 63 | "hideFrom": { 64 | "legend": false, 65 | "tooltip": false, 66 | "viz": false 67 | }, 68 | "lineInterpolation": "linear", 69 | "lineWidth": 1, 70 | "pointSize": 5, 71 | "scaleDistribution": { 72 | "type": "linear" 73 | }, 74 | "showPoints": "auto", 75 | "spanNulls": false, 76 | "stacking": { 77 | "group": "A", 78 | "mode": "none" 79 | }, 80 | "thresholdsStyle": { 81 | "mode": "off" 82 | } 83 | }, 84 | "mappings": [], 85 | "thresholds": { 86 | "mode": "absolute", 87 | "steps": [ 88 | { 89 | "color": "green", 90 | "value": null 91 | }, 92 | { 93 | "color": "red", 94 | "value": 80 95 | } 96 | ] 97 | }, 98 | "unit": "percentunit" 99 | }, 100 | "overrides": [] 101 | }, 102 | "gridPos": { 103 | "h": 8, 104 | "w": 12, 105 | "x": 0, 106 | "y": 1 107 | }, 108 | "id": 12, 109 | "options": { 110 | "legend": { 111 | "calcs": [], 112 | "displayMode": "list", 113 | "placement": "bottom", 114 | "showLegend": true 115 | }, 116 | "tooltip": { 117 | "mode": "single", 118 | "sort": "none" 119 | } 120 | }, 121 | "targets": [ 122 | { 123 | "datasource": { 124 | "type": "prometheus", 125 | "uid": "prometheus" 126 | }, 127 | "exemplar": true, 128 | "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"temporal\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-worker\", workload_type=\"statefulset\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\",resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-worker\", workload_type=\"statefulset\"}\n) by (pod)\n", 129 | "interval": "", 130 | "legendFormat": "{{pod}}", 131 | "refId": "A" 132 | } 133 | 
], 134 | "title": "Worker CPU%", 135 | "type": "timeseries" 136 | }, 137 | { 138 | "datasource": { 139 | "type": "prometheus", 140 | "uid": "prometheus" 141 | }, 142 | "fieldConfig": { 143 | "defaults": { 144 | "color": { 145 | "mode": "palette-classic" 146 | }, 147 | "custom": { 148 | "axisCenteredZero": false, 149 | "axisColorMode": "text", 150 | "axisLabel": "", 151 | "axisPlacement": "auto", 152 | "barAlignment": 0, 153 | "drawStyle": "line", 154 | "fillOpacity": 0, 155 | "gradientMode": "none", 156 | "hideFrom": { 157 | "legend": false, 158 | "tooltip": false, 159 | "viz": false 160 | }, 161 | "lineInterpolation": "linear", 162 | "lineWidth": 1, 163 | "pointSize": 5, 164 | "scaleDistribution": { 165 | "type": "linear" 166 | }, 167 | "showPoints": "auto", 168 | "spanNulls": false, 169 | "stacking": { 170 | "group": "A", 171 | "mode": "none" 172 | }, 173 | "thresholdsStyle": { 174 | "mode": "off" 175 | } 176 | }, 177 | "mappings": [], 178 | "thresholds": { 179 | "mode": "absolute", 180 | "steps": [ 181 | { 182 | "color": "green", 183 | "value": null 184 | }, 185 | { 186 | "color": "red", 187 | "value": 80 188 | } 189 | ] 190 | }, 191 | "unit": "percentunit" 192 | }, 193 | "overrides": [] 194 | }, 195 | "gridPos": { 196 | "h": 8, 197 | "w": 12, 198 | "x": 12, 199 | "y": 1 200 | }, 201 | "id": 13, 202 | "options": { 203 | "legend": { 204 | "calcs": [], 205 | "displayMode": "list", 206 | "placement": "bottom", 207 | "showLegend": true 208 | }, 209 | "tooltip": { 210 | "mode": "single", 211 | "sort": "none" 212 | } 213 | }, 214 | "targets": [ 215 | { 216 | "datasource": { 217 | "type": "prometheus", 218 | "uid": "prometheus" 219 | }, 220 | "exemplar": true, 221 | "expr": "sum(\n container_memory_working_set_bytes{namespace=\"temporal\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-worker\", workload_type=\"statefulset\"}\n) by 
(pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\",namespace=\"temporal\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"temporal\", workload=\"temporal-worker\", workload_type=\"statefulset\"}\n) by (pod)\n", 222 | "interval": "", 223 | "legendFormat": "{{pod}}", 224 | "refId": "A" 225 | } 226 | ], 227 | "title": "Worker Memory%", 228 | "type": "timeseries" 229 | } 230 | ], 231 | "refresh": "30s", 232 | "schemaVersion": 37, 233 | "style": "dark", 234 | "tags": [ 235 | "worker", 236 | "temporal" 237 | ], 238 | "templating": { 239 | "list": [] 240 | }, 241 | "time": { 242 | "from": "now-1h", 243 | "to": "now" 244 | }, 245 | "timepicker": {}, 246 | "timezone": "browser", 247 | "title": "Soak Test - Worker", 248 | "uid": "af4475f3-4457-42b5-9586-d396aa45319c", 249 | "version": 2, 250 | "weekStart": "" 251 | } 252 | -------------------------------------------------------------------------------- /k8s/monitoring/grafana.ini: -------------------------------------------------------------------------------- 1 | [date_formats] 2 | default_timezone = UTC 3 | 4 | [auth.anonymous] 5 | enabled = true 6 | org_role = Admin 7 | -------------------------------------------------------------------------------- /k8s/monitoring/kustomization.yaml: -------------------------------------------------------------------------------- 1 | bases: 2 | - "github.com/prometheus-operator/kube-prometheus?ref=v0.12.0" 3 | secretGenerator: 4 | - name: grafana-config 5 | namespace: monitoring 6 | behavior: replace 7 | options: 8 | disableNameSuffixHash: true 9 | files: 10 | - grafana.ini 11 | configMapGenerator: 12 | - name: grafana-dashboards 13 | namespace: monitoring 14 | behavior: merge 15 | options: 16 | disableNameSuffixHash: true 17 | files: 18 | - dashboards/folder.yaml 19 | - name: grafana-temporal-dashboard-definitions 20 | namespace: monitoring 21 | options: 22 | 
disableNameSuffixHash: true 23 | files: 24 | - dashboards/soak-test-frontend.json 25 | - dashboards/soak-test-history.json 26 | - dashboards/soak-test-matching.json 27 | - dashboards/soak-test-persistence.json 28 | - dashboards/soak-test-pods.json 29 | - dashboards/soak-test-polling.json 30 | - dashboards/soak-test-services.json 31 | - dashboards/soak-test-slo.json 32 | - dashboards/soak-test-worker.json 33 | resources: 34 | - prometheus-rbacTemporal.yaml 35 | - temporal-rules.yaml 36 | patchesStrategicMerge: 37 | - ./dashboards/cloudwatch.yaml 38 | - ./dashboards/patch.yaml -------------------------------------------------------------------------------- /k8s/monitoring/prometheus-rbacTemporal.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: prometheus 6 | app.kubernetes.io/instance: k8s 7 | app.kubernetes.io/name: prometheus 8 | app.kubernetes.io/part-of: kube-prometheus 9 | app.kubernetes.io/version: 2.42.0 10 | name: prometheus-k8s 11 | namespace: temporal 12 | rules: 13 | - apiGroups: 14 | - "" 15 | resources: 16 | - services 17 | - endpoints 18 | - pods 19 | verbs: 20 | - get 21 | - list 22 | - watch 23 | - apiGroups: 24 | - extensions 25 | resources: 26 | - ingresses 27 | verbs: 28 | - get 29 | - list 30 | - watch 31 | - apiGroups: 32 | - networking.k8s.io 33 | resources: 34 | - ingresses 35 | verbs: 36 | - get 37 | - list 38 | - watch 39 | --- 40 | apiVersion: rbac.authorization.k8s.io/v1 41 | kind: RoleBinding 42 | metadata: 43 | labels: 44 | app.kubernetes.io/component: prometheus 45 | app.kubernetes.io/instance: k8s 46 | app.kubernetes.io/name: prometheus 47 | app.kubernetes.io/part-of: kube-prometheus 48 | app.kubernetes.io/version: 2.42.0 49 | name: prometheus-k8s 50 | namespace: temporal 51 | roleRef: 52 | apiGroup: rbac.authorization.k8s.io 53 | kind: Role 54 | name: prometheus-k8s 55 | subjects: 
56 | - kind: ServiceAccount 57 | name: prometheus-k8s 58 | namespace: monitoring 59 | -------------------------------------------------------------------------------- /k8s/monitoring/temporal-rules.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | prometheus: k8s 6 | role: alert-rules 7 | name: temporal-rules 8 | namespace: monitoring 9 | spec: 10 | groups: 11 | - name: temporal-metrics 12 | rules: 13 | - alert: TemporalRequestLatencyHigh 14 | annotations: 15 | description: Temporal {{ $labels.operation }} request latency is currently {{ $value | humanize }}, outside of SLO 150ms. 16 | summary: Temporal request latency is too high. 17 | expr: | 18 | histogram_quantile(0.95, sum by (le, operation) (rate(temporal_request_latency_bucket{job="benchmark-monitoring",operation="StartWorkflowExecution"}[5m]))) 19 | > 0.150 20 | for: 5m 21 | labels: 22 | namespace: temporal 23 | severity: critical 24 | - alert: TemporalWorkflowTaskScheduleToStartLatencyHigh 25 | annotations: 26 | description: Temporal Workflow Task Schedule to Start latency is currently {{ $value | humanize }}, outside of SLO 150ms. 27 | summary: Temporal Workflow Task Schedule to Start latency is too high. 28 | expr: | 29 | histogram_quantile(0.95, sum by (le) (rate(temporal_workflow_task_schedule_to_start_latency_bucket{namespace="default"}[5m]))) 30 | > 0.150 31 | for: 5m 32 | labels: 33 | namespace: temporal 34 | severity: critical 35 | - alert: TemporalActivityScheduleToStartLatencyHigh 36 | annotations: 37 | description: Temporal Activity Schedule to Start latency is currently {{ $value | humanize }}, outside of SLO 150ms. 38 | summary: Temporal Activity Schedule to Start latency is too high. 
39 | expr: | 40 | histogram_quantile(0.95, sum by (le) (rate(temporal_activity_schedule_to_start_latency_bucket{namespace="default"}[5m]))) 41 | > 0.150 42 | for: 5m 43 | labels: 44 | namespace: temporal 45 | severity: critical 46 | -------------------------------------------------------------------------------- /k8s/temporal/frontend-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: temporal-frontend 5 | labels: 6 | app.kubernetes.io/component: frontend 7 | app.kubernetes.io/instance: temporal 8 | app.kubernetes.io/name: temporal 9 | app.kubernetes.io/part-of: temporal 10 | app.kubernetes.io/version: 1.20.3 11 | spec: 12 | progressDeadlineSeconds: 600 13 | replicas: 2 14 | revisionHistoryLimit: 10 15 | selector: 16 | matchLabels: 17 | app.kubernetes.io/component: frontend 18 | app.kubernetes.io/instance: temporal 19 | app.kubernetes.io/name: temporal 20 | strategy: 21 | rollingUpdate: 22 | maxSurge: 25% 23 | maxUnavailable: 25% 24 | type: RollingUpdate 25 | template: 26 | metadata: 27 | labels: 28 | app.kubernetes.io/component: frontend 29 | app.kubernetes.io/instance: temporal 30 | app.kubernetes.io/name: temporal 31 | app.kubernetes.io/part-of: temporal 32 | app.kubernetes.io/version: 1.20.3 33 | spec: 34 | containers: 35 | - env: 36 | - name: BIND_ON_IP 37 | valueFrom: 38 | fieldRef: 39 | apiVersion: v1 40 | fieldPath: status.podIP 41 | - name: SERVICES 42 | value: frontend 43 | - name: PROMETHEUS_ENDPOINT 44 | value: 0.0.0.0:8000 45 | - name: DYNAMIC_CONFIG_FILE_PATH 46 | value: /etc/temporal/dynamic_config/dynamic_config.yaml 47 | envFrom: 48 | - configMapRef: 49 | name: temporal-env 50 | image: temporalio/server:1.20.3 51 | imagePullPolicy: IfNotPresent 52 | livenessProbe: 53 | failureThreshold: 3 54 | initialDelaySeconds: 150 55 | periodSeconds: 10 56 | successThreshold: 1 57 | tcpSocket: 58 | port: rpc 59 | timeoutSeconds: 1 60 | name: 
temporal 61 | ports: 62 | - name: rpc 63 | containerPort: 7233 64 | protocol: TCP 65 | - name: metrics 66 | containerPort: 8000 67 | protocol: TCP 68 | terminationMessagePath: /dev/termination-log 69 | terminationMessagePolicy: File 70 | volumeMounts: 71 | - mountPath: /etc/temporal/dynamic_config 72 | name: dynamic-config 73 | dnsPolicy: ClusterFirst 74 | restartPolicy: Always 75 | schedulerName: default-scheduler 76 | securityContext: {} 77 | terminationGracePeriodSeconds: 30 78 | volumes: 79 | - configMap: 80 | defaultMode: 420 81 | items: 82 | - key: dynamic_config.yaml 83 | path: dynamic_config.yaml 84 | name: temporal-dynamic-config 85 | name: dynamic-config -------------------------------------------------------------------------------- /k8s/temporal/frontend-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: temporal-frontend 5 | labels: 6 | app.kubernetes.io/component: frontend 7 | app.kubernetes.io/instance: temporal 8 | app.kubernetes.io/name: temporal 9 | app.kubernetes.io/part-of: temporal 10 | app.kubernetes.io/version: 1.20.3 11 | spec: 12 | internalTrafficPolicy: Cluster 13 | ports: 14 | - name: grpc-rpc 15 | port: 7233 16 | protocol: TCP 17 | targetPort: rpc 18 | selector: 19 | app.kubernetes.io/component: frontend 20 | app.kubernetes.io/instance: temporal 21 | app.kubernetes.io/name: temporal 22 | sessionAffinity: None 23 | type: ClusterIP 24 | clusterIP: None 25 | -------------------------------------------------------------------------------- /k8s/temporal/history-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: temporal-history 5 | labels: 6 | app.kubernetes.io/component: history 7 | app.kubernetes.io/instance: temporal 8 | app.kubernetes.io/name: temporal 9 | app.kubernetes.io/part-of: temporal 10 | 
app.kubernetes.io/version: 1.20.3 11 | spec: 12 | replicas: 2 13 | revisionHistoryLimit: 10 14 | selector: 15 | matchLabels: 16 | app.kubernetes.io/component: history 17 | app.kubernetes.io/instance: temporal 18 | app.kubernetes.io/name: temporal 19 | strategy: 20 | rollingUpdate: 21 | maxSurge: 100% 22 | maxUnavailable: 100% 23 | type: RollingUpdate 24 | template: 25 | metadata: 26 | labels: 27 | app.kubernetes.io/component: history 28 | app.kubernetes.io/instance: temporal 29 | app.kubernetes.io/name: temporal 30 | app.kubernetes.io/part-of: temporal 31 | app.kubernetes.io/version: 1.20.3 32 | spec: 33 | containers: 34 | - env: 35 | - name: BIND_ON_IP 36 | valueFrom: 37 | fieldRef: 38 | apiVersion: v1 39 | fieldPath: status.podIP 40 | - name: SERVICES 41 | value: history 42 | - name: PROMETHEUS_ENDPOINT 43 | value: 0.0.0.0:8000 44 | - name: DYNAMIC_CONFIG_FILE_PATH 45 | value: /etc/temporal/dynamic_config/dynamic_config.yaml 46 | - name: SQL_MAX_CONNS 47 | value: "40" 48 | envFrom: 49 | - configMapRef: 50 | name: temporal-env 51 | image: temporalio/server:1.20.3 52 | imagePullPolicy: IfNotPresent 53 | livenessProbe: 54 | failureThreshold: 3 55 | initialDelaySeconds: 150 56 | periodSeconds: 10 57 | successThreshold: 1 58 | tcpSocket: 59 | port: rpc 60 | timeoutSeconds: 1 61 | name: temporal 62 | ports: 63 | - name: rpc 64 | containerPort: 7234 65 | protocol: TCP 66 | - name: metrics 67 | containerPort: 8000 68 | protocol: TCP 69 | terminationMessagePath: /dev/termination-log 70 | terminationMessagePolicy: File 71 | volumeMounts: 72 | - mountPath: /etc/temporal/dynamic_config 73 | name: dynamic-config 74 | dnsPolicy: ClusterFirst 75 | restartPolicy: Always 76 | schedulerName: default-scheduler 77 | securityContext: {} 78 | terminationGracePeriodSeconds: 30 79 | volumes: 80 | - configMap: 81 | defaultMode: 420 82 | items: 83 | - key: dynamic_config.yaml 84 | path: dynamic_config.yaml 85 | name: temporal-dynamic-config 86 | name: dynamic-config 
-------------------------------------------------------------------------------- /k8s/temporal/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - frontend-service.yaml 3 | - frontend-deployment.yaml 4 | - worker-deployment.yaml 5 | - matching-deployment.yaml 6 | - history-deployment.yaml 7 | - monitoring-service.yaml 8 | - service-monitor.yaml 9 | namespace: temporal -------------------------------------------------------------------------------- /k8s/temporal/matching-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: temporal-matching 5 | labels: 6 | app.kubernetes.io/component: matching 7 | app.kubernetes.io/instance: temporal 8 | app.kubernetes.io/name: temporal 9 | app.kubernetes.io/part-of: temporal 10 | app.kubernetes.io/version: 1.20.3 11 | spec: 12 | replicas: 2 13 | revisionHistoryLimit: 10 14 | selector: 15 | matchLabels: 16 | app.kubernetes.io/component: matching 17 | app.kubernetes.io/instance: temporal 18 | app.kubernetes.io/name: temporal 19 | strategy: 20 | rollingUpdate: 21 | maxSurge: 100% 22 | maxUnavailable: 100% 23 | type: RollingUpdate 24 | template: 25 | metadata: 26 | labels: 27 | app.kubernetes.io/component: matching 28 | app.kubernetes.io/instance: temporal 29 | app.kubernetes.io/name: temporal 30 | app.kubernetes.io/part-of: temporal 31 | app.kubernetes.io/version: 1.20.3 32 | spec: 33 | containers: 34 | - env: 35 | - name: BIND_ON_IP 36 | valueFrom: 37 | fieldRef: 38 | apiVersion: v1 39 | fieldPath: status.podIP 40 | - name: SERVICES 41 | value: matching 42 | - name: PROMETHEUS_ENDPOINT 43 | value: 0.0.0.0:8000 44 | - name: DYNAMIC_CONFIG_FILE_PATH 45 | value: /etc/temporal/dynamic_config/dynamic_config.yaml 46 | envFrom: 47 | - configMapRef: 48 | name: temporal-env 49 | image: temporalio/server:1.20.3 50 | imagePullPolicy: IfNotPresent 51 | 
livenessProbe: 52 | failureThreshold: 3 53 | initialDelaySeconds: 150 54 | periodSeconds: 10 55 | successThreshold: 1 56 | tcpSocket: 57 | port: rpc 58 | timeoutSeconds: 1 59 | name: temporal 60 | ports: 61 | - name: rpc 62 | containerPort: 7235 63 | protocol: TCP 64 | - name: metrics 65 | containerPort: 8000 66 | protocol: TCP 67 | terminationMessagePath: /dev/termination-log 68 | terminationMessagePolicy: File 69 | volumeMounts: 70 | - mountPath: /etc/temporal/dynamic_config 71 | name: dynamic-config 72 | dnsPolicy: ClusterFirst 73 | restartPolicy: Always 74 | schedulerName: default-scheduler 75 | securityContext: {} 76 | terminationGracePeriodSeconds: 30 77 | volumes: 78 | - configMap: 79 | defaultMode: 420 80 | items: 81 | - key: dynamic_config.yaml 82 | path: dynamic_config.yaml 83 | name: temporal-dynamic-config 84 | name: dynamic-config -------------------------------------------------------------------------------- /k8s/temporal/monitoring-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: temporal-monitoring 5 | labels: 6 | app.kubernetes.io/instance: temporal 7 | app.kubernetes.io/name: temporal 8 | app.kubernetes.io/part-of: temporal 9 | app.kubernetes.io/version: 1.20.3 10 | app.kubernetes.io/component: monitoring 11 | spec: 12 | type: ClusterIP 13 | clusterIP: None 14 | internalTrafficPolicy: Cluster 15 | publishNotReadyAddresses: true 16 | ports: 17 | - name: metrics 18 | port: 9090 19 | targetPort: metrics 20 | protocol: TCP 21 | selector: 22 | app.kubernetes.io/instance: temporal 23 | app.kubernetes.io/name: temporal 24 | -------------------------------------------------------------------------------- /k8s/temporal/service-monitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: temporal-monitor 5 | labels: 6 | 
app.kubernetes.io/name: temporal 7 | app.kubernetes.io/instance: temporal 8 | app.kubernetes.io/component: monitoring 9 | spec: 10 | endpoints: 11 | - port: metrics 12 | interval: 10s 13 | namespaceSelector: 14 | matchNames: 15 | - temporal 16 | selector: 17 | matchLabels: 18 | app.kubernetes.io/name: temporal 19 | app.kubernetes.io/instance: temporal 20 | app.kubernetes.io/component: monitoring 21 | -------------------------------------------------------------------------------- /k8s/temporal/worker-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: temporal-worker 5 | labels: 6 | app.kubernetes.io/component: worker 7 | app.kubernetes.io/instance: temporal 8 | app.kubernetes.io/name: temporal 9 | app.kubernetes.io/part-of: temporal 10 | app.kubernetes.io/version: 1.20.3 11 | spec: 12 | progressDeadlineSeconds: 600 13 | replicas: 1 14 | revisionHistoryLimit: 10 15 | selector: 16 | matchLabels: 17 | app.kubernetes.io/component: worker 18 | app.kubernetes.io/instance: temporal 19 | app.kubernetes.io/name: temporal 20 | strategy: 21 | rollingUpdate: 22 | maxSurge: 25% 23 | maxUnavailable: 25% 24 | type: RollingUpdate 25 | template: 26 | metadata: 27 | labels: 28 | app.kubernetes.io/component: worker 29 | app.kubernetes.io/instance: temporal 30 | app.kubernetes.io/name: temporal 31 | app.kubernetes.io/part-of: temporal 32 | app.kubernetes.io/version: 1.20.3 33 | spec: 34 | containers: 35 | - env: 36 | - name: BIND_ON_IP 37 | valueFrom: 38 | fieldRef: 39 | apiVersion: v1 40 | fieldPath: status.podIP 41 | - name: SERVICES 42 | value: worker 43 | - name: PROMETHEUS_ENDPOINT 44 | value: 0.0.0.0:8000 45 | - name: DYNAMIC_CONFIG_FILE_PATH 46 | value: /etc/temporal/dynamic_config/dynamic_config.yaml 47 | envFrom: 48 | - configMapRef: 49 | name: temporal-env 50 | image: temporalio/server:1.20.3 51 | imagePullPolicy: IfNotPresent 52 | name: temporal 53 | ports: 54 
| - name: rpc 55 | containerPort: 7239 56 | protocol: TCP 57 | - name: metrics 58 | containerPort: 8000 59 | protocol: TCP 60 | terminationMessagePath: /dev/termination-log 61 | terminationMessagePolicy: File 62 | volumeMounts: 63 | - mountPath: /etc/temporal/dynamic_config 64 | name: dynamic-config 65 | dnsPolicy: ClusterFirst 66 | restartPolicy: Always 67 | schedulerName: default-scheduler 68 | securityContext: {} 69 | terminationGracePeriodSeconds: 30 70 | volumes: 71 | - configMap: 72 | defaultMode: 420 73 | items: 74 | - key: dynamic_config.yaml 75 | path: dynamic_config.yaml 76 | name: temporal-dynamic-config 77 | name: dynamic-config -------------------------------------------------------------------------------- /stacks/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | -------------------------------------------------------------------------------- /stacks/Pulumi.eks-cassandra-medium.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:assumeRole: 3 | roleArn: arn:aws:iam::912773994842:role/BenchmarkClusterAdmin 4 | aws:defaultTags: 5 | tags: 6 | Stack: eks-rds-cassandra-medium 7 | pulumi:template: kubernetes-go 8 | temporal-benchmarks-k8s:EnvironmentStackName: temporalio/temporal-benchmarks-aws-environment/main 9 | temporal-benchmarks-k8s:Cluster: 10 | EKS: 11 | NodeCount: 9 12 | NodeType: m5.2xlarge 13 | temporal-benchmarks-k8s:Persistence: 14 | Cassandra: 15 | NodeCount: 3 16 | ReplicaCount: 3 17 | NodeType: m5.xlarge 18 | Visibility: 19 | OpenSearch: 20 | EngineVersion: OpenSearch_2.3 21 | InstanceType: m5.large.search 22 | temporal-benchmarks-k8s:Temporal: 23 | Frontend: 24 | Pods: 4 25 | CPU: 26 | request: 1.5 27 | limit: 2 28 | History: 29 | Shards: 4096 30 | Pods: 8 31 | CPU: 32 | request: 2 33 | limit: 2.5 34 | Memory: 35 | request: 5Gi 36 | limit: 6Gi 37 | Matching: 38 | Pods: 4 39 | TaskQueuePartitions: 4 40 | CPU: 41 | request: 
1 42 | limit: 1.2 43 | Memory: 44 | request: 150Mi 45 | limit: 200Mi 46 | Workers: 47 | Pods: 16 48 | WorkflowPollers: 40 49 | ActivityPollers: 100 50 | CPU: 51 | request: 250m 52 | limit: 500m 53 | SoakTest: 54 | ConcurrentWorkflows: 200 55 | DynamicConfig: 56 | frontend.rps: 57 | - value: 2400 58 | matching.rps: 59 | - value: 2400 60 | -------------------------------------------------------------------------------- /stacks/Pulumi.eks-rds-aurora-mysql-medium.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:assumeRole: 3 | roleArn: arn:aws:iam::912773994842:role/BenchmarkClusterAdmin 4 | aws:defaultTags: 5 | tags: 6 | Stack: eks-rds-aurora-mysql-medium 7 | pulumi:template: kubernetes-go 8 | temporal-benchmarks-k8s:EnvironmentStackName: temporalio/temporal-benchmarks-aws-environment/main 9 | temporal-benchmarks-k8s:Cluster: 10 | EKS: 11 | NodeCount: 9 12 | NodeType: m5.2xlarge 13 | temporal-benchmarks-k8s:Persistence: 14 | RDS: 15 | Engine: aurora-mysql 16 | EngineVersion: 5.7.mysql_aurora.2.11.1 17 | InstanceType: db.r5.4xlarge 18 | Visibility: 19 | OpenSearch: 20 | EngineVersion: OpenSearch_2.3 21 | InstanceType: c6g.large.search 22 | temporal-benchmarks-k8s:Temporal: 23 | Frontend: 24 | Pods: 4 25 | CPU: 26 | request: 1.5 27 | limit: 2 28 | Memory: 29 | request: 256Mi 30 | limit: 512Mi 31 | History: 32 | Shards: 4096 33 | Pods: 8 34 | CPU: 35 | request: 3 36 | limit: 3.5 37 | Memory: 38 | request: 5Gi 39 | limit: 6Gi 40 | Matching: 41 | Pods: 4 42 | TaskQueuePartitions: 4 43 | CPU: 44 | request: 1 45 | limit: 1.5 46 | Memory: 47 | request: 150Mi 48 | limit: 250Mi 49 | Workers: 50 | Pods: 16 51 | WorkflowPollers: 40 52 | ActivityPollers: 120 53 | CPU: 54 | request: 250m 55 | limit: 400m 56 | Memory: 57 | request: 25Mi 58 | limit: 50Mi 59 | SoakTest: 60 | ConcurrentWorkflows: 140 61 | DynamicConfig: 62 | frontend.rps: 63 | - value: 2400 64 | matching.rps: 65 | - value: 2400 66 | 
-------------------------------------------------------------------------------- /stacks/Pulumi.eks-rds-aurora-postgres-medium.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:assumeRole: 3 | roleArn: arn:aws:iam::912773994842:role/BenchmarkClusterAdmin 4 | aws:defaultTags: 5 | tags: 6 | Stack: eks-rds-aurora-postgres-medium 7 | pulumi:template: kubernetes-go 8 | temporal-benchmarks-k8s:EnvironmentStackName: temporalio/temporal-benchmarks-aws-environment/main 9 | temporal-benchmarks-k8s:Cluster: 10 | EKS: 11 | NodeCount: 5 12 | NodeType: m5.2xlarge 13 | temporal-benchmarks-k8s:Persistence: 14 | RDS: 15 | Engine: aurora-postgresql 16 | EngineVersion: 14.4 17 | InstanceType: db.r5.8xlarge 18 | Visibility: 19 | OpenSearch: 20 | EngineVersion: OpenSearch_2.3 21 | InstanceType: m5.large.search 22 | temporal-benchmarks-k8s:Temporal: 23 | Frontend: 24 | Pods: 4 25 | CPU: 26 | request: 1.5 27 | limit: 2 28 | History: 29 | Shards: 4096 30 | CPU: 31 | request: 2 32 | limit: 2.5 33 | Memory: 34 | request: 3Gi 35 | limit: 4Gi 36 | Matching: 37 | TaskQueuePartitions: 4 38 | CPU: 39 | request: 750m 40 | limit: 1000m 41 | Workers: 42 | Pods: 16 43 | WorkflowPollers: 40 44 | ActivityPollers: 80 45 | CPU: 46 | request: 250m 47 | limit: 500m 48 | SoakTest: 49 | Pods: 2 50 | DynamicConfig: 51 | frontend.rps: 52 | - value: 2400 53 | matching.rps: 54 | - value: 2400 55 | -------------------------------------------------------------------------------- /stacks/Pulumi.eks-rds-mysql-scaling-series.yaml: -------------------------------------------------------------------------------- 1 | encryptionsalt: v1:2OehxWopbd0=:v1:YZaAahJaE8EWnmpw:Ygo7ewgyYj0zdFZlzOCsS4s5J5HjfQ== 2 | config: 3 | aws:assumeRole: 4 | roleArn: arn:aws:iam::912773994842:role/BenchmarkClusterAdmin 5 | aws:defaultTags: 6 | tags: 7 | Stack: eks-rds-mysql-scaling-series 8 | pulumi:template: kubernetes-go 9 | temporal-benchmarks-k8s:AWS: 10 | Region: us-west-2 11 | 
AvailabilityZones: ["us-west-2a", "us-west-2b", "us-west-2c"] 12 | PrivateSubnetIds: ["subnet-072e150708ec6a826", "subnet-0d8f7e9920a4f4cd0", "subnet-0ae931146b4d89ed9"] 13 | PublicSubnetIds: ["subnet-0e09633802916e33f", "subnet-0a414606075f3c508", "subnet-0281dbc974eec544d"] 14 | RdsSubnetGroupName: "temporal-benchmark-rds-2e8d5ab" 15 | Role: BenchmarkClusterAdmin 16 | VpcId: "vpc-0e66ce57785e3e745" 17 | temporal-benchmarks-k8s:Benchmark: 18 | Workers: 19 | CPU: 20 | Request: 2 21 | Memory: 22 | Request: 1Gi 23 | Pods: 12 24 | WorkflowPollers: 1200 25 | ActivityPollers: 2000 26 | SoakTest: 27 | ConcurrentWorkflows: 300 28 | Pods: 4 29 | CPU: 30 | Request: 1 31 | Memory: 32 | Request: 1Gi 33 | temporal-benchmarks-k8s:Cluster: 34 | EKS: 35 | NodeCount: 10 36 | TemporalNodeCount: 8 37 | NodeType: m5.2xlarge 38 | temporal-benchmarks-k8s:Persistence: 39 | RDS: 40 | Engine: mysql 41 | EngineVersion: "8.0.32" 42 | InstanceType: db.r5.16xlarge 43 | IOPS: 20000 44 | temporal-benchmarks-k8s:Temporal: 45 | SetCPULimits: false 46 | SetGoMaxProcs: true 47 | Frontend: 48 | Pods: 8 49 | CPU: 50 | Request: 2 51 | Memory: 52 | Request: 1Gi 53 | History: 54 | Pods: 4 55 | CPU: 56 | Request: 4 57 | Memory: 58 | Request: 8Gi 59 | Shards: 1024 60 | Matching: 61 | Pods: 8 62 | CPU: 63 | Request: 2 64 | Memory: 65 | Request: 1Gi 66 | TaskQueuePartitions: 8 67 | Worker: 68 | Pods: 2 69 | CPU: 70 | Request: 1 71 | Memory: 72 | Request: 1Gi 73 | DynamicConfig: 74 | frontend.rps: 75 | - value: 6000 76 | frontend.namespaceCount: 77 | - value: 6000 78 | matching.rps: 79 | - value: 6000 -------------------------------------------------------------------------------- /stacks/Pulumi.eks-rds-postgres-medium.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:assumeRole: 3 | roleArn: arn:aws:iam::912773994842:role/BenchmarkClusterAdmin 4 | aws:defaultTags: 5 | tags: 6 | Stack: eks-rds-postgres-medium 7 | pulumi:template: kubernetes-go 8 | 
temporal-benchmarks-k8s:EnvironmentStackName: temporalio/temporal-benchmarks-aws-environment/main 9 | temporal-benchmarks-k8s:Cluster: 10 | EKS: 11 | NodeCount: 3 12 | NodeType: m5.2xlarge 13 | temporal-benchmarks-k8s:Persistence: 14 | RDS: 15 | Engine: postgres 16 | EngineVersion: 14.4 17 | InstanceType: db.r5.4xlarge 18 | Visibility: 19 | OpenSearch: 20 | EngineVersion: OpenSearch_2.3 21 | InstanceType: m5.large.search 22 | temporal-benchmarks-k8s:Temporal: 23 | Frontend: 24 | Pods: 3 25 | CPU: 26 | request: 1.5 27 | limit: 2 28 | History: 29 | Shards: 4096 30 | CPU: 31 | request: 1.5 32 | limit: 2 33 | Matching: 34 | TaskQueuePartitions: 4 35 | CPU: 36 | request: 750m 37 | limit: 1000m 38 | Workers: 39 | Pods: 16 40 | WorkflowPollers: 40 41 | ActivityPollers: 80 42 | CPU: 43 | request: 150m 44 | limit: 200m 45 | SoakTest: 46 | Pods: 2 47 | DynamicConfig: 48 | frontend.rps: 49 | - value: 2400 50 | matching.rps: 51 | - value: 2400 52 | frontend.persistenceMaxQPS: 53 | - value: 1200 54 | history.persistenceMaxQPS: 55 | - value: 1200 56 | matching.persistenceMaxQPS: 57 | - value: 1200 58 | worker.persistenceMaxQPS: 59 | - value: 1200 60 | -------------------------------------------------------------------------------- /stacks/Pulumi.eks-rds-postgres-micro.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:assumeRole: 3 | roleArn: arn:aws:iam::912773994842:role/BenchmarkClusterAdmin 4 | aws:defaultTags: 5 | tags: 6 | Stack: eks-rds-postgres-micro 7 | pulumi:template: kubernetes-go 8 | temporal-benchmarks-k8s:EnvironmentStackName: temporalio/temporal-benchmarks-aws-environment/main 9 | temporal-benchmarks-k8s:Cluster: 10 | EKS: 11 | NodeCount: 1 12 | NodeType: m5.2xlarge 13 | temporal-benchmarks-k8s:Persistence: 14 | RDS: 15 | Engine: postgres 16 | EngineVersion: 14.4 17 | InstanceType: db.r5.large 18 | SingleAZ: true 19 | temporal-benchmarks-k8s:Temporal: 20 | Frontend: 21 | Pods: 1 22 | CPU: 23 | request: 1.5 
24 | limit: 2 25 | History: 26 | Shards: 512 27 | CPU: 28 | request: 1.5 29 | limit: 2 30 | Matching: 31 | TaskQueuePartitions: 4 32 | CPU: 33 | request: 750m 34 | limit: 1000m 35 | Workers: 36 | Pods: 4 37 | WorkflowPollers: 40 38 | ActivityPollers: 80 39 | CPU: 40 | request: 150m 41 | limit: 200m 42 | SoakTest: 43 | ConcurrentWorkflows: 15 44 | -------------------------------------------------------------------------------- /stacks/Pulumi.yaml: -------------------------------------------------------------------------------- 1 | name: temporal-benchmarks-k8s 2 | runtime: nodejs 3 | description: Temporal benchmarks k8s clusters 4 | template: 5 | config: 6 | temporal-benchmarks-k8s:EnvironmentStackName: 7 | description: AWS Environment stack to deploy into. See ../environments/aws. 8 | default: temporalio/temporal-benchmarks-aws-environment/main 9 | temporal-benchmarks-k8s:NodeType: 10 | description: Node instance type to use for EKS 11 | default: t2.medium 12 | temporal-benchmarks-k8s:NodeCount: 13 | description: Number of nodes to use for EKS 14 | default: 6 15 | temporal-benchmarks-k8s:PersistenceEngine: 16 | description: RDS engine 17 | temporal-benchmarks-k8s:PersistenceEngineVersion: 18 | description: RDS engine version 19 | temporal-benchmarks-k8s:PersistenceParameterGroupName: 20 | description: RDS engine parameter group 21 | temporal-benchmarks-k8s:PersistenceInstance: 22 | description: RDS instance type 23 | temporal-benchmarks-k8s:HistoryShards: 24 | description: Number of history shards to use (https://docs.temporal.io/references/configuration#numhistoryshards) 25 | default: 2048 26 | temporal-benchmarks-k8s:TaskQueuePartitions: 27 | description: Number of task queue partitions to use for the benchmark task queue. This value is only used by the stack to scale the matching system, it is not passed on as configuration for Temporal Server. Dynamic config should be set to actually apply the required settings for the server, see DynamicConfig. 
28 | default: 8 29 | temporal-benchmarks-k8s:WorkerCount: 30 | description: Number of benchmark worker processes to run 31 | default: 16 32 | temporal-benchmarks-k8s:WorkerWorkflowPollers: 33 | description: Number of workflow task pollers for each benchmark worker (https://pkg.go.dev/go.temporal.io/sdk@v1.17.0/internal#WorkerOptions) 34 | default: 32 35 | temporal-benchmarks-k8s:WorkerActivityPollers: 36 | description: Number of activity task pollers for each benchmark worker (https://pkg.go.dev/go.temporal.io/sdk@v1.17.0/internal#WorkerOptions) 37 | default: 16 38 | temporal-benchmarks-k8s:DynamicConfig: 39 | description: "Temporal Server dynamic config. Note: this should be used to set task queue partitions as shown in the example below" 40 | default: | 41 | matching.numTaskqueueReadPartitions: 42 | - value: 8 43 | matching.numTaskqueueWritePartitions: 44 | - value: 8 45 | -------------------------------------------------------------------------------- /stacks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark stacks 2 | 3 | These stacks build Temporal clusters ready for running benchmarks on. For this first stage only EKS clusters using RDS are supported. 4 | 5 | ## Deployment 6 | 7 | Before deploying a stack you will need to have an AWS environment stack deployed. Please see the [AWS Environment stack](../environments/aws/README.md). 8 | 9 | Once the environment stack is deployed, unless you are a Temporal employee you will need to adjust the `EnvironmentStackName` config value in the stack you wish to deploy to point to your environment stack rather than `temporalio/temporal-benchmarks-aws-environment/main` that we use for our CI. 10 | 11 | If you'd like to replicate one of our existing benchmarks, that is all you should need to adjust. You can then bring up the stack with `pulumi -s STACK_NAME up`, where `STACK_NAME` is the name of the stack you want to deploy. 
12 | 13 | For example, to bring up an EKS cluster with Temporal running against an RDS postgres db.r5.4xlarge instance you can use our existing stack configuration with: 14 | 15 | ```shell 16 | $ pulumi -s eks-rds-postgres-medium up 17 | ``` 18 | -------------------------------------------------------------------------------- /stacks/fetch-kubeconfig: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Caches the given stack's kubeconfig under /tmp and prints an export line for it. 4 | 5 | set -e 6 | 7 | if [ -z "$1" ]; then 8 | echo "usage: $0 STACK_NAME" >&2 9 | exit 1 10 | fi 11 | 12 | KUBECONFIG="/tmp/kubeconfig.${1//\//-}" 13 | 14 | # -s rather than -f: an empty file left by an earlier failed fetch must not count as cached. 15 | if [ ! -s "$KUBECONFIG" ]; then 16 | # Write to a temp file and rename, so a failed or interrupted pulumi call never poisons the cache. 17 | pulumi stack -s "$1" output kubeconfig --json > "${KUBECONFIG}.tmp" 18 | mv "${KUBECONFIG}.tmp" "${KUBECONFIG}" 19 | fi 20 | 21 | echo "export KUBECONFIG=${KUBECONFIG}" 22 | -------------------------------------------------------------------------------- /stacks/grafana-tunnel: -------------------------------------------------------------------------------- 1 | kubectl run grafana-tunnel \ 2 | --image ngrok/ngrok \ 3 | --image-pull-policy=Always \ 4 | --env "NGROK_AUTHTOKEN=$NGROK_AUTHTOKEN" \ 5 | -- http grafana.monitoring.svc.cluster.local:3000 --region=us --domain=temporal-benchmark.ngrok.io 6 | -------------------------------------------------------------------------------- /stacks/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "foo", 3 | "main": "index.ts", 4 | "devDependencies": { 5 | "@types/node": "^14" 6 | }, 7 | "dependencies": { 8 | "@pulumi/aws": "^5.0.0", 9 | "@pulumi/awsx": "^0.40.0", 10 | "@pulumi/eks": "^0.42.5", 11 | "@pulumi/gcp": "^6.41.0", 12 | "@pulumi/pulumi": "^3.0.0", 13 | "@types/js-yaml": "^4.0.5" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /stacks/port-forward-grafana: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | kubectl --namespace monitoring port-forward svc/grafana 3000 -------------------------------------------------------------------------------- 
/stacks/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "outDir": "bin", 5 | "target": "es2016", 6 | "module": "commonjs", 7 | "moduleResolution": "node", 8 | "sourceMap": true, 9 | "experimentalDecorators": true, 10 | "pretty": true, 11 | "noFallthroughCasesInSwitch": true, 12 | "noImplicitReturns": true, 13 | "forceConsistentCasingInFileNames": true 14 | }, 15 | "files": [ 16 | "index.ts" 17 | ] 18 | } 19 | --------------------------------------------------------------------------------