├── .devcontainer.json
├── .gitbook.yaml
├── .github
├── CODEOWNERS
├── ISSUE_TEMPLATE
│ └── codebundle.md
├── config.yaml
├── queries
│ ├── addDiscussionComment.graphql
│ ├── createDiscussion.graphql
│ ├── deleteDiscussion.graphql
│ ├── getComments.graphql
│ ├── getDiscussion.graphql
│ └── searchDiscussions.graphql
├── scripts
│ ├── index-config.yaml
│ ├── index.py
│ ├── meta.py
│ ├── pydoc2md.sh
│ ├── reference_scores.json
│ ├── semver-it.sh
│ ├── task_analysis.json
│ └── update_titles.py
└── workflows
│ ├── build-push.yaml
│ ├── pypi.yaml
│ ├── release.yaml
│ ├── score_manual.yaml
│ ├── score_pr.yaml
│ └── semver.yml
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── Dockerfile
├── Introduction.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── SUMMARY.md
├── VERSION
├── codebundles
├── aws-cloudwatch-overused-ec2
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── aws-eks-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── aws-eks-health.yaml
│ │ └── templates
│ │ │ ├── aws-eks-health-sli.yaml
│ │ │ ├── aws-eks-health-slx.yaml
│ │ │ └── aws-eks-health-taskset.yaml
│ ├── README.md
│ ├── auth.sh
│ ├── check_eks_cluster_health.sh
│ ├── check_eks_fargate_cluster_health_status.sh
│ ├── list_eks_fargate_metrics.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── aws-eks-node-reboot
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── aws-elasticache-redis-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── aws-elasticache-redis-health.yaml
│ │ └── templates
│ │ │ ├── aws-elasticache-redis-health-sli.yaml
│ │ │ ├── aws-elasticache-redis-health-slx.yaml
│ │ │ └── aws-elasticache-redis-health-taskset.yaml
│ ├── README.md
│ ├── analyze_aws_elasticache_redis_metrics.sh
│ ├── auth.sh
│ ├── meta.yaml
│ ├── monitor_redis_performance.sh
│ ├── redis_status_scan.sh
│ ├── runbook.robot
│ ├── sli.robot
│ └── validate_aws_elasticache_redis_config.py
├── aws-lambda-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── aws-lambda-health.yaml
│ │ └── templates
│ │ │ ├── aws-lambda-health-sli.yaml
│ │ │ ├── aws-lambda-health-slx.yaml
│ │ │ └── aws-lambda-health-taskset.yaml
│ ├── README.md
│ ├── analyze_lambda_invocation_errors.sh
│ ├── auth.sh
│ ├── list_lambda_runtimes.sh
│ ├── meta.yaml
│ ├── monitor_aws_lambda_performance_metrics.sh
│ ├── runbook.robot
│ └── sli.robot
├── aws-s3-bucket-storage-report
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── aws-s3-bucket-storage-report.yaml
│ │ └── templates
│ │ │ ├── aws-s3-bucket-storage-report-slx.yaml
│ │ │ └── aws-s3-bucket-storage-report-taskset.yaml
│ ├── README.md
│ ├── auth.sh
│ ├── check_aws_s3_bucket_storage_utilization.sh
│ ├── meta.yaml
│ └── runbook.robot
├── azure-acr-image-sync
│ ├── README.md
│ ├── acr_sync_images.sh
│ ├── check_for_image_updates.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── azure-adf-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-adf-health.yaml
│ │ └── templates
│ │ │ ├── azure-adf-health-sli.yaml
│ │ │ ├── azure-adf-health-slx.yaml
│ │ │ ├── azure-adf-health-taskset.yaml
│ │ │ └── azure-adf-health-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── fail-sim-pipeline.json
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── adf_details.sh
│ ├── data_volume_audit.sh
│ ├── error_patterns.json
│ ├── error_trend.sh
│ ├── failed_pipeline.sh
│ ├── long_pipeline_runs.sh
│ ├── resource_health.sh
│ ├── runbook.robot
│ └── sli.robot
├── azure-aks-triage
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-aks-triage.yaml
│ │ └── templates
│ │ │ ├── azure-aks-triage-sli.yaml
│ │ │ ├── azure-aks-triage-slx.yaml
│ │ │ ├── azure-aks-triage-taskset.yaml
│ │ │ └── azure-aks-triage-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── aks_activities.sh
│ ├── aks_cluster_health.sh
│ ├── aks_network.sh
│ ├── aks_resource_health.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── azure-apim-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-apim-health.yaml
│ │ └── templates
│ │ │ ├── azure-apim-health-sli.yaml
│ │ │ ├── azure-apim-health-slx.yaml
│ │ │ ├── azure-apim-health-taskset.yaml
│ │ │ └── azure-apim-health-workflow.yaml
│ ├── .test
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── apim_diagnostic_logs.sh
│ ├── apim_metrics.sh
│ ├── apim_policies.sh
│ ├── apim_resource_health.sh
│ ├── check_apim_ssl_certs.sh
│ ├── gather_apim_resource_information.sh
│ ├── inspect_apim_dependencies.sh
│ ├── runbook.robot
│ ├── sli.robot
│ └── verify_apim_policies.sh
├── azure-appgateway-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-appgateway-health.yaml
│ │ └── templates
│ │ │ ├── azure-appgateway-health-sli.yaml
│ │ │ ├── azure-appgateway-health-slx.yaml
│ │ │ ├── azure-appgateway-health-taskset.yaml
│ │ │ └── azure-appgateway-health-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── app_gateway_backend_health.sh
│ ├── app_gateway_config_health.sh
│ ├── app_gateway_log_analytics.sh
│ ├── app_gateway_log_errors.sh
│ ├── app_gateway_metrics.sh
│ ├── app_gateway_related_resources.sh
│ ├── app_gateway_resource_health.sh
│ ├── app_gateway_ssl_certs.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── azure-appservice-functionapp-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-appservice-function-health.yaml
│ │ └── templates
│ │ │ ├── azure-appservice-function-health-sli.yaml
│ │ │ ├── azure-appservice-function-health-slx.yaml
│ │ │ ├── azure-appservice-function-health-taskset.yaml
│ │ │ └── azure-appservice-function-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── appservice_activities.sh
│ ├── appservice_config_health.sh
│ ├── appservice_deployment_health.sh
│ ├── appservice_health_metric.sh
│ ├── appservice_log_analysis.sh
│ ├── appservice_logs.sh
│ ├── appservice_plan_utilization_health.sh
│ ├── appservice_resource_health.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── azure-appservice-webapp-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-appservice-webapp-health.yaml
│ │ └── templates
│ │ │ ├── azure-appservice-webapp-health-sli.yaml
│ │ │ ├── azure-appservice-webapp-health-slx.yaml
│ │ │ ├── azure-appservice-webapp-health-taskset.yaml
│ │ │ └── azure-appservice-webapp-health-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── appservice_activities.sh
│ ├── appservice_config_health.sh
│ ├── appservice_deployment_health.sh
│ ├── appservice_health_metric.sh
│ ├── appservice_log_analysis.sh
│ ├── appservice_logs.sh
│ ├── appservice_metric_health.sh
│ ├── appservice_resource_health.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── azure-appservice-webapp-ops
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-appservice-webapp-ops.yaml
│ │ └── templates
│ │ │ ├── azure-appservice-webapp-ops-slx.yaml
│ │ │ └── azure-appservice-webapp-ops-taskset.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── appservice_logs.sh
│ ├── appservice_plan_scaledown.sh
│ ├── appservice_plan_scaleup.sh
│ ├── appservice_redeploy.sh
│ ├── appservice_restart.sh
│ ├── appservice_scale_in.sh
│ ├── appservice_scale_out.sh
│ ├── appservice_slot_swap.sh
│ ├── meta.yaml
│ └── runbook.robot
├── azure-kv-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-kv-health.yaml
│ │ └── templates
│ │ │ ├── azure-kv-health-sli.yaml
│ │ │ ├── azure-kv-health-slx.yaml
│ │ │ ├── azure-kv-health-taskset.yaml
│ │ │ └── azure-kv-health-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── availability.sh
│ ├── expiry-checks.sh
│ ├── kv_config.sh
│ ├── kv_resource_health.sh
│ ├── log.sh
│ ├── performance_metrics.sh
│ ├── runbook.robot
│ └── sli.robot
├── azure-loadbalancer-triage
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── az-lb-health.yaml
│ │ └── templates
│ │ │ ├── az-lb-health-slx.yaml
│ │ │ └── az-lb-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── azure-servicebus-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-servicebus-health.yaml
│ │ └── templates
│ │ │ ├── az-servicebus-health-slx.yaml
│ │ │ └── az-servicebus-health-taskset.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ ├── configure_security_test.sh
│ │ ├── generate_log_activity.sh
│ │ ├── generate_traffic.sh
│ │ ├── inject_test_messages.sh
│ │ ├── setup_connectivity_test.sh
│ │ └── terraform
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── main.tf.dr
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── service_bus_alerts_check.sh
│ ├── service_bus_capacity.sh
│ ├── service_bus_config_health.sh
│ ├── service_bus_connectivity_test.sh
│ ├── service_bus_disaster_recovery.sh
│ ├── service_bus_log_analytics.sh
│ ├── service_bus_metrics.sh
│ ├── service_bus_queue_health.sh
│ ├── service_bus_related_resources.sh
│ ├── service_bus_resource_health.sh
│ ├── service_bus_security_audit.sh
│ ├── service_bus_topic_health.sh
│ └── sli.robot
├── azure-vmss-triage
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── azure-vmss-triage.yaml
│ │ └── templates
│ │ │ ├── azure-vmss-triage-sli.yaml
│ │ │ ├── azure-vmss-triage-slx.yaml
│ │ │ └── azure-vmss-triage-taskset.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── README.md
│ │ │ ├── Taskfile.yaml
│ │ │ ├── backend.tf
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ ├── terraform.tfvars
│ │ │ └── vars.tf
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── sli.robot
│ ├── vmss_activities.sh
│ ├── vmss_config.sh
│ └── vmss_metrics.sh
├── curl-gmp-kong-ingress-inspection
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── curl-gmp-nginx-ingress-inspection
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── curl-http-ok
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls.yaml
│ │ │ ├── http-ok-tls.yaml
│ │ │ └── http-ok.yaml
│ │ └── templates
│ │ │ ├── http-ok-sli.yaml
│ │ │ ├── http-ok-slo.yaml
│ │ │ ├── http-ok-slx.yaml
│ │ │ ├── http-ok-taskset.yaml
│ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls-sli.yaml
│ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls-slx.yaml
│ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls-taskset.yaml
│ │ │ ├── http-ok-tls-sli.yaml
│ │ │ ├── http-ok-tls-slo.yaml
│ │ │ ├── http-ok-tls-slx.yaml
│ │ │ └── http-ok-tls-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── gcloud-log-inspection
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── gcloud-node-preempt
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── gcloud-node-preempt.yaml
│ │ └── templates
│ │ │ ├── gcloud-node-preempt-sli.yaml
│ │ │ ├── gcloud-node-preempt-slo.yaml
│ │ │ ├── gcloud-node-preempt-slx.yaml
│ │ │ └── gcloud-node-preempt-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── gcp-bucket-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── gcp-bucket-health.yaml
│ │ └── templates
│ │ │ ├── gcp-bucket-health-sli.yaml
│ │ │ ├── gcp-bucket-health-slo.yaml
│ │ │ ├── gcp-bucket-health-slx.yaml
│ │ │ └── gcp-bucket-health-taskset.yaml
│ ├── README.md
│ ├── bucket_details.sh
│ ├── bucket_ops_costs.sh
│ ├── bucket_size.sh
│ ├── check_security.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── gcp-cloud-function-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── gcp-cloud-function-health.yaml
│ │ └── templates
│ │ │ ├── gcp-cloud-function-health-sli.yaml
│ │ │ ├── gcp-cloud-function-health-slo.yaml
│ │ │ ├── gcp-cloud-function-health-slx.yaml
│ │ │ └── gcp-cloud-function-health-taskset.yaml
│ ├── README.md
│ ├── cloud_functions_next_steps.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── gh-actions-artifact-analysis
│ ├── README.md
│ ├── gh_actions_artifact_analysis.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── gh-actions-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── gh-actions-health.yaml
│ │ └── templates
│ │ │ ├── gh-actions-health-sli.yaml
│ │ │ ├── gh-actions-health-slx.yaml
│ │ │ └── gh-actions-health-taskset.yaml
│ ├── README.md
│ ├── calculate_org_sli.sh
│ ├── calculate_performance_sli.sh
│ ├── calculate_rate_limit_sli.sh
│ ├── calculate_runner_sli.sh
│ ├── calculate_security_sli.sh
│ ├── calculate_workflow_sli.sh
│ ├── check_billing_usage.sh
│ ├── check_long_running_workflows.sh
│ ├── check_org_workflow_health.sh
│ ├── check_rate_limits.sh
│ ├── check_repo_health_summary.sh
│ ├── check_runner_health.sh
│ ├── check_security_workflows.sh
│ ├── check_workflow_failures.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── gke-cluster-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── gke-cluster-health.yaml
│ │ └── templates
│ │ │ ├── gke-cluster-health-sli.yaml
│ │ │ ├── gke-cluster-health-slx.yaml
│ │ │ ├── gke-cluster-health-taskset.yaml
│ │ │ └── gke-cluster-health-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ └── Taskfile.yaml
│ ├── README.md
│ ├── cluster_health.sh
│ ├── cluster_operations.sh
│ ├── gcp_recommendations.sh
│ ├── gke_node_size.py
│ ├── node_pool_health.sh
│ ├── quota_check.sh
│ ├── runbook.robot
│ ├── sa_check.sh
│ └── sli.robot
├── jenkins-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── jenkins-instance-health.yaml
│ │ └── templates
│ │ │ ├── jenkins-instance-health-sli.yaml
│ │ │ ├── jenkins-instance-health-slx.yaml
│ │ │ └── jenkins-instance-health-taskset.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── Taskfile.yaml
│ │ │ ├── create_jenkins_token.sh
│ │ │ ├── failed-job.xml
│ │ │ ├── failed-pipeline.xml
│ │ │ ├── long-running-job.xml
│ │ │ ├── main.tf
│ │ │ ├── provider.tf
│ │ │ └── python-docker-pipeline.xml
│ ├── README.md
│ ├── error_patterns.json
│ ├── failed_build_logs.sh
│ ├── long_running_builds.sh
│ ├── runbook.robot
│ └── sli.robot
├── k8s-app-troubleshoot
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-app-troubleshoot.yaml
│ │ └── templates
│ │ │ ├── k8s-app-troubleshoot-sli.yaml
│ │ │ ├── k8s-app-troubleshoot-slo.yaml
│ │ │ ├── k8s-app-troubleshoot-slx.yaml
│ │ │ └── k8s-app-troubleshoot-taskset.yaml
│ ├── README.md
│ ├── env_check.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── k8s-application-log-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ ├── k8s-deployment-logs-health.yaml
│ │ │ └── k8s-statefulset-logs-health.yaml
│ │ └── templates
│ │ │ ├── k8s-deployment-logs-health-sli.yaml
│ │ │ ├── k8s-deployment-logs-health-slx.yaml
│ │ │ ├── k8s-deployment-logs-health-taskset.yaml
│ │ │ ├── k8s-deployment-logs-health-workflow.yaml
│ │ │ ├── k8s-ss-logs-health-sli.yaml
│ │ │ ├── k8s-ss-logs-health-slx.yaml
│ │ │ ├── k8s-ss-logs-taskset.yaml
│ │ │ └── k8s-statefulset-logs-health-workflow.yaml
│ ├── .test
│ │ └── Taskfile.yaml
│ ├── README.md
│ ├── error_patterns.json
│ ├── get_pod_logs_for_workload.sh
│ ├── ignore_patterns.json
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── scan_application_restarts.sh
│ ├── scan_auth_failures.sh
│ ├── scan_connection_failures.sh
│ ├── scan_error_logs.sh
│ ├── scan_log_anomalies.sh
│ ├── scan_logs.py
│ ├── scan_null_pointer_exceptions.sh
│ ├── scan_resource_warnings.sh
│ ├── scan_service_dependency_failures.sh
│ ├── scan_stack_traces.sh
│ ├── scan_timeout_errors.sh
│ ├── sli.robot
│ └── summarize.py
├── k8s-argocd-application-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-argocd-application-health.yaml
│ │ └── templates
│ │ │ ├── k8s-argocd-application-health-cli-taskset.yaml
│ │ │ └── k8s-argocd-application-health-slx.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-argocd-helm-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-argocd-helm-health.yaml
│ │ └── templates
│ │ │ ├── k8s-argocd-helm-health-slx.yaml
│ │ │ └── k8s-argocd-helm-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-artifactory-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-artifactory.yaml
│ │ └── templates
│ │ │ ├── k8s-artifactory-healthcheck-slx.yaml
│ │ │ └── k8s-artifactory-healthcheck-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-certmanager-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-certmanager-certificates-health.yaml
│ │ └── templates
│ │ │ ├── k8s-certmanager-certificate-health-sli.yaml
│ │ │ ├── k8s-certmanager-certificate-health-slo.yaml
│ │ │ ├── k8s-certmanager-certificate-health-slx.yaml
│ │ │ ├── k8s-certmanager-certificate-health-taskset.yaml
│ │ │ └── k8s-certmanager-certificate-health-workflow.yaml
│ ├── README.md
│ ├── certificate_next_steps.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── k8s-chaos-flux
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-chaos-namespace
│ ├── README.md
│ ├── auth.sh
│ ├── change_service_port.sh
│ ├── change_service_selector.sh
│ ├── delete_random_pods.sh
│ ├── drain_node.sh
│ ├── expand_tmp.sh
│ ├── meta.yaml
│ ├── oomkill_pod.sh
│ └── runbook.robot
├── k8s-chaos-workload
│ ├── README.md
│ ├── auth.sh
│ ├── change_service_port.sh
│ ├── change_service_selector.sh
│ ├── expand_tmp.sh
│ ├── kill_workload_pod.sh
│ ├── meta.yaml
│ ├── oomkill_workload_pod.sh
│ └── runbook.robot
├── k8s-cluster-node-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-cluster-node-health.yaml
│ │ └── templates
│ │ │ ├── k8s-cluster-node-health-sli.yaml
│ │ │ ├── k8s-cluster-node-health-slx.yaml
│ │ │ ├── k8s-cluster-node-health-taskset.yaml
│ │ │ └── k8s-cluster-node-health-workflow.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── node_restart_check.sh
│ ├── runbook.robot
│ └── sli.robot
├── k8s-cluster-resource-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-cluster-resource-health.yaml
│ │ └── templates
│ │ │ ├── k8s-cluster-resource-health-sli.yaml
│ │ │ ├── k8s-cluster-resource-health-slx.yaml
│ │ │ └── k8s-cluster-resource-health-taskset.yaml
│ ├── README.md
│ ├── get_high_use_nodes.sh
│ ├── meta.yaml
│ ├── overlimit_check.sh
│ ├── pods_impacting_high_use_nodes.sh
│ ├── runbook.robot
│ └── sli.robot
├── k8s-daemonset-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-daemonset-health.yaml
│ │ └── templates
│ │ │ ├── k8s-daemonset-health-slx.yaml
│ │ │ └── k8s-daemonset-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── validate_probes.sh
│ ├── workload_issues.sh
│ └── workload_next_steps.sh
├── k8s-deployment-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-deployment-health.yaml
│ │ └── templates
│ │ │ ├── k8s-deployment-health-slx.yaml
│ │ │ └── k8s-deployment-health-taskset.yaml
│ ├── .test
│ │ └── Taskfile.yaml
│ ├── README.md
│ ├── check_replicaset.sh
│ ├── container_restarts.sh
│ ├── deployment_logs.sh
│ ├── event_anomalies.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── validate_probes.sh
│ ├── workload_issues.sh
│ └── workload_next_steps.sh
├── k8s-deployment-ops
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-deployment-ops.yaml
│ │ └── templates
│ │ │ ├── k8s-deployment-ops-slx.yaml
│ │ │ └── k8s-deployment-ops-taskset.yaml
│ ├── .test
│ │ └── Taskfile.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-fluxcd-helm-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-flux-helm-health.yaml
│ │ └── templates
│ │ │ ├── k8s-flux-helm-health-slx.yaml
│ │ │ └── k8s-flux-helm-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-fluxcd-kustomization-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-flux-kustomization-health.yaml
│ │ └── templates
│ │ │ ├── k8s-flux-kustomize-health-sli.yaml
│ │ │ ├── k8s-flux-kustomize-health-slx.yaml
│ │ │ └── k8s-flux-kustomize-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── sli.robot
│ └── workload_next_steps.sh
├── k8s-fluxcd-reconcile
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-fluxcd-reconcile.yaml
│ │ └── templates
│ │ │ ├── k8s-fluxcd-reconcile-sli.yaml
│ │ │ ├── k8s-fluxcd-reconcile-slo.yaml
│ │ │ ├── k8s-fluxcd-reconcile-slx.yaml
│ │ │ └── k8s-fluxcd-reconcile-taskset.yaml
│ ├── README.md
│ ├── flux_reconcile_report.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── k8s-gitops-gh-remediate
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-gitops-gh-remediate.yaml
│ │ └── templates
│ │ │ ├── k8s-gitops-gh-remediate-slx.yaml
│ │ │ └── k8s-gitops-gh-remediate-taskset.yaml
│ ├── .test
│ │ └── README.md
│ ├── README.md
│ ├── meta.yaml
│ ├── pvc_utilization_check.sh
│ ├── resource_quota_check.sh
│ ├── runbook.robot
│ ├── update_github_manifests.sh
│ ├── validate_all_probes.sh
│ └── vpa_recommendations.sh
├── k8s-image-check
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-image-health.yaml
│ │ └── templates
│ │ │ ├── k8s-image-check-slx.yaml
│ │ │ └── k8s-image-check-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-ingress-gce-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-ingress-gce-healthcheck
│ │ └── templates
│ │ │ ├── k8s-ingress-gce-healthcheck-slx.yaml
│ │ │ └── k8s-ingress-gce-healthcheck-taskset.yaml
│ ├── README.md
│ ├── check_gce_ingress_objects.sh
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-ingress-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-ingress-health .yaml
│ │ └── templates
│ │ │ ├── k8s-ingress-healthcheck-slx.yaml
│ │ │ └── k8s-ingress-healthcheck-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-istio-system-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-istio-system-health.yaml
│ │ └── templates
│ │ │ ├── k8s-istio-system-health-sli.yaml
│ │ │ ├── k8s-istio-system-health-slx.yaml
│ │ │ └── k8s-istio-system-health-taskset.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── terraform
│ │ │ ├── Taskfile.yaml
│ │ │ ├── book-info.yaml
│ │ │ ├── bookinfo-gateway.yaml
│ │ │ ├── bookinfo
│ │ │ ├── bookinfo.yaml
│ │ │ └── fault-injection-details-v1.yaml
│ │ │ ├── dr-bookinfo.yaml
│ │ │ ├── faulty-gateway.yaml
│ │ │ ├── kubeconfig-sa-token.yaml
│ │ │ ├── main.tf
│ │ │ ├── outputs.tf
│ │ │ ├── provider.tf
│ │ │ ├── standard-install.yaml
│ │ │ ├── variables.tf
│ │ │ └── versions.tf
│ ├── README.md
│ ├── analyze_istio_configurations.sh
│ ├── check_istio_injection.sh
│ ├── controlplane_error_patterns.json
│ ├── istio_controlplane_logs.sh
│ ├── istio_installation_verify.sh
│ ├── istio_mtls_check.sh
│ ├── istio_proxy_logs.sh
│ ├── istio_sidecar_injection_report.sh
│ ├── istio_sidecar_resource_usage.sh
│ ├── proxy_error_patterns.json
│ ├── runbook.robot
│ └── sli.robot
├── k8s-jaeger-http-query
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-jaeger-http-query.yaml
│ │ └── templates
│ │ │ ├── k8s-jaeger-http-query-slx.yaml
│ │ │ └── k8s-jaeger-http-query-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── query_jaeger_http_errors.sh
│ └── runbook.robot
├── k8s-jenkins-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-jenkins-healthcheck.yaml
│ │ └── templates
│ │ │ ├── k8s-jenkins-health-slx.yaml
│ │ │ └── k8s-jenkins-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-labeledpods-healthcheck
│ └── sli.robot
├── k8s-loki-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-loki-healthcheck.yaml
│ │ └── templates
│ │ │ ├── k8s-loki-healthcheck-slx.yaml
│ │ │ └── k8s-loki-healthcheck-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-namespace-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-namespace-healthcheck.yaml
│ │ └── templates
│ │ │ ├── k8s-namespace-healthcheck-sli.yaml
│ │ │ ├── k8s-namespace-healthcheck-slo.yaml
│ │ │ ├── k8s-namespace-healthcheck-slx.yaml
│ │ │ ├── k8s-namespace-healthcheck-taskset.yaml
│ │ │ └── k8s-namespace-healthcheck-workflow.yaml
│ ├── README.md
│ ├── container_restarts.sh
│ ├── find_resource_owners.sh
│ ├── meta.yaml
│ ├── resource_quota_check.sh
│ ├── runbook.robot
│ ├── sli.robot
│ ├── workload_issues.sh
│ └── workload_next_steps.sh
├── k8s-otelcollector
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-otelcollector.yaml
│ │ └── templates
│ │ │ ├── k8s-otelcollector-slx.yaml
│ │ │ └── k8s-otelcollector-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── otel_dropped_check.sh
│ ├── otel_error_check.sh
│ ├── otel_metrics_check.sh
│ └── runbook.robot
├── k8s-podresources-health
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-pod-resources.yaml
│ │ └── templates
│ │ │ ├── k8s-pod-resources-slx.yaml
│ │ │ └── k8s-pod-resources-taskset.yaml
│ ├── find_resource_owners.sh
│ ├── identify_resource_contrained_pods.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── vpa_recommendations.sh
├── k8s-postgres-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ ├── k8s-postgres-healthcheck-crunchy.yaml
│ │ │ └── k8s-postgres-healthcheck-zalando.yaml
│ │ └── templates
│ │ │ ├── k8s-postgres-healthcheck-crunchy-sli.yaml
│ │ │ ├── k8s-postgres-healthcheck-crunchy-slx.yaml
│ │ │ ├── k8s-postgres-healthcheck-crunchy-taskset.yaml
│ │ │ ├── k8s-postgres-healthcheck-zalando-sli.yaml
│ │ │ ├── k8s-postgres-healthcheck-zalando-slx.yaml
│ │ │ └── k8s-postgres-healthcheck-zalando-taskset.yaml
│ ├── backup_health.sh
│ ├── config_health.sh
│ ├── dbquery.sh
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── k8s-prometheus-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-prometheus-healthcheck.yaml
│ │ └── templates
│ │ │ ├── k8s-prometheus-healthcheck-slx.yaml
│ │ │ └── k8s-prometheus-healthcheck-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ └── validate_servicemonitors.sh
├── k8s-pvc-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-pvc-healthcheck.yaml
│ │ └── templates
│ │ │ ├── k8s-pvc-healthcheck-sli.yaml
│ │ │ ├── k8s-pvc-healthcheck-slx.yaml
│ │ │ ├── k8s-pvc-healthcheck-taskset.yaml
│ │ │ └── k8s-pvc-healthcheck-workflow.yaml
│ ├── .test
│ │ ├── README.md
│ │ ├── Taskfile.yaml
│ │ └── kubernetes
│ │ │ └── mainfest.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── pvc_utilization_check.sh
│ ├── runbook.robot
│ ├── sli.robot
│ └── storage_next_steps.sh
├── k8s-redis-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-redis-healthcheck.yaml
│ │ └── templates
│ │ │ ├── k8s-redis-health-slx.yaml
│ │ │ └── k8s-redis-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-restart-resource
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-serviceaccount-check
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-serviceaccount-check.yaml
│ │ └── templates
│ │ │ ├── k8s-serviceaccount-check-slx.yaml
│ │ │ └── k8s-serviceaccount-check-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
├── k8s-statefulset-healthcheck
│ ├── .runwhen
│ │ ├── generation-rules
│ │ │ └── k8s-statefulset-health.yaml
│ │ └── templates
│ │ │ ├── k8s-statefulset-health-slx.yaml
│ │ │ └── k8s-statefulset-health-taskset.yaml
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ ├── validate_probes.sh
│ ├── workload_issues.sh
│ └── workload_next_steps.sh
├── k8s-tail-logs-dynamic
│ ├── README.md
│ ├── meta.yaml
│ ├── runbook.robot
│ └── sli.robot
├── k8s-vault-healthcheck
│ ├── README.md
│ ├── meta.yaml
│ └── runbook.robot
└── terraform-cloud-workspace-lock-check
│ ├── meta.yaml
│ └── runbook.robot
├── extras
└── lnav
│ └── formats
│ └── http_logrus_custom.json
├── interactive_console_output.xml
├── libraries
├── .docs
│ ├── CLI.md
│ ├── Suggest.md
│ ├── _test_parsers.md
│ ├── cli_utils.md
│ ├── json_parser.md
│ ├── k8s_applications.md
│ ├── k8s_helper.md
│ ├── local_process.md
│ ├── migrations_inspector.md
│ ├── parsers.md
│ ├── postgres_helper.md
│ ├── repository.md
│ └── stdout_parser.md
├── Jenkins
│ ├── __init__.py
│ └── jenkins.py
├── RW
│ ├── CLI
│ │ ├── CLI.py
│ │ ├── __init__.py
│ │ ├── cli_utils.py
│ │ ├── json_parser.py
│ │ ├── local_process.py
│ │ ├── postgres_helper.py
│ │ └── stdout_parser.py
│ ├── K8sApplications
│ │ ├── __init__.py
│ │ ├── _test_parsers.py
│ │ ├── k8s_applications.py
│ │ ├── migrations_inspector.py
│ │ ├── no_stacktraces_report.jinja2
│ │ ├── parsers.py
│ │ ├── repository.py
│ │ ├── simple_stacktrace_report.jinja2
│ │ ├── test.sh
│ │ └── test_data
│ │ │ ├── djangojson.log
│ │ │ ├── golang.log
│ │ │ ├── java.log
│ │ │ ├── node.log
│ │ │ └── python.log
│ ├── K8sHelper
│ │ ├── __init__.py
│ │ └── k8s_helper.py
│ ├── K8sLog
│ │ ├── __init__.py
│ │ └── k8s_log.py
│ ├── NextSteps
│ │ ├── Kubernetes
│ │ │ └── mapping.yaml
│ │ ├── Suggest.py
│ │ └── __init__.py
│ └── __init__.py
└── __init__.py
├── pyproject.toml
├── requirements.txt
└── task_analysis.json
/.gitbook.yaml:
--------------------------------------------------------------------------------
1 | root: ./
2 | structure:
3 | readme: ./Introduction.md
4 | summary: ./SUMMARY.md
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # This is a comment.
2 | # Each line is a file pattern followed by one or more owners.
3 | # Read more: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
4 |
5 | # These owners will be the default owners for everything in
6 | # the repo. Unless a later match takes precedence,
7 | # these owners will be requested for
8 | # review when someone opens a pull request.
9 | * @runwhen-contrib/runwhen-team
10 |
11 | # Order is important; the last matching pattern takes the most
12 | # precedence.
13 |
14 | # Example only (no such rule is defined in this file): with a pattern like
15 | # `*.js @js-owner`, a pull request that only modifies JS files would request
16 | # a review from @js-owner and not from the global owner(s).
17 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/codebundle.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Codebundle
3 | about: The scope for a new codebundle
4 | title: "[Codebundle]"
5 | labels:
6 | assignees: ''
7 | ---
8 |
9 | **Scope**
10 | Describe the scope and purpose of the codebundle. Be specific about its functionality and what it aims to achieve.
11 |
12 | **Requested By (Optional)**
13 | - N/A
14 |
15 | **Related Integrations**
16 | Example:
17 | - Kubernetes
18 | - Nginx
19 |
20 | **Definition of Done**
21 | - [ ] Codebundle implementation accomplishes the scope (or a justification is provided for why it differs)
22 | - [ ] Attach a screenshot of the codebundle pushing a metric locally (if an SLI) or showing a report (if a taskset).
23 | - [ ] Attach a screenshot of the SLI or Taskset running in a workspace.
24 |
25 | **Optional Comments**
26 | If you have any further requests or comments regarding this codebundle, add them here.
--------------------------------------------------------------------------------
/.github/queries/addDiscussionComment.graphql:
--------------------------------------------------------------------------------
1 | mutation AddDiscussionComment($discussion_id: ID!) {
2 | addDiscussionComment(input: { discussionId: $discussion_id, body: "Possibly outdated or obsolete. Please review." }) {
3 | comment {
4 | id
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/.github/queries/createDiscussion.graphql:
--------------------------------------------------------------------------------
1 | mutation CreateDiscussion($repo_id: ID!, $codebundle: String!, $discussion_body: String!, $category_id: ID!) {
2 | createDiscussion(input: { repositoryId: $repo_id, title: $codebundle, body: $discussion_body, categoryId: $category_id }) {
3 | discussion {
4 | id
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/.github/queries/deleteDiscussion.graphql:
--------------------------------------------------------------------------------
1 | mutation DeleteDiscussion($discussion_id: ID!) {
2 | deleteDiscussion(input: { id: $discussion_id }) {
3 | clientMutationId
4 | }
5 | }
--------------------------------------------------------------------------------
/.github/queries/getComments.graphql:
--------------------------------------------------------------------------------
1 | query GetComments($discussion_id: ID!) {
2 | node(id: $discussion_id) {
3 | ... on Discussion {
4 | comments(first: 100) {
5 | edges {
6 | node {
7 | body
8 | }
9 | }
10 | }
11 | }
12 | }
13 | }
--------------------------------------------------------------------------------
/.github/queries/getDiscussion.graphql:
--------------------------------------------------------------------------------
1 | query GetDiscussion($discussion_id: ID!) {
2 | discussion:node(id: $discussion_id) {
3 | ... on Discussion {
4 | repository {
5 | id
6 | }
7 | category {
8 | id
9 | }
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/.github/queries/searchDiscussions.graphql:
--------------------------------------------------------------------------------
1 | query SearchDiscussions($searchQuery: String!) {
2 | search(query: $searchQuery, type: REPOSITORY, first: 100) {
3 | edges {
4 | node {
5 | ... on Repository {
6 | discussions(first: 100) {
7 | nodes {
8 | id
9 | title
10 | }
11 | }
12 | }
13 | }
14 | }
15 | }
16 | }
--------------------------------------------------------------------------------
/.github/scripts/index-config.yaml:
--------------------------------------------------------------------------------
1 | # under repos, each key specifies the name of the temp directory
2 | repos:
3 | rw-cli-codecollection: https://github.com/runwhen-contrib/rw-cli-codecollection.git
4 | robot_file_pattern:
5 | codebundles: .robot
--------------------------------------------------------------------------------
/.github/scripts/pydoc2md.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # ======================================================================================
3 | # Synopsis: a script for generating markdown files from python docstrings in a chosen directory
4 | # Usage:    pydoc2md.sh <src_dir> <md_dir>
5 | #   src_dir  directory searched recursively for *.py files (paths containing "__init__" are skipped)
6 | #   md_dir   output prefix; it is concatenated directly with the file name, so end it with "/"
7 |
8 | function main (){
9 |     local src_dir=$1
10 |     local md_dir=$2
11 |     local pyfiles pyfile module_path markdown_filename
12 |     pyfiles=$(find "$src_dir" -name "*.py" | grep -v "__init__")
13 |     echo "Generating documentation for files:"
14 |     echo "$pyfiles"
15 |     # read line by line so that paths containing spaces are not word-split
16 |     while IFS= read -r pyfile; do
17 |         # the here-string yields a single empty line when no files were found
18 |         [[ -z $pyfile ]] && continue
19 |         # turn the file path into a dotted module path: a/b/c.py -> a.b.c
20 |         module_path=$(echo "$pyfile" | sed -e 's|/|.|g' -e 's|\.py$||')
21 |         markdown_filename=$(basename "${pyfile%.py}.md")
22 |         pydoc-markdown -m "$module_path" > "${md_dir}${markdown_filename}"
23 |     done <<< "$pyfiles"
24 | }
25 | main "$@"
--------------------------------------------------------------------------------
/.github/scripts/reference_scores.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "task": "Check EC2 Health",
4 | "score": 1,
5 | "reasoning": "Lacks specificity about what is being checked and where."
6 | },
7 | {
8 | "task": "Check For Overutilized EC2 Instances",
9 | "score": 2,
10 | "reasoning": "Provides more detail about 'what' is being checked, but lacks a location."
11 | },
12 | {
13 | "task": "Check For Overutilized EC2 Instances in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`",
14 | "score": 5,
15 | "reasoning": "Fully detailed, including both what (overutilized EC2 instances) and where (specific AWS region and account)."
16 | }
17 | ]
18 |
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | # .github/release.yml
2 | # see https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes#configuring-automatically-generated-release-notes
3 |
4 | changelog:
5 | exclude:
6 | labels:
7 | - ignore-for-release
8 | authors:
9 | - octocat
10 | - github-actions
11 | categories:
12 | - title: Breaking Changes 🛠
13 | labels:
14 | - Semver-Major
15 | - breaking-change
16 | - title: Exciting New Features 🎉
17 | labels:
18 | - Semver-Minor
19 | - enhancement
20 | - title: Other Changes
21 | labels:
22 | - "*"
--------------------------------------------------------------------------------
/.github/workflows/score_pr.yaml:
--------------------------------------------------------------------------------
1 | name: Score CodeCollection
2 | on:
3 |   workflow_dispatch:
4 |   pull_request:
5 |     paths:
6 |       - "codebundles/**"
7 |       - ".github/workflows/score_pr.yaml" # re-run when this workflow file itself changes
8 |       - "!src/VERSION"
9 |
10 | permissions:
11 | contents: write
12 | pull-requests: write
13 |
14 | jobs:
15 | score-codebundles:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - name: Check out the repo
19 | uses: actions/checkout@v3
20 | with:
21 | fetch-depth: 0
22 |
23 | - name: Set Git user
24 | run: |
25 | git config user.name "github-actions[bot]"
26 | git config user.email "github-actions[bot]@users.noreply.github.com"
27 |
28 | - uses: runwhen-contrib/github-actions/codecollection-score@main
29 | with:
30 | directory: .
31 | apply_suggestions: true
32 | only_changed: true
33 | env:
34 | GH_TOKEN: ${{ github.token }}
35 |
--------------------------------------------------------------------------------
/Introduction.md:
--------------------------------------------------------------------------------
1 | # RunWhen CLI CodeCollection
2 | Welcome to the documentation for the `rw-cli-codecollection` which contains codebundles specialized for CLI-based actions. There are 2 key sections:
3 |
4 | * Codebundles: Contains information on configuring and running the codebundles in this codecollection.
5 | * Keywords: Contains documentation for authors looking to use the keywords implemented in this codecollection for their own codebundles.
6 |
7 | > Note: keywords from this codecollection can be installed via pip from [pypi](https://pypi.org/project/runwhen-cli-keywords/)
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the top-level files frequently used in a package
2 | include README.md
3 | include VERSION
4 | include requirements.txt
5 | include LICENSE
6 |
7 | # Include everything within the RW directory
8 | graft RW
9 |
10 | # Exclude common unwanted patterns
11 | exclude *.py[cod]
12 | exclude __pycache__
13 | exclude .DS_Store
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | # CodeCollection Registry
10 | To explore all CodeCollections and tasks, please visit the [CodeCollection Registry](https://registry.runwhen.com/).
11 |
12 | [](https://registry.runwhen.com)
13 |
14 | ## RunWhen CLI Codecollection
15 | This repository is **one of many** CodeCollections that is used with the [RunWhen Platform](https://www.runwhen.com) and [RunWhen Local](https://docs.runwhen.com/public/v/runwhen-local). It contains CodeBundles that are maintained by the RunWhen team and perform health, operational, and troubleshooting tasks.
16 |
17 | Please see the **[contributing](CONTRIBUTING.md)** and **[code of conduct](CODE_OF_CONDUCT.md)** for details on adding your contributions to this project.
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.24
2 |
--------------------------------------------------------------------------------
/codebundles/aws-cloudwatch-overused-ec2/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/aws-eks-health/.runwhen/generation-rules/aws-eks-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: aws
5 | generationRules:
6 | - resourceTypes:
7 | - aws_eks_clusters
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: aws-eks-health
15 | qualifiers: ["resource"]
16 | baseTemplateName: aws-eks-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: aws-eks-health-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/aws-eks-health/auth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # if required AWS_ cli vars (access key, secret key, region) are not set, error and exit 1
4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then
5 |     echo "AWS credentials not set. Please set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION environment variables."
6 |     exit 1
7 | fi
8 |
9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs; exit 1 if the role cannot be assumed
10 | if [[ -n $AWS_ROLE_ARN ]]; then
11 |     sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") || { echo "Failed to assume role $AWS_ROLE_ARN."; exit 1; }
12 |     AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId')
13 |     AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey')
14 |     AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken')
15 |     export AWS_ACCESS_KEY_ID
16 |     export AWS_SECRET_ACCESS_KEY
17 |     export AWS_SESSION_TOKEN
18 | fi
19 |
--------------------------------------------------------------------------------
/codebundles/aws-eks-health/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/aws-eks-node-reboot/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/aws-elasticache-redis-health/.runwhen/generation-rules/aws-elasticache-redis-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: aws
5 | generationRules:
6 | - resourceTypes:
7 | # - aws_elasticache_serverless_caches
8 | - aws_elasticache_clusters
9 | matchRules:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | slxs:
15 | - baseName: aws-elasticache-redis-health
16 | qualifiers: ["resource"]
17 | baseTemplateName: aws-elasticache-redis-health
18 | levelOfDetail: basic
19 | outputItems:
20 | - type: slx
21 | - type: sli
22 | - type: runbook
23 | templateName: aws-elasticache-redis-health-taskset.yaml
24 |
--------------------------------------------------------------------------------
/codebundles/aws-elasticache-redis-health/.runwhen/templates/aws-elasticache-redis-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/elasticache.png
11 | alias: AWS ElastiCache Caches in {{match_resource.resource.region}}
12 | asMeasuredBy: The number of unavailable Elasticache serverless caches in {{match_resource.resource.region}}
13 | configProvided:
14 | - name: SLX_PLACEHOLDER
15 | value: SLX_PLACEHOLDER
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: The number of unavailable Elasticache serverless caches should be 0.
19 | additionalContext:
20 | {% include "aws.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "aws-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/aws-elasticache-redis-health/auth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # if required AWS_ cli vars (access key, secret key, region) are not set, error and exit 1
4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then
5 |     echo "AWS credentials not set. Please set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION environment variables."
6 |     exit 1
7 | fi
8 |
9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs; exit 1 if the role cannot be assumed
10 | if [[ -n $AWS_ROLE_ARN ]]; then
11 |     sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") || { echo "Failed to assume role $AWS_ROLE_ARN."; exit 1; }
12 |     AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId')
13 |     AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey')
14 |     AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken')
15 |     export AWS_ACCESS_KEY_ID
16 |     export AWS_SECRET_ACCESS_KEY
17 |     export AWS_SESSION_TOKEN
18 | fi
19 |
--------------------------------------------------------------------------------
/codebundles/aws-elasticache-redis-health/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/aws-lambda-health/.runwhen/generation-rules/aws-lambda-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: aws
5 | generationRules:
6 | - resourceTypes:
7 | - aws_lambda_functions
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: aws-lambda-health
15 | qualifiers: ["resource"]
16 | baseTemplateName: aws-lambda-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: aws-lambda-health-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/aws-lambda-health/.runwhen/templates/aws-lambda-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/lambda.png
11 | alias: AWS Lambda Health For Region {{match_resource.resource.region}}
12 | asMeasuredBy: The number of Failed AWS Lambdas in region {{match_resource.resource.region}}
13 | configProvided:
14 | - name: SLX_PLACEHOLDER
15 | value: SLX_PLACEHOLDER
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: The total count of failing Lambdas should be 0.
19 | additionalContext:
20 | {% include "aws.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "aws-tags.yaml" ignore missing %}
24 | - name: service
25 | value: lambda
26 | - name: access
27 | value: read-only
--------------------------------------------------------------------------------
/codebundles/aws-lambda-health/auth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # if required AWS_ cli vars (access key, secret key, region) are not set, error and exit 1
4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then
5 |     echo "AWS credentials not set. Please set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION environment variables."
6 |     exit 1
7 | fi
8 |
9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs; exit 1 if the role cannot be assumed
10 | if [[ -n $AWS_ROLE_ARN ]]; then
11 |     sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") || { echo "Failed to assume role $AWS_ROLE_ARN."; exit 1; }
12 |     AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId')
13 |     AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey')
14 |     AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken')
15 |     export AWS_ACCESS_KEY_ID
16 |     export AWS_SECRET_ACCESS_KEY
17 |     export AWS_SESSION_TOKEN
18 | fi
19 |
--------------------------------------------------------------------------------
/codebundles/aws-lambda-health/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/aws-s3-bucket-storage-report/.runwhen/generation-rules/aws-s3-bucket-storage-report.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: aws
5 | generationRules:
6 | - resourceTypes:
7 | - aws_s3_buckets
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: aws-s3-bucket-storage-report
15 | qualifiers: ["resource"]
16 | baseTemplateName: aws-s3-bucket-storage-report
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: runbook
21 | templateName: aws-s3-bucket-storage-report-taskset.yaml
22 |
--------------------------------------------------------------------------------
/codebundles/aws-s3-bucket-storage-report/.runwhen/templates/aws-s3-bucket-storage-report-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/s3.png
11 | alias: AWS S3 in {{match_resource.resource.region}}
12 | asMeasuredBy: Availability of S3 in {{match_resource.resource.region}}
13 | configProvided:
14 | - name: SLX_PLACEHOLDER
15 | value: SLX_PLACEHOLDER
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: S3 buckets in {{match_resource.resource.region}} should be available.
19 | additionalContext:
20 | {% include "aws.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "aws-tags.yaml" ignore missing %}
24 | - name: category
25 | value: storage
26 | - name: access
27 | value: read-only
--------------------------------------------------------------------------------
/codebundles/aws-s3-bucket-storage-report/README.md:
--------------------------------------------------------------------------------
1 | # aws-s3-bucket-storage-report CodeBundle
2 | ### Tags:`AWS`, `S3 Bucket`, `Storage Issue`
3 | ## CodeBundle Objective:
4 | Outputs the current usage values of all S3 buckets in a given AWS region, and the number of objects stored in them.
5 |
6 | ## CodeBundle Inputs:
7 |
8 | export AWS_REGION="PLACEHOLDER"
9 | export AWS_ACCESS_KEY_ID="PLACEHOLDER"
10 | export AWS_SECRET_ACCESS_KEY="PLACEHOLDER"
11 |
12 |
13 | ## CodeBundle Tasks:
14 | ### `Check AWS S3 Bucket Storage Utilization`
15 | #### Tags:`Amazon Web Services`, `AWS S3`, `Bucket Storage`
16 | ### Task Documentation:
17 | This script checks and displays the storage utilization of a specified AWS S3 bucket. It uses the AWS CLI to list all objects in the bucket recursively, displaying the results in a human-readable format and providing a summary of the total storage used.
18 | #### Usage Example:
19 | `./check_AWS_S3_bucket_storage_utilization.sh`
20 |
--------------------------------------------------------------------------------
/codebundles/aws-s3-bucket-storage-report/auth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # if required AWS_ cli vars (access key, secret key, region) are not set, error and exit 1
4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then
5 |     echo "AWS credentials not set. Please set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION environment variables."
6 |     exit 1
7 | fi
8 |
9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs; exit 1 if the role cannot be assumed
10 | if [[ -n $AWS_ROLE_ARN ]]; then
11 |     sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") || { echo "Failed to assume role $AWS_ROLE_ARN."; exit 1; }
12 |     AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId')
13 |     AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey')
14 |     AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken')
15 |     export AWS_ACCESS_KEY_ID
16 |     export AWS_SECRET_ACCESS_KEY
17 |     export AWS_SESSION_TOKEN
18 | fi
19 |
--------------------------------------------------------------------------------
/codebundles/aws-s3-bucket-storage-report/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/azure-acr-image-sync/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.runwhen/generation-rules/azure-adf-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_datafactory_factories
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-adf-health
15 | qualifiers: ["resource_group"]
16 | baseTemplateName: azure-adf-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: azure-adf-health-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.runwhen/templates/azure-adf-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "Azure Data Factory SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{ match_resource.resource_group.name }} Azure Data Factory SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{ match_resource.resource_group.name }} Azure Data Factory health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{ match_resource.resource_group.name }}-{{ "Azure Data Factory SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.test/README.md:
--------------------------------------------------------------------------------
1 | # Azure Data Factory Health
2 | This codebundle runs a suite of metrics checks for Data Factory in Azure. It identifies:
3 | -
4 |
5 | ## Configuration
6 |
7 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
8 |
9 | - `AZ_USERNAME`: Service principal's client ID
10 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration
11 | - `AZ_TENANT`: The Azure tenancy ID
12 | - `AZ_SUBSCRIPTION`: The Azure subscription ID
13 |
14 | ## Testing
15 | See the .test directory for infrastructure test code.
16 |
17 | ## Notes
18 |
19 | This codebundle assumes the service principal authentication flow
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.test/terraform/fail-sim-pipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "properties": {
3 | "activities": [
4 | {
5 | "name": "FailStep",
6 | "type": "Fail",
7 | "typeProperties": {
8 | "message": "Simulated failure for monitoring.",
9 | "errorCode": 500
10 | }
11 | }
12 | ]
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~> 4.7.0"
6 | }
7 | tls = {
8 | source = "hashicorp/tls"
9 | version = "~> 4.0"
10 | }
11 | azapi = {
12 | source = "azure/azapi"
13 | version = "2.3.0"
14 | }
15 | }
16 | }
17 |
18 | # Configure the Microsoft Azure Provider
19 | provider "azurerm" {
20 | features {}
21 | }
22 |
23 | provider "azuread" {}
24 | provider "tls" {}
25 | provider "azapi" {}
26 |
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-data-factory-health"
2 | name = "adf-hlth"
3 | location = "Canada Central"
4 | table_name = "dbo.NonExistentTable"
5 | tags = {
6 | "env" : "test",
7 | "lifecycle" : "deleteme",
8 | "product" : "runwhen"
9 | }
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "name" {
6 | type = string
7 | }
8 |
9 | variable "location" {
10 | type = string
11 | default = "East US"
12 | }
13 |
14 | variable "tags" {
15 | type = map(string)
16 | }
17 |
18 | variable "sp_principal_id" {
19 | type = string
20 | }
21 |
22 | variable "tenant_id" {
23 | type = string
24 | }
25 |
26 | # Name of the table used by the test infrastructure — NOTE(review): consumer not visible here, confirm usage
27 | variable "table_name" {
28 | type = string
29 | default = "dbo.CustomerTransactions"
30 | }
--------------------------------------------------------------------------------
/codebundles/azure-adf-health/README.md:
--------------------------------------------------------------------------------
1 | # Azure Data Factory Health
2 | This codebundle runs a suite of metrics checks for Data Factory in Azure. It identifies:
3 | - Check Azure Data Factory Availability
4 |
5 | ## Configuration
6 |
7 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
8 |
9 | - `AZURE_SUBSCRIPTION_ID`: The Azure subscription ID
10 | - `AZURE_RESOURCE_GROUP`: The Azure Resource Group
11 |
12 | ## Testing
13 | See the .test directory for infrastructure test code.
14 |
15 | ## Notes
16 |
17 | This codebundle assumes the service principal authentication flow
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.runwhen/generation-rules/azure-aks-triage.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_containerservice_managed_clusters
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-aks-triage
15 | qualifiers: [resource, resource_group]
16 | baseTemplateName: azure-aks-triage
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: azure-aks-triage-taskset.yaml
23 | - type: workflow
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.runwhen/templates/azure-aks-triage-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/containers/10023-icon-service-Kubernetes-Services.svg
11 | alias: {{match_resource.resource.name}} AKS Health
12 | asMeasuredBy: "As scored by upstream Azure resource health, critical or error activities, and configuration health. 1=Healthy, 0=Unhealthy"
13 | owners:
14 | - {{ workspace.owner_email }}
15 | statement: AKS Clusters should be in a healthy state.
16 | additionalContext:
17 | {% include "azure-hierarchy.yaml" ignore missing %}
18 | qualified_name: "{{ match_resource.qualified_name }}"
19 | tags:
20 | {% include "azure-tags.yaml" ignore missing %}
21 | - name: platform
22 | value: azure
23 | - name: service
24 | value: aks
25 | - name: access
26 | value: read-only
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.runwhen/templates/azure-aks-triage-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "AKS SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.name}} AKS SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} AKS health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.name}}-{{ "AKS SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~> 4.7.0"
6 | }
7 | }
8 | }
9 |
10 | # Configure the Microsoft Azure Provider
11 | provider "azurerm" {
12 | features {}
13 | }
14 |
15 | provider "azuread" {}
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-aks"
2 | cluster_name = "aks-cl-1"
3 | location = "East US"
4 | tags = {
5 | "env" : "test",
6 | "lifecycle" : "deleteme",
7 | "product" : "runwhen"
8 | }
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "East US"
8 | }
9 |
10 | variable "cluster_name" {
11 | type = string
12 | default = "aks-cl-1" # AKS cluster name; same value as terraform.tfvars (the old default was the region string "East US")
13 | }
14 |
15 | variable "tags" {
16 | type = map(string)
17 | }
18 |
19 | variable "sp_principal_id" {
20 | type = string
21 | }
22 |
23 | variable "tenant_id" {
24 | type = string
25 | }
--------------------------------------------------------------------------------
/codebundles/azure-aks-triage/README.md:
--------------------------------------------------------------------------------
1 | # Azure AKS Cluster Triage
2 | This CodeBundle checks for AKS Cluster Health based on how Azure is reporting resource health, network configuration recommendations, activities that have occurred, and provisioning status of resources.
3 |
4 | ## Configuration
5 |
6 | The SLI & TaskSet require initialization to import necessary secrets, services, and user variables. The following variables should be set:
7 | - `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in
8 | - `AKS_CLUSTER`: The name of the AKS Cluster in the resource group to target with checks
9 | - `TIME_PERIOD_MINUTES`: The time window, in minutes, to look back for activities and events which may indicate issues.
10 |
11 | ## Notes
12 |
13 | This codebundle assumes the service principal authentication flow which is handled from the import secret Keyword.
14 |
15 |
16 | ## TODO
17 | - [ ] Add documentation
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.runwhen/generation-rules/azure-apim-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_apimanagement_service
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-apim-health # APIM-specific base name; az-appgw-health is the App Gateway bundle's base name
15 | qualifiers: ["resource", "resource_group"]
16 | baseTemplateName: azure-apim-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: azure-apim-health-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.runwhen/templates/azure-apim-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "APIM SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.name}} APIM SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} APIM health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.name}}-{{ "APIM SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.test/terraform/README.md:
--------------------------------------------------------------------------------
1 | ## Infrastructure
2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory)
3 |
4 | ## Usage
5 |
6 | ### State management
7 | State is managed locally with `terraform.tfstate` and is gitignored.
8 |
9 | ### Auth
10 | az login --use-device-code
11 |
12 | ### Requirements
13 | The following vars must exist:
14 |
15 | ```
16 | export ARM_SUBSCRIPTION_ID=[]
17 | export AZ_TENANT_ID=[]
18 | export AZ_CLIENT_SECRET=[]
19 | export AZ_CLIENT_ID=[]
20 | export AZ_SECRET_ID=[]
21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv)
22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID
23 | export TF_VAR_tenant_id=$AZ_TENANT_ID
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~>3.0"
6 | }
7 | }
8 | required_version = ">=1.0"
9 | }
10 |
11 | provider "azurerm" {
12 | features {}
13 | }
14 |
15 | # Pull subscription info from the current CLI session
16 | data "azurerm_subscription" "current" {}
17 |
18 | # Pull tenant and user details from the current CLI session
19 | data "azurerm_client_config" "current" {}
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-apim-health"
2 | location = "Canada Central"
3 | tags = {
4 | "env" : "test",
5 | "lifecycle" : "deleteme",
6 | "product" : "runwhen"
7 | }
8 | codebundle = "azure-apim-health"
9 | codecollection = "rw-cli-codecollection"
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | description = "Name of the resource group to create/use."
4 | }
5 |
6 | variable "location" {
7 | type = string
8 | description = "Azure location for all resources."
9 | }
10 |
11 | variable "tags" {
12 | type = map(string)
13 | description = "Tags to apply to resources."
14 | default = {}
15 | }
16 |
17 | variable "sp_principal_id" {
18 | type = string
19 | description = "Client (service principal) ID with access to the resources."
20 | }
21 |
22 | variable "codebundle" {
23 | type = string
24 | description = "Base name for your resources."
25 | default = "example-bundle"
26 | }
27 |
28 |
29 |
--------------------------------------------------------------------------------
/codebundles/azure-apim-health/README.md:
--------------------------------------------------------------------------------
1 |
2 | az login --use-device-code
3 | ## Test 1
4 | export APP_SERVICE_NAME=azure-apim-health-f1
5 | export AZ_RESOURCE_GROUP=azure-apim-health
6 | export APIM_NAME=azure-apim-health-apim
7 | export AZURE_RESOURCE_SUBSCRIPTION_ID=$ARM_SUBSCRIPTION_ID
8 | export AZURE_CONFIG_DIR=/var/tmp/runwhen/azure-apim-health/runbook.robot/.azure
9 | az login --use-device-code
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.runwhen/generation-rules/azure-appgateway-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_network_application_gateways
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-appgw-health
15 | qualifiers: ["resource", "resource_group"]
16 | baseTemplateName: azure-appgateway-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: azure-appgateway-health-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.runwhen/templates/azure-appgateway-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "App Gateway SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.name}} App Gateway SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} App Gateway health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.name}}-{{ "App Gateway SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.test/README.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | # Infrastructure Setup
4 | The terraform directory contains infrastructure used for testing. This infrastructure will build 2 app gateways, app services, app service plans. One of the app services is healthy, the other has a bad image.
5 |
6 | # Local Development Testing
7 |
8 |
9 | ro sli.robot
10 | ro runbook.robot
11 |
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~> 4.7.0"
6 | }
7 | tls = {
8 | source = "hashicorp/tls"
9 | version = "~> 4.0"
10 | }
11 | }
12 | }
13 |
14 | # Configure the Microsoft Azure Provider
15 | provider "azurerm" {
16 | features {}
17 | }
18 |
19 | provider "azuread" {}
20 | provider "tls" {}
21 |
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-appgateway-health"
2 | location = "Canada Central"
3 | tags = {
4 | "env" : "test",
5 | "lifecycle" : "deleteme",
6 | "product" : "runwhen"
7 | }
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "East US"
8 | }
9 |
10 | variable "tags" {
11 | type = map(string)
12 | }
13 |
14 | variable "sp_principal_id" {
15 | type = string
16 | }
17 |
18 | variable "tenant_id" {
19 | type = string
20 | }
21 |
22 | # Password for .pfx files (self-signed)
23 | variable "ssl_cert_password" {
24 | type = string
25 | default = "P@ssw0rd123!"
26 | }
--------------------------------------------------------------------------------
/codebundles/azure-appgateway-health/README.md:
--------------------------------------------------------------------------------
1 | # Azure Application Gateway Health
2 | Checks key metrics for Azure Application Gateways and queries the health status of backend pools used by the gateway.
3 |
4 |
5 | ## Configuration
6 |
7 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
8 |
9 | - `AZ_USERNAME`: Service principal's client ID
10 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration
11 | - `AZ_TENANT`: The Azure tenancy ID
12 | - `AZ_SUBSCRIPTION`: The Azure subscription ID
13 | - `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in
14 | - `APPGATEWAY`: The name of the application gateway in the resource group to target with checks
15 |
16 | ## Notes
17 |
18 | This codebundle assumes the service principal authentication flow.
19 |
20 | ## TODO
21 | - [ ] config best practices check
22 | - [ ] Add documentation
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.runwhen/generation-rules/azure-appservice-function-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_appservice_web_apps
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | - type: pattern
14 | pattern: "^functionapp(?:,.*)?$"
15 | properties: [kind]
16 | mode: substring
17 | slxs:
18 | - baseName: az-appsvc-function-health
19 | qualifiers: ["resource", "resource_group"]
20 | baseTemplateName: azure-appservice-function-health
21 | levelOfDetail: detailed
22 | outputItems:
23 | - type: slx
24 | - type: sli
25 | - type: runbook
26 | templateName: azure-appservice-function-health-taskset.yaml
27 | - type: workflow
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.runwhen/templates/azure-appservice-function-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "AppService Function App SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.name}} AppService Function App SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} AppService Function App health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.name}}-{{ "AppService Function App SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.test/README.md:
--------------------------------------------------------------------------------
1 |
2 | az login --use-device-code
3 | ## Test 1
4 | export APP_SERVICE_NAME=azure-appservice-triage-b1
5 | export AZ_RESOURCE_GROUP=azure-appservice-triage
6 |
7 | ## Test 2
8 | export APP_SERVICE_NAME=azure-appservice-triage-f1
9 | export AZ_RESOURCE_GROUP=azure-appservice-triage
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.test/terraform/README.md:
--------------------------------------------------------------------------------
1 | ## Infrastructure
2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory)
3 |
4 | ## Usage
5 |
6 | ### State management
7 | State is managed locally with `terraform.tfstate` and is gitignored.
8 |
9 | ### Auth
10 | az login --use-device-code
11 |
12 | ### Requirements
13 | The following vars must exist:
14 |
15 | ```
16 | export ARM_SUBSCRIPTION_ID=[]
17 | export AZ_TENANT_ID=[]
18 | export AZ_CLIENT_SECRET=[]
19 | export AZ_CLIENT_ID=[]
20 | export AZ_SECRET_ID=[]
21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv)
22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID
23 | export TF_VAR_tenant_id=$AZ_TENANT_ID
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~> 4.7.0"
6 | }
7 | random = {
8 | source = "hashicorp/random"
9 | version = "~> 3.5"
10 | }
11 | }
12 | }
13 |
14 | # Configure the Microsoft Azure Provider
15 | provider "azurerm" {
16 | features {}
17 | }
18 |
19 | provider "azuread" {}
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-apps-func-hlth"
2 | location = "Canada Central"
3 | tags = {
4 | "env" : "test",
5 | "lifecycle" : "deleteme",
6 | "product" : "runwhen"
7 | }
8 | codebundle = "azure-apps-func-hlth"
9 | codecollection = "rw-cli-codecollection"
10 |
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "Canada Central"
8 | }
9 |
10 | variable "tags" {
11 | type = map(string)
12 | }
13 |
14 | variable "sp_principal_id" {
15 | type = string
16 | }
17 |
18 | variable "tenant_id" {
19 | type = string
20 | }
21 |
22 | variable "codebundle" {
23 | type = string
24 | }
25 |
26 | variable "codecollection" {
27 | type = string
28 | }
--------------------------------------------------------------------------------
/codebundles/azure-appservice-functionapp-health/README.md:
--------------------------------------------------------------------------------
1 | # Azure App Service Triage
2 | Checks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found.
3 |
4 | ## Configuration
5 |
6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
7 |
8 | export APPSERVICE
9 | export AZ_RESOURCE_GROUP
10 |
11 | ## Notes
12 |
13 | This codebundle assumes the service principal authentication flow.
14 |
15 | ## TODO
16 | - [ ] look for notable activities in list
17 | - [ ] config best practices check
18 | - [ ] Add documentation
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.runwhen/generation-rules/azure-appservice-webapp-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_appservice_web_apps
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | - type: pattern
14 | pattern: "^app(?:,.*)?$"
15 | properties: [kind]
16 | mode: substring
17 | slxs:
18 | - baseName: az-appsvc-web-health
19 | qualifiers: ["resource", "resource_group"]
20 | baseTemplateName: azure-appservice-webapp-health
21 | levelOfDetail: detailed
22 | outputItems:
23 | - type: slx
24 | - type: sli
25 | - type: runbook
26 | templateName: azure-appservice-webapp-health-taskset.yaml
27 | - type: workflow
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.runwhen/templates/azure-appservice-webapp-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "AppService Webapp SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.name}} AppService Webapp SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} AppService WebApp health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.name}}-{{ "AppService Webapp SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.test/README.md:
--------------------------------------------------------------------------------
1 |
2 | az login --use-device-code
3 | ## Test 1
4 | export APP_SERVICE_NAME=azure-appservice-triage-b1
5 | export AZ_RESOURCE_GROUP=azure-appservice-triage
6 |
7 | ## Test 2
8 | export APP_SERVICE_NAME=azure-appservice-triage-f1
9 | export AZ_RESOURCE_GROUP=azure-appservice-triage
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.test/terraform/README.md:
--------------------------------------------------------------------------------
1 | ## Infrastructure
2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory)
3 |
4 | ## Usage
5 |
6 | ### State management
7 | State is managed locally with `terraform.tfstate` and is gitignored.
8 |
9 | ### Auth
10 | az login --use-device-code
11 |
12 | ### Requirements
13 | The following vars must exist:
14 |
15 | ```
16 | export ARM_SUBSCRIPTION_ID=[]
17 | export AZ_TENANT_ID=[]
18 | export AZ_CLIENT_SECRET=[]
19 | export AZ_CLIENT_ID=[]
20 | export AZ_SECRET_ID=[]
21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv)
22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID
23 | export TF_VAR_tenant_id=$AZ_TENANT_ID
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~> 4.7.0"
6 | }
7 | }
8 | }
9 |
10 | # Configure the Microsoft Azure Provider
11 | provider "azurerm" {
12 | features {}
13 | }
14 |
15 | provider "azuread" {}
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-appservice-triage"
2 | location = "Canada Central"
3 | tags = {
4 | "env" : "test",
5 | "lifecycle" : "deleteme",
6 | "product" : "runwhen"
7 | }
8 | codebundle = "azure-appservice-triage"
9 | codecollection = "rw-cli-codecollection"
10 |
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "Canada Central"
8 | }
9 |
10 | variable "tags" {
11 | type = map(string)
12 | }
13 |
14 | variable "sp_principal_id" {
15 | type = string
16 | }
17 |
18 | variable "tenant_id" {
19 | type = string
20 | }
21 |
22 | variable "codebundle" {
23 | type = string
24 | }
25 |
26 | variable "codecollection" {
27 | type = string
28 | }
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-health/README.md:
--------------------------------------------------------------------------------
1 | # Azure App Service Triage
2 | Checks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found.
3 |
4 | ## Configuration
5 |
6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
7 |
8 | export APPSERVICE
9 | export AZ_RESOURCE_GROUP
10 |
11 | ## Notes
12 |
13 | This codebundle assumes the service principal authentication flow.
14 |
15 | ## TODO
16 | - [ ] look for notable activities in list
17 | - [ ] config best practices check
18 | - [ ] Add documentation
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.runwhen/generation-rules/azure-appservice-webapp-ops.yaml:
--------------------------------------------------------------------------------
1 | # apiVersion: runwhen.com/v1
2 | # kind: GenerationRules
3 | # spec:
4 | # platform: azure
5 | # generationRules:
6 | # - resourceTypes:
7 | # - azure_appservice_web_apps
8 | # matchRules:
9 | # - type: pattern
10 | # pattern: ".+"
11 | # properties: [name]
12 | # mode: substring
13 | # - type: pattern
14 | # pattern: "^app(?:,.*)?$"
15 | # properties: [kind]
16 | # mode: substring
17 | # slxs:
18 | # - baseName: az-appsvc-webapp-ops
19 | # qualifiers: ["resource", "resource_group"]
20 | # baseTemplateName: azure-appservice-webapp-ops
21 | # levelOfDetail: basic
22 | # outputItems:
23 | # - type: slx
24 | # - type: runbook
25 | # templateName: azure-appservice-webapp-ops-taskset.yaml
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.test/README.md:
--------------------------------------------------------------------------------
1 |
2 | az login --use-device-code
3 | ## Test 1
4 | export APP_SERVICE_NAME=azure-appservice-triage-b1
5 | export AZ_RESOURCE_GROUP=azure-appservice-triage
6 |
7 | ## Test 2
8 | export APP_SERVICE_NAME=azure-appservice-triage-f1
9 | export AZ_RESOURCE_GROUP=azure-appservice-triage
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.test/terraform/README.md:
--------------------------------------------------------------------------------
1 | ## Infrastructure
2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory)
3 |
4 | ## Usage
5 |
6 | ### State management
7 | State is managed locally with `terraform.tfstate` and is gitignored.
8 |
9 | ### Auth
10 | az login --use-device-code
11 |
12 | ### Requirements
13 | The following vars must exist:
14 |
15 | ```
16 | export ARM_SUBSCRIPTION_ID=[]
17 | export AZ_TENANT_ID=[]
18 | export AZ_CLIENT_SECRET=[]
19 | export AZ_CLIENT_ID=[]
20 | export AZ_SECRET_ID=[]
21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv)
22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID
23 | export TF_VAR_tenant_id=$AZ_TENANT_ID
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | azurerm = {
4 | source = "hashicorp/azurerm"
5 | version = "~> 4.7.0"
6 | }
7 | }
8 | }
9 |
10 | # Configure the Microsoft Azure Provider
11 | provider "azurerm" {
12 | features {}
13 | }
14 |
15 | provider "azuread" {}
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-appservice-ops"
2 | location = "Canada Central"
3 | tags = {
4 | "env" : "test",
5 | "lifecycle" : "deleteme",
6 | "product" : "runwhen"
7 | }
8 | codebundle = "azure-appservice-ops"
9 | codecollection = "rw-cli-codecollection"
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "Canada Central"
8 | }
9 |
10 | variable "tags" {
11 | type = map(string)
12 | }
13 |
14 | variable "sp_principal_id" {
15 | type = string
16 | }
17 |
18 | variable "tenant_id" {
19 | type = string
20 | }
21 |
22 | variable "codebundle" {
23 | type = string
24 | }
25 |
26 | variable "codecollection" {
27 | type = string
28 | }
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`
3 | - Checks whether the plan supports deployment slots (Standard or Premium tier).
4 | - Lists all available slots.
5 | - If SOURCE_SLOT and TARGET_SLOT are not provided, it attempts to figure them out automatically, assuming:
6 | - The “production” slot is the default slot with "isSlot": false.
7 | - The non-production slot(s) have "isSlot": true.
8 | - If exactly one non-production slot exists, we set source to that slot and target to "production".
9 | - If there are multiple non-production slots, we fail unless the user specifies which ones to swap.
--------------------------------------------------------------------------------
/codebundles/azure-appservice-webapp-ops/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/.runwhen/generation-rules/azure-kv-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_keyvault_keyvault
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-kv-health
15 | qualifiers: ["resource_group"]
16 | baseTemplateName: azure-kv-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: azure-kv-health-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/.runwhen/templates/azure-kv-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "Key Vault SLI Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{ match_resource.resource_group.name }} Key Vault SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{ match_resource.resource_group.name }} Key Vault health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{ match_resource.resource_group.name }}-{{ "Key Vault SLI Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 |   required_providers {
3 |     azurerm = {
4 |       source  = "hashicorp/azurerm"
5 |       version = "4.18.0"
6 |     }
7 |     # Declared explicitly because an azuread provider block is configured below;
8 |     # otherwise Terraform relies on implicit hashicorp/* source resolution.
9 |     azuread = {
10 |       source = "hashicorp/azuread"
11 |     }
12 |   }
13 | }
14 |
15 | provider "azurerm" {
16 |   features {
17 |     # Key Vault soft-delete handling for this throwaway test infrastructure.
18 |     key_vault {
19 |       purge_soft_delete_on_destroy    = true
20 |       recover_soft_deleted_key_vaults = true
21 |     }
22 |   }
23 | }
24 |
25 | # Azure AD provider, default configuration.
26 | provider "azuread" {}
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-vm-triage"
2 | location = "East US"
3 | kv_name = "test-yoko"
4 | tags = {
5 | "env" : "test",
6 | "lifecycle" : "deleteme",
7 | "product" : "runwhen"
8 | }
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "East US"
8 | }
9 |
10 |
11 | variable "tags" {
12 | type = map(string)
13 | }
14 |
15 | variable "sp_principal_id" {
16 | type = string
17 | }
18 |
19 | variable "kv_name" {
20 | type = string
21 | }
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/README.md:
--------------------------------------------------------------------------------
1 | # Azure Key Vault Health
2 | This codebundle runs a suite of metrics checks for Key Vault in Azure. It includes the following tasks:
3 | - Check Key Vault Availability
4 | - Check Key Vault Configuration
5 | - Check Expiring Key Vault Items (Keys, Secrets and Certificates)
6 | - Check Key Vault Logs for Issues
7 | - Check Key Vault Performance Metrics
8 |
9 | ## Configuration
10 |
11 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
12 |
13 | - `AZ_USERNAME`: Service principal's client ID
14 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration
15 | - `AZ_TENANT`: The Azure tenant ID
16 | - `AZ_SUBSCRIPTION`: The Azure subscription ID
17 |
18 | ## Testing
19 | See the .test directory for infrastructure test code.
20 |
21 | ## Notes
22 |
23 | This codebundle assumes the service principal authentication flow.
--------------------------------------------------------------------------------
/codebundles/azure-kv-health/availability.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Reports the latest hourly-average "Availability" metric for every Key Vault
4 | # in a resource group as a single JSON document on stdout:
5 | #   {"metrics":[{"kv_name":"<vault>","percentage":"<value or N/A>"},...]}
6 | #
7 | # Environment:
8 | #   AZURE_RESOURCE_SUBSCRIPTION_ID  subscription passed to the az calls
9 | #   AZURE_RESOURCE_GROUP            resource group whose vaults are listed
10 |
11 | subscription_id="$AZURE_RESOURCE_SUBSCRIPTION_ID"
12 | resource_group="$AZURE_RESOURCE_GROUP"
13 |
14 | # Loop-invariant: compute the 24h look-back once so every vault is queried
15 | # over the same window (the -d option requires GNU date).
16 | start_time="$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)"
17 | vault_id_prefix="/subscriptions/$subscription_id/resourceGroups/$resource_group/providers/Microsoft.KeyVault/vaults"
18 |
19 | # One vault name per line; mapfile avoids word-splitting/globbing of the output.
20 | mapfile -t vaults < <(az keyvault list -g "$resource_group" --subscription "$subscription_id" --query "[].name" -o tsv)
21 |
22 | json_output='{"metrics":['
23 | first=true
24 |
25 | for kv in "${vaults[@]}"; do
26 |     # Skip blank lines so an empty listing yields an empty metrics array.
27 |     [ -n "$kv" ] || continue
28 |
29 |     # Last data point of the first time series for the Availability metric.
30 |     availability=$(az monitor metrics list \
31 |         --resource "$vault_id_prefix/$kv" \
32 |         --metric Availability \
33 |         --aggregation average \
34 |         --interval PT1H \
35 |         --query "value[0].timeseries[0].data[-1].average" \
36 |         --start-time "$start_time" \
37 |         --output tsv)
38 |
39 |     # Default to N/A if no data is returned
40 |     availability=${availability:-"N/A"}
41 |
42 |     # Append to JSON array, comma-separating every entry after the first
43 |     if [ "$first" = true ]; then
44 |         first=false
45 |     else
46 |         json_output+=','
47 |     fi
48 |     json_output+="{\"kv_name\":\"$kv\",\"percentage\":\"$availability\"}"
49 | done
50 |
51 | json_output+=']}'
52 | echo "$json_output"
--------------------------------------------------------------------------------
/codebundles/azure-loadbalancer-triage/.runwhen/generation-rules/az-lb-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_network_load_balancers
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-lb-health
15 | levelOfDetail: basic
16 | qualifiers: [resource, resource_group]
17 | baseTemplateName: az-lb-health
18 | outputItems:
19 | - type: slx
20 | - type: runbook
21 | templateName: az-lb-health-taskset.yaml
22 |
--------------------------------------------------------------------------------
/codebundles/azure-loadbalancer-triage/.runwhen/templates/az-lb-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/networking/10062-icon-service-Load-Balancers.svg
11 | alias: {{match_resource.name}} Azure Load Balancer Health
12 | asMeasuredBy: "Querying the Azure Load Balancer health for incidents or critical events."
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Ensure Azure Network Load Balancers are healthy.
19 | additionalContext:
20 | {% include "azure-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "azure-tags.yaml" ignore missing %}
24 | - name: service
25 | value: loadbalancer
26 | - name: access
27 | value: read-only
--------------------------------------------------------------------------------
/codebundles/azure-loadbalancer-triage/README.md:
--------------------------------------------------------------------------------
1 | # Azure LoadBalancer Triage
2 |
3 | Queries the activity logs of internal load balancer (AKS ingress) objects in Azure and optionally inspects the internal AKS ingress objects if available.
4 |
5 | ## Tasks
6 | `Health Check Internal Azure Load Balancer`
7 |
8 | ## Configuration
9 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
10 |
11 | - `AZ_USERNAME`: Azure service account username secret used to authenticate.
12 | - `AZ_CLIENT_SECRET`: Azure service account client secret used to authenticate.
13 | - `AZ_TENANT`: Azure tenant ID used to authenticate to.
14 | - `AZ_HISTORY_RANGE`: The history range to inspect for incidents in the activity log, in hours. Defaults to 24 hours.
15 |
16 | ## Requirements
17 | - A kubeconfig with appropriate RBAC permissions to perform the desired command.
18 |
19 | ## TODO
20 | - [ ] Refine issues raised
21 | - [ ] Array support for issues
22 | - [ ] Look at cross az/kubectl for better triage
23 | - [ ] Add additional documentation.
24 |
25 |
--------------------------------------------------------------------------------
/codebundles/azure-loadbalancer-triage/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/azure-servicebus-health/.runwhen/generation-rules/azure-servicebus-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_servicebus_namespaces
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-servicebus-health
15 | qualifiers: ["resource", "resource_group"]
16 | baseTemplateName: az-servicebus-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: az-servicebus-health-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/azure-servicebus-health/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-servicebus-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 |   required_providers {
3 |     azurerm = {
4 |       source  = "hashicorp/azurerm"
5 |       version = "~> 4.7.0"
6 |     }
7 |     # Declared explicitly because an azuread provider block is configured below;
8 |     # otherwise Terraform relies on implicit hashicorp/* source resolution.
9 |     azuread = {
10 |       source = "hashicorp/azuread"
11 |     }
12 |     tls = {
13 |       source  = "hashicorp/tls"
14 |       version = "~> 4.0"
15 |     }
16 |   }
17 | }
18 |
19 | # Configure the Microsoft Azure Provider
20 | provider "azurerm" {
21 |   features {}
22 | }
23 |
24 | # Azure AD and TLS providers, default configuration.
25 | provider "azuread" {}
26 | provider "tls" {}
27 |
--------------------------------------------------------------------------------
/codebundles/azure-servicebus-health/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-servicebus-health"
2 | location = "Canada Central"
3 | secondary_location = "East US"
4 | tags = {
5 | "env" : "test",
6 | "lifecycle" : "deleteme",
7 | "product" : "runwhen"
8 | }
--------------------------------------------------------------------------------
/codebundles/azure-servicebus-health/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "East US"
8 | }
9 |
10 | variable "secondary_location" {
11 | type = string
12 | default = "East US"
13 | }
14 |
15 | variable "tags" {
16 | type = map(string)
17 | }
18 |
19 | variable "sp_principal_id" {
20 | type = string
21 | }
22 |
23 | variable "tenant_id" {
24 | type = string
25 | }
26 |
27 | # Password for .pfx files (self-signed). The default is a test-only value;
28 | # the variable is marked sensitive so Terraform redacts it in plan/apply output.
29 | variable "ssl_cert_password" {
30 |   type      = string
31 |   default   = "P@ssw0rd123!"
32 |   sensitive = true
33 | }
34 |
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.runwhen/generation-rules/azure-vmss-triage.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: azure
5 | generationRules:
6 | - resourceTypes:
7 | - azure_compute_virtual_machine_scale_sets
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: az-vmss-triage
15 | qualifiers: ["resource", "resource_group"]
16 | baseTemplateName: azure-vmss-triage
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: azure-vmss-triage-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.runwhen/templates/azure-vmss-triage-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/compute/10034-icon-service-VM-Scale-Sets.svg
11 | alias: Azure VM Scale Set {{match_resource.resource.name}}
12 | asMeasuredBy: Composite health score of resources & activities.
13 | configProvided:
14 | - name: SLX_PLACEHOLDER
15 | value: SLX_PLACEHOLDER
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: VM Scale Set should be available.
19 | additionalContext:
20 | {% include "azure-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "azure-tags.yaml" ignore missing %}
24 | - name: service
25 | value: vmss
26 | - name: access
27 | value: read-only
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.test/terraform/README.md:
--------------------------------------------------------------------------------
1 | ## Infrastructure
2 | This will build out a simple VM scale set in a dedicated resource group and enable the configured service principal (SP) to own those resources, which is needed when testing discovery with RunWhen Local (through the Taskfile in the parent directory).
3 |
4 | ## Usage
5 |
6 | ### State management
7 | State is managed locally with `terraform.tfstate` and is gitignored.
8 |
9 | ### Auth
10 | `az login --use-device-code`
11 |
12 | ### Requirements
13 | The following vars must exist:
14 |
15 | ```
16 | export ARM_SUBSCRIPTION_ID=[]
17 | export AZ_TENANT_ID=[]
18 | export AZ_CLIENT_SECRET=[]
19 | export AZ_CLIENT_ID=[]
20 | export AZ_SECRET_ID=[]
21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv)
22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID
23 | export TF_VAR_tenant_id=$AZ_TENANT_ID
24 | ```
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.test/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "local" {
3 | path = "terraform.tfstate"
4 | }
5 | }
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 |   required_providers {
3 |     # NOTE(review): no version constraint here, unlike the sibling test bundles
4 |     # that pin azurerm — consider pinning once the intended version is confirmed.
5 |     azurerm = {
6 |       source = "hashicorp/azurerm"
7 |     }
8 |     # Declared explicitly because an azuread provider block is configured below;
9 |     # otherwise Terraform relies on implicit hashicorp/* source resolution.
10 |     azuread = {
11 |       source = "hashicorp/azuread"
12 |     }
13 |   }
14 | }
15 |
16 | # Configure the Microsoft Azure Provider
17 | provider "azurerm" {
18 |   features {}
19 | }
20 |
21 | # Azure AD provider, default configuration.
22 | provider "azuread" {}
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.test/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | resource_group = "azure-vm-triage"
2 | location = "East US"
3 | tags = {
4 | "env" : "test",
5 | "lifecycle" : "deleteme",
6 | "product" : "runwhen"
7 | }
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/.test/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "resource_group" {
2 | type = string
3 | }
4 |
5 | variable "location" {
6 | type = string
7 | default = "East US"
8 | }
9 |
10 |
11 | variable "tags" {
12 | type = map(string)
13 | }
14 |
15 | variable "sp_principal_id" {
16 | type = string
17 | }
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/README.md:
--------------------------------------------------------------------------------
1 | # Azure Virtual Machine Scale Set Triage
2 | This codebundle runs a suite of metrics checks for a VM Scale Set in Azure. It fetches activities and the current configuration, which are added to a report for point-in-time review.
3 |
4 | ## Configuration
5 |
6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
7 |
8 | - `AZ_USERNAME`: Service principal's client ID
9 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration
10 | - `AZ_TENANT`: The Azure tenant ID
11 | - `AZ_SUBSCRIPTION`: The Azure subscription ID
12 | - `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in
13 | - `VMSCALESET`: The name of the VM Scale Set in the resource group to target with checks
14 |
15 | ## Notes
16 |
17 | This codebundle assumes the service principal authentication flow.
18 |
19 | ## TODO
20 | - [ ] remote exec functionality
21 | - [ ] look for notable activities in list
22 | - [ ] config best practices check
23 | - [ ] Add documentation
--------------------------------------------------------------------------------
/codebundles/azure-vmss-triage/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/generation-rules/http-ok-tls.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - ingress
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | - type: pattern
15 | pattern: ".+"
16 | properties: [spec/tls/hosts]
17 | mode: substring
18 | slxs:
19 | - baseName: http-ok-tls-test
20 | qualifiers: ["resource", "namespace", "cluster"]
21 | baseTemplateName: http-ok-tls
22 | levelOfDetail: basic
23 | outputItems:
24 | - type: slx
25 | - type: sli
26 | - type: slo
27 | - type: runbook
28 | templateName: http-ok-tls-taskset.yaml
29 |
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/generation-rules/http-ok.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - ingress
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | - type: not
15 | predicate:
16 | type: exists
17 | path: spec/tls/hosts
18 | slxs:
19 | - baseName: http-ok
20 | qualifiers: ["resource", "namespace", "cluster"]
21 | baseTemplateName: http-ok
22 | levelOfDetail: basic
23 | outputItems:
24 | - type: slx
25 | - type: sli
26 | - type: slo
27 | - type: runbook
28 | templateName: http-ok-taskset.yaml
29 |
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | alias: {{match_resource.resource.metadata.name}} Public HTTP OK
11 | asMeasuredBy: HTTP 200 returned within the desired latency.
12 | configProvided:
13 | - name: OBJECT_NAME
14 | value: {{match_resource.resource.metadata.name}}
15 | icon: Cloud
16 | owners:
17 | - {{workspace.owner_email}}
18 | # Keep the stated latency in sync with TARGET_LATENCY ('1.2') in http-ok-taskset.yaml.
19 | statement: HTTP Ingress resources should respond with HTTP 200 in under 1.2s
20 | additionalContext:
21 | namespace: "{{match_resource.resource.metadata.namespace}}"
22 | labelMap: "{{match_resource.resource.metadata.labels}}"
23 | cluster: "{{ cluster.name }}"
24 | context: "{{ cluster.context }}"
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | codeBundle:
12 | {% if repo_url %}
13 | repoUrl: {{repo_url}}
14 | {% else %}
15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
16 | {% endif %}
17 | {% if ref %}
18 | ref: {{ref}}
19 | {% else %}
20 | ref: main
21 | {% endif %}
22 | pathToRobot: codebundles/curl-http-ok/runbook.robot
23 | configProvided:
24 | - name: URL
25 | value: http://{{match_resource.resource.spec.rules[0].host}}
26 | - name: TARGET_LATENCY
27 | value: '1.2'
28 | - name: DESIRED_RESPONSE_CODE
29 | value: '200'
30 | - name: OWNER_DETAILS
31 | value: '{"name":"{{match_resource.resource.metadata.name}}", "kind":"Ingress","namespace":"{{match_resource.resource.metadata.namespace}}"}'
32 | secretsProvided: []
33 |
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-aks-public-loadbalancer-ext-dns-tls-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | alias: {{match_resource.resource.metadata.name}} Public HTTP OK (Azure LB)
11 | asMeasuredBy: HTTP 200 returned within the desired latency.
12 | configProvided:
13 | - name: OBJECT_NAME
14 | value: {{match_resource.resource.metadata.name}}
15 | icon: Cloud
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: HTTP AKS LoadBalancer and Service resources should respond with HTTP 200 in under 1s
19 | additionalContext:
20 | namespace: "{{match_resource.resource.metadata.namespace}}"
21 | labelMap: "{{match_resource.resource.metadata.labels}}"
22 | cluster: "{{ cluster.name }}"
23 | context: "{{ cluster.context }}"
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | alias: {{match_resource.resource.metadata.name}} Public HTTP OK
11 | asMeasuredBy: HTTP 200 returned within the desired latency.
12 | configProvided:
13 | - name: OBJECT_NAME
14 | value: {{match_resource.resource.metadata.name}}
15 | icon: Cloud
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: HTTP Ingress resources should respond with HTTP 200 in under 1.2s
19 | additionalContext:
20 | namespace: "{{match_resource.resource.metadata.namespace}}"
21 | labelMap: "{{match_resource.resource.metadata.labels}}"
22 | cluster: "{{ cluster.name }}"
23 | context: "{{ cluster.context }}"
--------------------------------------------------------------------------------
/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | codeBundle:
12 | {% if repo_url %}
13 | repoUrl: {{repo_url}}
14 | {% else %}
15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
16 | {% endif %}
17 | {% if ref %}
18 | ref: {{ref}}
19 | {% else %}
20 | ref: main
21 | {% endif %}
22 | pathToRobot: codebundles/curl-http-ok/runbook.robot
23 | configProvided:
24 | - name: URL
25 | value: https://{{match_resource.resource.spec.tls[0].hosts[0]}}
26 | - name: TARGET_LATENCY
27 | value: '1.2'
28 | - name: DESIRED_RESPONSE_CODE
29 | value: '200'
30 | - name: OWNER_DETAILS
31 | value: '{"name":"{{match_resource.resource.metadata.name}}", "kind":"Ingress","namespace":"{{match_resource.resource.metadata.namespace}}"}'
32 | secretsProvided: []
33 |
--------------------------------------------------------------------------------
/codebundles/gcloud-log-inspection/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/gcloud-node-preempt/.runwhen/generation-rules/gcloud-node-preempt.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: gcp
5 | generationRules:
6 | - resourceTypes:
7 | - gcp_compute_instances
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [scheduling/preemptible]
12 | mode: substring
13 | slxs:
14 | - baseName: node-preempt
15 | qualifiers: ["project"]
16 | baseTemplateName: gcloud-node-preempt
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: slo
22 | - type: runbook
23 | templateName: gcloud-node-preempt-taskset.yaml
24 |
--------------------------------------------------------------------------------
/codebundles/gcloud-node-preempt/.runwhen/templates/gcloud-node-preempt-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: lt
--------------------------------------------------------------------------------
/codebundles/gcloud-node-preempt/.runwhen/templates/gcloud-node-preempt-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | description: Counts the total number of nodes undergoing a preempt event.
12 | codeBundle:
13 | {% if repo_url %}
14 | repoUrl: {{repo_url}}
15 | {% else %}
16 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
17 | {% endif %}
18 | {% if ref %}
19 | ref: {{ref}}
20 | {% else %}
21 | ref: main
22 | {% endif %}
23 | pathToRobot: codebundles/gcloud-node-preempt/runbook.robot
24 | intervalStrategy: intermezzo
25 | intervalSeconds: 300
26 | configProvided:
27 | - name: GCP_PROJECT_ID
28 | value: {{match_resource.resource.project_id}}
29 | - name: AGE
30 | value: '30'
31 | secretsProvided:
32 | - name: gcp_credentials_json
33 | workspaceKey: {{custom.gcp_ops_suite_sa}}
--------------------------------------------------------------------------------
/codebundles/gcloud-node-preempt/README.md:
--------------------------------------------------------------------------------
1 | # gcloud Node Preempt List
2 | This code checks if any GCP (Google Cloud Platform) nodes have an active preempt operation. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information.
3 |
4 |
5 | ## SLI
6 | The SLI lists all preempt node operations that have a status that does not match "DONE", counts the total nodes in this state, and pushes the metric.
7 |
8 | ## TaskSet
9 | The Taskset lists all preempt node operations that have a status that does not match "DONE" and returns the following details in json format:
10 |
11 | - startTime
12 | - targetLink
13 | - statusMessage
14 | - progress
15 | - zone
16 | - selfLink
17 |
18 |
19 | ## Requirements
20 | The following permissions are required on the GCP service account used with the gcloud utility:
21 |
22 | - 'compute.globalOperations.list'
--------------------------------------------------------------------------------
/codebundles/gcp-bucket-health/.runwhen/generation-rules/gcp-bucket-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: gcp
5 | generationRules:
6 | - resourceTypes:
7 | - gcp_storage_buckets
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: gcp-bucket-health
15 | qualifiers: ["project"]
16 | baseTemplateName: gcp-bucket-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: slo
22 | - type: runbook
23 | templateName: gcp-bucket-health-taskset.yaml
24 |
--------------------------------------------------------------------------------
/codebundles/gcp-bucket-health/.runwhen/templates/gcp-bucket-health-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/gcp-bucket-health/README.md:
--------------------------------------------------------------------------------
1 | # GCP Bucket Health
2 | This code checks if any GCP (Google Cloud Platform) buckets are unhealthy, focusing on:
3 | - Utilization (with a user defined threshold for issue/alert generation)
4 | - Security Configuration (with a user defined threshold on when to generate issues/alerts for publicly accessible buckets)
5 |
6 |
7 | ## SLI
8 | The SLI:
9 | - counts the number of buckets that are above the user defined threshold
10 | - counts the number of publicly accessible buckets above the user defined threshold
11 |
12 | ## TaskSet
13 | The TaskSet provides the following tasks:
14 |
15 | - Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}`
16 | - Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report
17 | - Check GCP Bucket Security Configuration for `${PROJECT_IDS}`
18 |
19 | ## Requirements
20 | The following roles are useful on the GCP service account used with the gcloud utility:
21 |
22 | - Viewer
23 | - Security Reviewer
24 |
25 | ## TODO
26 | Update required GCP SA permissions.
--------------------------------------------------------------------------------
/codebundles/gcp-cloud-function-health/.runwhen/generation-rules/gcp-cloud-function-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: gcp
5 | generationRules:
6 | - resourceTypes:
7 | - gcp_functions_functions
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: gcp-function-health
15 | qualifiers: ["project"]
16 | baseTemplateName: gcp-cloud-function-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: slo
22 | - type: runbook
23 | templateName: gcp-cloud-function-health-taskset.yaml
24 |
--------------------------------------------------------------------------------
/codebundles/gcp-cloud-function-health/.runwhen/templates/gcp-cloud-function-health-sli.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelIndicator
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | displayUnitsLong: Number
11 | displayUnitsShort: '#'
12 | locations:
13 | - {{default_location}}
14 | description: Counts the number of GCP Cloud Functions in a FAILED state.
15 | codeBundle:
16 | {% if repo_url %}
17 | repoUrl: {{repo_url}}
18 | {% else %}
19 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
20 | {% endif %}
21 | {% if ref %}
22 | ref: {{ref}}
23 | {% else %}
24 | ref: main
25 | {% endif %}
26 | pathToRobot: codebundles/gcp-cloud-function-health/sli.robot
27 | intervalStrategy: intermezzo
28 | intervalSeconds: 300
29 | configProvided:
30 | - name: GCP_PROJECT_ID
31 | value: {{match_resource.resource.project_id}}
32 | secretsProvided:
33 | - name: gcp_credentials_json
34 | workspaceKey: {{custom.gcp_ops_suite_sa}}
--------------------------------------------------------------------------------
/codebundles/gcp-cloud-function-health/.runwhen/templates/gcp-cloud-function-health-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 0
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/gcp-cloud-function-health/.runwhen/templates/gcp-cloud-function-health-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | description: Runs tasks to triage unhealthy GCP Cloud Functions
12 | codeBundle:
13 | {% if repo_url %}
14 | repoUrl: {{repo_url}}
15 | {% else %}
16 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
17 | {% endif %}
18 | {% if ref %}
19 | ref: {{ref}}
20 | {% else %}
21 | ref: main
22 | {% endif %}
23 | pathToRobot: codebundles/gcp-cloud-function-health/runbook.robot
24 | intervalStrategy: intermezzo
25 | intervalSeconds: 300
26 | configProvided:
27 | - name: GCP_PROJECT_ID
28 | value: {{match_resource.resource.project_id}}
29 | secretsProvided:
30 | - name: gcp_credentials_json
31 | workspaceKey: {{custom.gcp_ops_suite_sa}}
--------------------------------------------------------------------------------
/codebundles/gcp-cloud-function-health/README.md:
--------------------------------------------------------------------------------
1 | # GCP Cloud Function Health
2 | This code checks if any GCP (Google Cloud Platform) cloud functions are unhealthy. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information.
3 |
4 | > Note: Only cloud functions v1 is supported at this time for automatic discovery with the RunWhen Local Discovery Process. The tasks will support either generation.
5 |
6 | ## SLI
7 | The SLI counts the number of cloud functions that are in a "FAILED" state and pushes the metric.
8 |
9 | ## TaskSet
10 | The TaskSet provides the following tasks:
11 |
12 | - List Unhealthy Cloud Functions in GCP Project
13 | - Get Error Logs for Unhealthy Cloud Functions in GCP Project
14 |
15 | ## Requirements
16 | The following permissions are required on the GCP service account used with the gcloud utility:
17 |
18 | - `cloudfunctions.functions.get`
19 | - `cloudfunctions.functions.list`
--------------------------------------------------------------------------------
/codebundles/gh-actions-artifact-analysis/README.md:
--------------------------------------------------------------------------------
1 | # GitHub Actions Artifact Analysis
2 | This codebundle is highly configurable and integrates with GitHub Actions and workflow artifacts. It downloads a specified artifact from the last workflow run and analyzes the artifact with a user-provided command (typically using Linux / bash tools like jq).
3 |
4 | ## SLI
5 | This SLI downloads the artifact from the latest run of the GitHub Actions workflow, runs the analysis command (which must result in a metric), and pushes the metric to the RunWhen Platform.
6 |
7 | ## TaskSet
8 | This TaskSet downloads the artifact from the latest GitHub Actions workflow run, executes the analysis command, and adds the details to the report. It can also generate Issues if:
9 | - a user specified string is found in the report output
10 | - the latest run didn't complete successfully
11 | - the latest run is older than the desired time period ($PERIOD_HOURS)
12 |
--------------------------------------------------------------------------------
/codebundles/gh-actions-artifact-analysis/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/gh-actions-health/.runwhen/generation-rules/gh-actions-health.yaml:
--------------------------------------------------------------------------------
1 | # We do not currently support github as a platform type, but this is the template for how we would do it if we did.
2 | # apiVersion: runwhen.com/v1
3 | # kind: GenerationRules
4 | # spec:
5 | # platform: github
6 | # generationRules:
7 | # - resourceTypes:
8 | # - github_organizations
9 | # - github_repository
10 | # matchRules:
11 | # - type: pattern
12 | # pattern: ".+"
13 | # properties: [name]
14 | # mode: substring
15 | # slxs:
16 | # - baseName: gh-actions-health
17 | # qualifiers: ["resource"]
18 | # baseTemplateName: gh-actions-health
19 | # levelOfDetail: basic
20 | # outputItems:
21 | # - type: slx
22 | # - type: sli
23 | # - type: runbook
24 | # templateName: gh-actions-health-taskset.yaml
--------------------------------------------------------------------------------
/codebundles/gh-actions-health/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
--------------------------------------------------------------------------------
/codebundles/gke-cluster-health/.runwhen/generation-rules/gke-cluster-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: gcp
5 | generationRules:
6 | - resourceTypes:
7 | - gcp_container_clusters
8 | matchRules:
9 | - type: pattern
10 | pattern: ".+"
11 | properties: [name]
12 | mode: substring
13 | slxs:
14 | - baseName: gke-cluster-health
15 | qualifiers: ["project"]
16 | baseTemplateName: gke-cluster-health
17 | levelOfDetail: basic
18 | outputItems:
19 | - type: slx
20 | - type: sli
21 | - type: runbook
22 | templateName: gke-cluster-health-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/gke-cluster-health/.runwhen/templates/gke-cluster-health-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | description: Runs tasks to validate GKE Cluster health
12 | codeBundle:
13 | {% if repo_url %}
14 | repoUrl: {{repo_url}}
15 | {% else %}
16 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
17 | {% endif %}
18 | {% if ref %}
19 | ref: {{ref}}
20 | {% else %}
21 | ref: main
22 | {% endif %}
23 | pathToRobot: codebundles/gke-cluster-health/runbook.robot
24 | configProvided:
25 | - name: GCP_PROJECT_ID
26 | value: {{match_resource.resource.project_id}}
27 | secretsProvided:
28 | - name: gcp_credentials_json
29 | workspaceKey: {{custom.gcp_ops_suite_sa}}
--------------------------------------------------------------------------------
/codebundles/gke-cluster-health/.runwhen/templates/gke-cluster-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "GKE Health Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.project_id}} GKE Health Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for GKE clusters in {{match_resource.resource.project_id}}
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.project_id}}-{{ "GKE Health Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/gke-cluster-health/.test/README.md:
--------------------------------------------------------------------------------
1 | export GCP_PROJECT_ID=""
2 | export RW_FROM_FILE='{"gcp_credentials_json":"/home/runwhen/codecollection/auth/svc.json"}'
3 | export CLOUDSDK_CORE_PROJECT=$GCP_PROJECT_ID
4 |
--------------------------------------------------------------------------------
/codebundles/jenkins-health/.runwhen/generation-rules/jenkins-instance-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: aws
5 | generationRules:
6 | - resourceTypes:
7 | - ec2_instance
8 | matchRules:
9 | - type: and
10 | matches:
11 | - type: pattern
12 | pattern: "jenkins-server"
13 | properties: [tag-values]
14 | mode: substring
15 | - type: pattern
16 | pattern: "running"
17 | properties: [state/Name]
18 | mode: substring
19 |
20 | slxs:
21 | - baseName: jenkins-instance-health
22 | levelOfDetail: detailed
23 | qualifiers: ["resource"]
24 | baseTemplateName: jenkins-instance-health
25 | outputItems:
26 | - type: slx
27 | - type: sli
28 | - type: runbook
29 | templateName: jenkins-instance-health-taskset.yaml
30 |
--------------------------------------------------------------------------------
/codebundles/jenkins-health/.runwhen/templates/jenkins-instance-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/Jenkins_logo.svg
11 | alias: {{match_resource.resource.tags.Name}} Instance Health
12 | # alias: Jenkins Health
13 | asMeasuredBy: The number of failed Jenkins builds.
14 | configProvided:
15 | - name: SLX_PLACEHOLDER
16 | value: SLX_PLACEHOLDER
17 | owners:
18 | - {{workspace.owner_email}}
19 | statement: The number of failed Jenkins builds should be zero.
20 | additionalContext: []
--------------------------------------------------------------------------------
/codebundles/jenkins-health/.runwhen/templates/jenkins-instance-health-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | codeBundle:
12 | {% if repo_url %}
13 | repoUrl: {{repo_url}}
14 | {% else %}
15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
16 | {% endif %}
17 | {% if ref %}
18 | ref: {{ref}}
19 | {% else %}
20 | ref: main
21 | {% endif %}
22 | pathToRobot: codebundles/jenkins-health/runbook.robot
23 | configProvided:
24 | - name: JENKINS_URL
25 | value: {{custom.jenkins_url}}
26 | secretsProvided:
27 | - name: JENKINS_USERNAME
28 | workspaceKey: {{custom.jenkins_username}}
29 | - name: JENKINS_TOKEN
30 | workspaceKey: {{custom.jenkins_token}}
31 |
--------------------------------------------------------------------------------
/codebundles/jenkins-health/.test/terraform/provider.tf:
--------------------------------------------------------------------------------
1 | provider "aws" {
2 | region = "us-west-2" # Replace with your desired region
3 | }
--------------------------------------------------------------------------------
/codebundles/jenkins-health/README.md:
--------------------------------------------------------------------------------
1 | # AWS Jenkins Health
2 |
3 | This CodeBundle monitors and evaluates the health of Jenkins using the Jenkins REST API
4 |
5 | ## SLI
6 | The SLI produces a score of 0 (bad), 1 (good), or a value in between. This score is generated by capturing the following:
7 | - Check if Jenkins instance is reachable and responding (endpoint)
8 | - Check For Failed Build Logs in Jenkins
9 | - Check For Long Running Builds in Jenkins
10 | - Check For Long Queued Builds in Jenkins
11 | - Check Jenkins Executor Utilization
12 |
13 | ## TaskSet
14 | Similar to the SLI, but produces a report on the specific Jenkins APIs and raises issues for each Jenkins check that requires attention.
15 |
16 | ## Required Configuration
17 |
18 | ```
19 | export JENKINS_URL=""
20 | export JENKINS_USERNAME=""
21 | export JENKINS_TOKEN=""
22 | ```
23 |
24 | ## Testing
25 | See the `.test` directory for infrastructure test code.
--------------------------------------------------------------------------------
/codebundles/k8s-app-troubleshoot/.runwhen/templates/k8s-app-troubleshoot-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/k8s-app-troubleshoot/.runwhen/templates/k8s-app-troubleshoot-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/gcp/cloud_load_balancing/cloud_load_balancing.svg
11 | alias: {{match_resource.resource.metadata.name}} Application Error Monitor
12 | asMeasuredBy: The number of errors and parsable exceptions in the application logs.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: The application should not be throwing exceptions.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/.runwhen/generation-rules/k8s-deployment-logs-health.yaml:
--------------------------------------------------------------------------------
1 | # apiVersion: runwhen.com/v1
2 | # kind: GenerationRules
3 | # spec:
4 | # generationRules:
5 | # - resourceTypes:
6 | # - deployment
7 | # matchRules:
8 | # - type: pattern
9 | # pattern: ".+"
10 | # properties: [name]
11 | # mode: substring
12 | # slxs:
13 | # - baseName: depl-logs-health
14 | # levelOfDetail: detailed
15 | # qualifiers: ["resource", "namespace", "cluster"]
16 | # baseTemplateName: k8s-deployment-logs-health
17 | # outputItems:
18 | # - type: slx
19 | # # - type: sli
20 | # - type: runbook
21 | # templateName: k8s-deployment-logs-health-taskset.yaml
22 | # # - type: workflow
23 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/.runwhen/generation-rules/k8s-statefulset-logs-health.yaml:
--------------------------------------------------------------------------------
1 | # apiVersion: runwhen.com/v1
2 | # kind: GenerationRules
3 | # spec:
4 | # generationRules:
5 | # - resourceTypes:
6 | # - statefulSet
7 | # matchRules:
8 | # - type: pattern
9 | # pattern: ".+"
10 | # properties: [name]
11 | # mode: substring
12 | # slxs:
13 | # - baseName: ss-logs-health
14 | # levelOfDetail: detailed
15 | # qualifiers: ["resource", "namespace", "cluster"]
16 | # baseTemplateName: k8s-ss-logs-health
17 | # outputItems:
18 | # - type: slx
19 | # # - type: sli
20 | # - type: runbook
21 | # templateName: k8s-ss-logs-health-taskset.yaml
22 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/.runwhen/templates/k8s-deployment-logs-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "Deployment Log Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.metadata.name}} Deployment Log SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.metadata.name}} deployment log health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.metadata.name}}-{{ "Deployment Log Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/.runwhen/templates/k8s-ss-logs-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg
11 | alias: {{match_resource.resource.metadata.name}} StatefulSet Log Health Check
12 | asMeasuredBy: Error logs, stack traces, connection failures, etc.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Pods for {{match_resource.resource.metadata.name}} StatefulSet should have error free logs.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/.runwhen/templates/k8s-statefulset-logs-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "StatefulSet Log Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.metadata.name}} StatefulSet Log SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.metadata.name}} statefulset log health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.metadata.name}}-{{ "StatefulSet Log Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/ignore_patterns.json:
--------------------------------------------------------------------------------
1 | {
2 | "patterns": [
3 | {
4 | "match": "connection closed before message completed",
5 | "category": "Ignore",
6 | "explanation": "Normal closures"
7 | },
8 | {
9 | "match": "server idle timeout",
10 | "category": "Ignore",
11 | "explanation": "Normal closures"
12 | }
13 | ]
14 | }
15 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_application_restarts.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_auth_failures.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_connection_failures.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CATEGORIES=${CATEGORIES} python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_error_logs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_null_pointer_exceptions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python3 scan_logs.py
3 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_resource_warnings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_service_dependency_failures.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-application-log-health/scan_timeout_errors.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python3 scan_logs.py
4 |
--------------------------------------------------------------------------------
/codebundles/k8s-argocd-application-health/.runwhen/generation-rules/k8s-argocd-application-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - applications.argoproj.io
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: argocd-app
14 | qualifiers: ["resource", "namespace", "cluster"]
15 | baseTemplateName: k8s-argocd-application-health
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-argocd-application-health-cli-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-argocd-application-health/.runwhen/templates/k8s-argocd-application-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/argo-icon-color.svg
11 | alias: {{match_resource.resource.metadata.name}} ArgoCD Application Health
12 | asMeasuredBy: The sync status of the ArgoCD application object.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Resources for {{match_resource.resource.metadata.namespace}} should be synced in a healthy state.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-argocd-application-health/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes ArgoCD Application Health
2 | This codebundle is used to help measure and troubleshoot the health of an ArgoCD managed application.
3 |
4 | ## TaskSet
5 | This taskset collects information and runs general troubleshooting checks against argocd application objects within a namespace.
6 |
7 | Example configuration for an application in which the ArgoCD Application object resides in the same namespace as the resources themselves:
8 | ```
9 | export DISTRIBUTION=Kubernetes
10 | export CONTEXT=cluster-1
11 | export APPLICATION=otel-demo
12 | export APPLICATION_TARGET_NAMESPACE=otel-demo
13 | export APPLICATION_APP_NAMESPACE=otel-demo
14 | export ERROR_PATTERN="Quota|Error|Exception"
15 | ```
16 |
17 | ## TODO
18 | - [ ] Try support for list of applications in conjunction with single application
19 | - [ ] Add documentation
20 | - [ ] Add issues
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-argocd-helm-health/.runwhen/generation-rules/k8s-argocd-helm-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - applications.argoproj.io
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: argocd-helm
14 | qualifiers: ["resource", "namespace", "cluster"]
15 | baseTemplateName: k8s-argocd-helm-health
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-argocd-helm-health-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-argocd-helm-health/.runwhen/templates/k8s-argocd-helm-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/argo-icon-color.svg
11 | alias: {{match_resource.resource.metadata.name}} ArgoCD Helm Health
12 | asMeasuredBy: The sync status of the ArgoCD Helm releases.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Helm releases for {{match_resource.resource.metadata.namespace}} should be synced and versioned aligned.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-argocd-helm-health/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes ArgoCD Helm Health
2 | This codebundle is used to help measure and troubleshoot the health of ArgoCD-managed Helm deployments.
3 |
4 | ## TaskSet
5 | This taskset collects information and runs general troubleshooting checks against ArgoCD Helm application objects within a namespace.
6 |
7 | Example configuration for an application in which the ArgoCD Application object resides in the same namespace as the resources themselves:
8 | ```
9 | export DISTRIBUTION=Kubernetes
10 | export CONTEXT=cluster-1
11 | export NAMESPACE=otel-demo
12 | export RESOURCE_NAME="applications.argoproj.io"
13 | ```
14 |
15 | ## TODO
16 | - [ ] Try support for list of applications in conjunction with single application
17 | - [ ] Add documentation
18 | - [ ] Add issues
19 |
--------------------------------------------------------------------------------
/codebundles/k8s-artifactory-health/.runwhen/generation-rules/k8s-artifactory.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - stateful_set
7 | matchRules:
8 | - type: pattern
9 | pattern: "artifactory"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: arti-health
14 | levelOfDetail: basic
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-artifactory-healthcheck
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-artifactory-healthcheck-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-artifactory-health/.runwhen/templates/k8s-artifactory-healthcheck-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/jfrog-icon.svg
11 | alias: {{namespace.name}} Artifactory Health
12 | asMeasuredBy: The availability reported by the artifactory http endpoints.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Artifactory services should be healthy and available.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-certmanager-healthcheck/.runwhen/generation-rules/k8s-certmanager-certificates-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - certificates.cert-manager.io
7 | matchRules:
8 | - type: pattern
9 | pattern: "."
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: cert-health
14 | qualifiers: ["namespace", "cluster"]
15 | baseTemplateName: k8s-certmanager-certificate-health
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | - type: slo
21 | - type: runbook
22 | templateName: k8s-certmanager-certificate-health-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/k8s-certmanager-healthcheck/.runwhen/templates/k8s-certmanager-certificate-health-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 0
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/k8s-certmanager-healthcheck/.runwhen/templates/k8s-certmanager-certificate-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/cert-manager.svg
11 | alias: {{namespace.name}} SSL Certificate Health
12 | asMeasuredBy: Certificates in an unready state
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: All certificates should be in a Ready state 99.5% of the time.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
26 |
--------------------------------------------------------------------------------
/codebundles/k8s-certmanager-healthcheck/.runwhen/templates/k8s-certmanager-certificate-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "SSL Certificate Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{match_resource.resource.metadata.namespace}} SSL Certificate Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{namespace.name}} SSL Certificate Health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{match_resource.resource.metadata.namespace}}-{{ "SSL Certificate Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-flux/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-namespace/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Namespace Chaos Engineering
2 |
3 | This codebundle provides chaos injection for kubernetes namespaces
4 |
5 | ## Tasks
6 |
7 | - `Test Namespace Highly Available`
8 | - `Test Node Drain`
9 | - `Mangle Service Selector`
10 | - `Mangle Service Port`
11 | - `Fill Pod Tmp`
12 |
13 | ## Configuration
14 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
15 |
16 | - `KUBECONFIG`: The kubeconfig secret containing access info for the cluster.
17 | - `CONTEXT`: The Kubernetes context to operate within.
18 | - `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.
19 |
20 |
21 | ## Requirements
22 | - A kubeconfig with appropriate RBAC permissions to perform the desired command.
23 |
24 | ## TODO
25 | - [ ] Add additional documentation.
26 |
27 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-namespace/auth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Writes the kubeconfig to /tmp/kubeconfig and exports KUBECONFIG to point at it.
4 | #
5 | # Environment Variables:
6 | #   CONTEXT    - context name handed to `kubectl config set-context`
7 | #   KUBECONFIG - path to a kubeconfig file, or the kubeconfig content itself
8 |
9 | # Check if required kubectl environment variables set
10 | if [[ -z $CONTEXT || -z $KUBECONFIG ]]; then
11 |     echo "Missing required environment variables for kubectl: CONTEXT, KUBECONFIG"
12 |     exit 1
13 | fi
14 |
15 | # Skip the copy when KUBECONFIG already is /tmp/kubeconfig (e.g. this script ran
16 | # before in the same shell): `cat file > file` would truncate it to zero bytes.
17 | if [[ $KUBECONFIG != "/tmp/kubeconfig" ]]; then
18 |     # The kubeconfig carries cluster credentials, so create it owner-only. The
19 |     # umask is set inside a subshell so it never leaks into a sourcing shell.
20 |     (
21 |         umask 077
22 |         if [[ -f $KUBECONFIG ]]; then
23 |             cat "$KUBECONFIG" > /tmp/kubeconfig
24 |         else
25 |             echo "$KUBECONFIG" > /tmp/kubeconfig
26 |         fi
27 |     )
28 |     # umask only affects newly created files; tighten a pre-existing one as well.
29 |     chmod 600 /tmp/kubeconfig
30 | fi
31 | export KUBECONFIG="/tmp/kubeconfig"
32 | # NOTE(review): `set-context` only creates/updates the named context entry; it
33 | # does not make it the current context (`use-context` does). Confirm the intent.
34 | kubectl config set-context "$CONTEXT" > /dev/null
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-namespace/delete_random_pods.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Deletes a random selection of pods (at most MAX_DELETIONS) in a namespace,
4 | # then prints the resulting pod list.
5 | #
6 | # Environment Variables
7 | #   NAMESPACE - namespace whose pods are candidates for deletion
8 | #   CONTEXT   - kubectl context to use
9 |
10 | # Fail early with a clear message instead of letting kubectl mis-parse its flags.
11 | if [[ -z $CONTEXT || -z $NAMESPACE ]]; then
12 |     echo "Missing required environment variables: CONTEXT, NAMESPACE"
13 |     exit 1
14 | fi
15 |
16 | MAX_DELETIONS=10
17 | # -oname prints one "pod/<name>" entry per line.
18 | POD_NAMES=$(kubectl get --context "$CONTEXT" pods -oname -n "$NAMESPACE")
19 | echo "Starting random pod deletions in namespace $NAMESPACE"
20 | deleted_count=0
21 | # Unquoted on purpose: word splitting turns the list into one item per pod.
22 | for pod_name in $POD_NAMES; do
23 |     # Roll a 50/50 chance
24 |     if (( RANDOM % 2 == 0 )); then
25 |         # Delete the pod
26 |         kubectl delete --context "$CONTEXT" "$pod_name" -n "$NAMESPACE"
27 |         echo "Waiting between deletions..."
28 |         sleep 3
29 |         # Increment the deleted count
30 |         ((deleted_count++))
31 |     fi
32 |     # Stop once MAX_DELETIONS pods have been deleted
33 |     if (( deleted_count >= MAX_DELETIONS )); then
34 |         break
35 |     fi
36 | done
37 |
38 | echo "Random deletions complete. Current Pod States:"
39 | kubectl get --context "$CONTEXT" pods -n "$NAMESPACE"
40 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-namespace/drain_node.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Drains one randomly selected node whose STATUS column is exactly "Ready".
4 | #
5 | # Environment Variables:
6 | #   CONTEXT - (optional) kubectl context to use; when unset, the kubeconfig's
7 | #             current context is used
8 |
9 | # Pass --context only when CONTEXT is set, matching the sibling chaos scripts.
10 | KUBECTL_ARGS=()
11 | if [[ -n $CONTEXT ]]; then
12 |     KUBECTL_ARGS+=(--context "$CONTEXT")
13 | fi
14 |
15 | # Check if the service account has the necessary permissions.
16 | # Draining cordons the node first, which patches the node object.
17 | if ! kubectl "${KUBECTL_ARGS[@]}" auth can-i patch nodes; then
18 |     echo "Insufficient permissions to make node changes."
19 |     exit 1
20 | fi
21 |
22 | # Match the STATUS column exactly: a plain `grep Ready` would also select
23 | # "NotReady" nodes and already-cordoned "Ready,SchedulingDisabled" nodes.
24 | mapfile -t NODES < <(kubectl "${KUBECTL_ARGS[@]}" get nodes --no-headers | awk '$2 == "Ready" {print $1}')
25 |
26 | # Must be checked before the modulo below, which would divide by zero.
27 | if (( ${#NODES[@]} == 0 )); then
28 |     echo "No suitable nodes found for draining."
29 |     exit 1
30 | fi
31 |
32 | # Get random node
33 | RANDOM_INDEX=$((RANDOM % ${#NODES[@]}))
34 | RANDOM_NODE=${NODES[$RANDOM_INDEX]}
35 |
36 | # Drain the node
37 | kubectl "${KUBECTL_ARGS[@]}" drain "$RANDOM_NODE" --ignore-daemonsets
38 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-namespace/expand_tmp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Environment Variables:
4 | # NAMESPACE
5 | # CONTEXT
6 |
7 | # Find a random pod in the given namespace
8 | pod=$(kubectl get --context $CONTEXT pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | shuf -n 1)
9 |
10 | echo "Expanding /tmp of pod $pod in namespace $NAMESPACE"
11 |
12 | # Exec into the pod and create a file at /tmp/chaos
13 | kubectl exec --context $CONTEXT -n "$NAMESPACE" "$pod" -- touch /tmp/chaos
14 |
15 | # Fill the file with random data until it consumes all space in the container
16 | kubectl exec --context $CONTEXT -n "$NAMESPACE" "$pod" -- sh -c "dd if=/dev/zero of=/tmp/chaos bs=1M count=1024"
17 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-namespace/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-workload/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Workload Chaos Engineering
2 |
3 | This codebundle provides chaos injection for a specific workload within a Kubernetes namespace.
4 |
5 | ## Configuration
6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
7 |
8 | - `KUBECONFIG`: The kubeconfig secret containing access info for the cluster.
9 | - `CONTEXT`: The Kubernetes context to operate within.
10 | - `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.
11 | - `WORKLOAD_NAME`: The specific workload to inject chaos experiments into. Eg: deployment/my-app
12 |
13 |
14 | ## Requirements
15 | - A kubeconfig with appropriate RBAC permissions to perform the desired command.
16 |
17 | ## TODO
18 | - [ ] Add additional documentation.
19 |
20 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-workload/auth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Writes the kubeconfig to /tmp/kubeconfig and exports KUBECONFIG to point at it.
4 | #
5 | # Environment Variables:
6 | #   CONTEXT    - context name handed to `kubectl config set-context`
7 | #   KUBECONFIG - path to a kubeconfig file, or the kubeconfig content itself
8 |
9 | # Check if required kubectl environment variables set
10 | if [[ -z $CONTEXT || -z $KUBECONFIG ]]; then
11 |     echo "Missing required environment variables for kubectl: CONTEXT, KUBECONFIG"
12 |     exit 1
13 | fi
14 |
15 | # Skip the copy when KUBECONFIG already is /tmp/kubeconfig (e.g. this script ran
16 | # before in the same shell): `cat file > file` would truncate it to zero bytes.
17 | if [[ $KUBECONFIG != "/tmp/kubeconfig" ]]; then
18 |     # The kubeconfig carries cluster credentials, so create it owner-only. The
19 |     # umask is set inside a subshell so it never leaks into a sourcing shell.
20 |     (
21 |         umask 077
22 |         if [[ -f $KUBECONFIG ]]; then
23 |             cat "$KUBECONFIG" > /tmp/kubeconfig
24 |         else
25 |             echo "$KUBECONFIG" > /tmp/kubeconfig
26 |         fi
27 |     )
28 |     # umask only affects newly created files; tighten a pre-existing one as well.
29 |     chmod 600 /tmp/kubeconfig
30 | fi
31 | export KUBECONFIG="/tmp/kubeconfig"
32 | # NOTE(review): `set-context` only creates/updates the named context entry; it
33 | # does not make it the current context (`use-context` does). Confirm the intent.
34 | kubectl config set-context "$CONTEXT" > /dev/null
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-workload/expand_tmp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Environment Variables:
4 | # NAMESPACE
5 | # CONTEXT
6 | # WORKLOAD_NAME
7 |
8 | selectors=$(kubectl get --context "$CONTEXT" -n "$NAMESPACE" "$WORKLOAD_NAME" -o jsonpath='{ .spec.selector.matchLabels }')
9 | selectors=$(echo $selectors | jq -r 'to_entries | map("\(.key)=\(.value)") | join(",")')
10 | echo "Fetching pods with label selector: $selectors"
11 | pods=$(kubectl get --context "$CONTEXT" pods -n "$NAMESPACE" -l "$selectors" -o jsonpath='{.items[*].metadata.name}')
12 |
13 | # Find a random pod in the given namespace
14 | pod=$(echo $pods | tr ' ' '\n' | shuf -n 1)
15 |
16 | echo "Expanding /tmp of pod $pod in namespace $NAMESPACE"
17 |
18 | # Exec into the pod and create a file at /tmp/chaos
19 | kubectl exec --context $CONTEXT -n "$NAMESPACE" "$pod" -- touch /tmp/chaos
20 |
21 | # Fill the file with random data until it consumes all space in the container
22 | kubectl exec --context $CONTEXT -n "$NAMESPACE" "$pod" -- sh -c "dd if=/dev/zero of=/tmp/chaos bs=1M count=1024"
23 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-workload/kill_workload_pod.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Deletes up to MAX_DELETIONS pods belonging to a workload, then prints the
4 | # namespace's pod list.
5 | #
6 | # Environment Variables
7 | #   NAMESPACE     - namespace of the workload
8 | #   CONTEXT       - kubectl context to use
9 | #   WORKLOAD_NAME - workload to target, e.g. deployment/my-app
10 |
11 | if [[ -z $CONTEXT || -z $WORKLOAD_NAME ]]; then
12 |     echo "Missing required environment variables: CONTEXT, WORKLOAD_NAME"
13 |     exit 1
14 | fi
15 |
16 | # Turn the workload's matchLabels map into a "k1=v1,k2=v2" label selector.
17 | selectors=$(kubectl get --context "$CONTEXT" -n "$NAMESPACE" "$WORKLOAD_NAME" -o jsonpath='{ .spec.selector.matchLabels }')
18 | selectors=$(echo "$selectors" | jq -r 'to_entries | map("\(.key)=\(.value)") | join(",")')
19 |
20 | # An empty selector passed to `-l` matches every pod in the namespace, which
21 | # would delete a pod that does not belong to the workload.
22 | if [[ -z $selectors ]]; then
23 |     echo "Could not determine a label selector for $WORKLOAD_NAME in namespace $NAMESPACE"
24 |     exit 1
25 | fi
26 |
27 | echo "Fetching pods with label selector: $selectors"
28 | pods=$(kubectl get --context "$CONTEXT" pods -n "$NAMESPACE" -l "$selectors" -o jsonpath='{.items[*].metadata.name}')
29 |
30 | MAX_DELETIONS=1
31 | echo "Killing a pod owned by $WORKLOAD_NAME in namespace $NAMESPACE"
32 | deleted_count=0
33 | # Unquoted on purpose: word splitting turns the list into one item per pod.
34 | for pod_name in $pods; do
35 |     # Delete the pod
36 |     kubectl delete --context "$CONTEXT" pod "$pod_name" -n "$NAMESPACE"
37 |     # Increment the deleted count
38 |     ((deleted_count++))
39 |     # Stop once MAX_DELETIONS pods have been deleted
40 |     if (( deleted_count >= MAX_DELETIONS )); then
41 |         break
42 |     fi
43 | done
44 |
45 | echo "Deletions complete. Current Pod States:"
46 | kubectl get --context "$CONTEXT" pods -n "$NAMESPACE"
47 |
--------------------------------------------------------------------------------
/codebundles/k8s-chaos-workload/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-node-health/.runwhen/generation-rules/k8s-cluster-node-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: kubernetes
5 | generationRules:
6 | - resourceTypes:
7 | - cluster
8 | matchRules:
9 | - type: and
10 | matches:
11 | - type: pattern
12 | pattern: ".+"
13 | properties: [name]
14 | mode: substring
15 | slxs:
16 | - baseName: cluster-node-health
17 | qualifiers: ["cluster"]
18 | baseTemplateName: k8s-cluster-node-health
19 | levelOfDetail: basic
20 | outputItems:
21 | - type: slx
22 | - type: sli
23 | - type: runbook
24 | templateName: k8s-cluster-node-health-taskset.yaml
25 | - type: workflow
26 |
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-node-health/.runwhen/templates/k8s-cluster-node-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/infrastructure_components/labeled/node.svg
11 | alias: {{ cluster.name }} Cluster Node Health
12 | asMeasuredBy: Node restarts, ready status, and other error or pressure conditions.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{cluster.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Cluster nodes for {{cluster.context}} should be ready and available 100% of the time.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-node-health/.runwhen/templates/k8s-cluster-node-health-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "Node Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{ cluster.name }} Node Health SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{ cluster.name }} cluster node health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{cluster.name}}-{{ "Node Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-node-health/README.md:
--------------------------------------------------------------------------------
1 | # K8s Cluster Node Health
2 |
3 | ## SLI
4 | The Service Level Indicator will generate a score for the health of the nodes in the cluster. This is an aggregate score from the tasks, which currently include:
5 | - Check for Node Restarts in Cluster
6 |
7 | ## TaskSet
8 | ### Check for Node Restarts in Cluster
9 | Create a report of all node starts/stops/preemptions/removals in the cluster. This will generate an informational issue, since node starts/stops may be routine, but users may want to be aware that they are happening if their pods are temporarily affected.
10 |
11 | ## Requirements
12 | - Service account with permissions to:
13 | - get nodes
14 | - list nodes
15 |
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-resource-health/.runwhen/generation-rules/k8s-cluster-resource-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | platform: kubernetes
5 | generationRules:
6 | - resourceTypes:
7 | - cluster
8 | matchRules:
9 | - type: and
10 | matches:
11 | - type: pattern
12 | pattern: ".+"
13 | properties: [name]
14 | mode: substring
15 | slxs:
16 | - baseName: cluster-resource
17 | qualifiers: ["cluster"]
18 | baseTemplateName: k8s-cluster-resource-health
19 | levelOfDetail: basic
20 | outputItems:
21 | - type: slx
22 | - type: runbook
23 | templateName: k8s-cluster-resource-health-taskset.yaml
24 |
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes-icon-color.svg
11 | alias: {{ cluster.name }} Cluster Resource Health
12 | asMeasuredBy: Node cpu and memory utilization.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{cluster.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Cluster resources for {{cluster.context}} should be less than 90% utilization.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-cluster-resource-health/README.md:
--------------------------------------------------------------------------------
1 | # K8s Cluster Resource Health
2 |
3 | ## SLI
4 | The Service Level Indicator will count the number of nodes that are over 90% active utilization according to `kubectl top nodes`.
5 |
6 | ## TaskSet
7 | ### Identify High Utilization Nodes for Cluster
8 | Create a report of all nodes that are above 90% utilization. Raise issues for each node that is in this state.
9 |
10 | ### Identify Pods Causing High Node Utilization in Cluster
11 | This task identifies overutilized nodes and creates a report of each pod that is using more than its defined request. Since requests are what a cluster autoscaler uses to make decisions, this list should be used to increase the pod requests so that autoscalers can make better scaling decisions.
12 |
13 | Raises an issue for each namespace
14 |
15 |
16 | ## Requirements
17 | - Service account with permissions to:
18 | - get nodes
19 | - list nodes
20 | - get/list nodes in api group "metrics.k8s.io"
--------------------------------------------------------------------------------
/codebundles/k8s-daemonset-healthcheck/.runwhen/generation-rules/k8s-daemonset-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - daemonset
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: ds-health
14 | levelOfDetail: detailed
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-daemonset-health
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-daemonset-health-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-daemonset-healthcheck/.runwhen/templates/k8s-daemonset-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/ds.svg
11 | alias: {{match_resource.resource.metadata.name}} DaemonSet Health
12 | asMeasuredBy: The Running state of desired pods.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: DaemonSet {{match_resource.resource.metadata.name}} should be in a healthy state.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-deployment-healthcheck/.runwhen/generation-rules/k8s-deployment-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - deployment
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: depl-health
14 | levelOfDetail: detailed
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-deployment-health
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-deployment-health-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-deployment-ops/.runwhen/generation-rules/k8s-deployment-ops.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - deployment
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: depl-ops
14 | levelOfDetail: detailed
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-deployment-ops
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-deployment-ops-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-deployment-ops/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-helm-health/.runwhen/generation-rules/k8s-flux-helm-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - helmreleases.helm.toolkit.fluxcd.io
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | slxs:
15 | - baseName: flux-helm
16 | qualifiers: ["namespace", "cluster"]
17 | baseTemplateName: k8s-flux-helm-health
18 | levelOfDetail: detailed
19 | outputItems:
20 | - type: slx
21 | - type: runbook
22 | templateName: k8s-flux-helm-health-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-helm-health/.runwhen/templates/k8s-flux-helm-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/helm-icon-color.svg
11 | alias: {{namespace.name}} Helm Release Health
12 | asMeasuredBy: The reconciliation status of the helm release object.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Helm releases for {{namespace.name}} should be reconciled in a good state.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 |
25 | - name: access
26 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-kustomization-health/.runwhen/generation-rules/k8s-flux-kustomization-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - kustomizations.kustomize.toolkit.fluxcd.io
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | slxs:
15 | - baseName: flux-kstmz
16 | qualifiers: ["namespace", "cluster"]
17 | baseTemplateName: k8s-flux-kustomize-health
18 | levelOfDetail: detailed
19 | outputItems:
20 | - type: slx
21 | - type: sli
22 | - type: runbook
23 | templateName: k8s-flux-kustomize-health-taskset.yaml
24 |
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-kustomization-health/.runwhen/templates/k8s-flux-kustomize-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/flux-icon-color.svg
11 | alias: {{namespace.name}} GitOps Flux Kustomization Health
12 | asMeasuredBy: The sync/ready status of the Flux Kustomization objects in namespace {{namespace.name}}.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Flux Kustomizations for {{namespace.name}} should be synced and ready.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 |
25 | - name: access
26 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-reconcile/.runwhen/generation-rules/k8s-fluxcd-reconcile.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - namespace
7 | matchRules:
8 | - type: pattern
9 | pattern: "flux-system"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: fluxcd-reconciliation
14 | levelOfDetail: basic
15 | qualifiers: ["namespace", "cluster"]
16 | baseTemplateName: k8s-fluxcd-reconcile
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | - type: slo
21 | - type: runbook
22 | templateName: k8s-fluxcd-reconcile-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-reconcile/.runwhen/templates/k8s-fluxcd-reconcile-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-reconcile/.runwhen/templates/k8s-fluxcd-reconcile-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/helm-icon-color.svg
11 | alias: FluxCD Reconciliation Health For {{cluster.name}}
12 | asMeasuredBy: The reconciliation loops for all of fluxcd in {{cluster.name}}
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: All FluxCD controllers should have no errors in their reconciliation loops
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-reconcile/.runwhen/templates/k8s-fluxcd-reconcile-taskset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Runbook
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | location: {{default_location}}
11 | codeBundle:
12 | {% if repo_url %}
13 | repoUrl: {{repo_url}}
14 | {% else %}
15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
16 | {% endif %}
17 | {% if ref %}
18 | ref: {{ref}}
19 | {% else %}
20 | ref: main
21 | {% endif %}
22 | pathToRobot: codebundles/k8s-fluxcd-reconcile/runbook.robot
23 | configProvided:
24 | - name: DISTRIBUTION
25 | value: {{custom.kubernetes_distribution}}
26 | - name: CONTEXT
27 | value: {{context}}
28 | secretsProvided:
29 | {% if wb_version %}
30 | {% include "kubernetes-auth.yaml" ignore missing %}
31 | {% else %}
32 | - name: kubeconfig
33 | workspaceKey: {{custom.kubeconfig_secret_name}}
34 | {% endif %}
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-reconcile/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes FluxCD Reconciliation Errors
2 | This codebundle measures the number of reconciliation errors in the fluxcd controllers and can generate a report of them.
3 |
4 | ## TaskSet
5 | This taskset generates a report containing a summary of logs for each controller and their error counts, ending with a total error count.
6 |
7 | Example configuration:
8 | ```
9 | CONTEXT=sandbox-cluster-1
10 | ```
11 |
12 | ## SLI
13 | The SLI can be used to monitor the overall health of the reconciliation loops for FluxCD and alert developers when a bad manifest has been provided.
14 |
15 | ## Requirements
16 | - A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query.
17 |
18 | ## TODO
19 | - Add additional rbac and kubectl resources and use cases
--------------------------------------------------------------------------------
/codebundles/k8s-fluxcd-reconcile/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-gitops-gh-remediate/.runwhen/templates/k8s-gitops-gh-remediate-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/github-mark.svg
11 | alias: {{match_resource.resource.metadata.name}} GitOps Configuration Remediations
12 | asMeasuredBy: ""
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Remediate resources in Namespace {{match_resource.resource.metadata.name}} managed in GitHub repositories.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-write
--------------------------------------------------------------------------------
/codebundles/k8s-gitops-gh-remediate/.test/README.md:
--------------------------------------------------------------------------------
1 | export RW_FROM_FILE='{"kubeconfig":"/home/runwhen/codecollection/auth/kubeconfig"}'
2 | export github_token=""
3 | export NAMESPACE="online-boutique"
4 | export CONTEXT='sandbox-cluster-1'
5 | export KUBERNETES_DISTRIBUTION_BINARY="kubectl"
--------------------------------------------------------------------------------
/codebundles/k8s-gitops-gh-remediate/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-image-check/.runwhen/generation-rules/k8s-image-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - namespace
7 | matchRules:
8 | - type: pattern
9 | pattern: "."
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: image-check
14 | levelOfDetail: detailed
15 | qualifiers: ["namespace", "cluster"]
16 | baseTemplateName: k8s-image-check
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-image-check-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-image-check/.runwhen/templates/k8s-image-check-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/container-registry.svg
11 | alias: {{namespace.name}} Image Check
12 | asMeasuredBy: Images & their tags running in the namespace for all containers in pods.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: List details about images running in the namespace.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-ingress-healthcheck/.runwhen/generation-rules/k8s-ingress-health .yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - ingress
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | slxs:
15 | - baseName: ingress-health
16 | qualifiers: ["namespace", "cluster"]
17 | baseTemplateName: k8s-ingress-healthcheck
18 | levelOfDetail: basic
19 | outputItems:
20 | - type: slx
21 | - type: runbook
22 | templateName: k8s-ingress-healthcheck-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/k8s-ingress-healthcheck/.runwhen/templates/k8s-ingress-healthcheck-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/ing.svg
11 | alias: {{namespace.name}} Ingress Health
12 | asMeasuredBy: Ingress objects with valid services and endpoints.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: All ingress objects should have services and endpoints backing them.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-ingress-healthcheck/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Ingress Healthcheck
2 | The `k8s-ingress-healthcheck` codebundle checks the health of ingress objects within a Namespace.
3 |
4 | ## Tasks
5 | `Fetch Ingress Object Health in Namespace` - This command will list every ingress object and determine whether it has a service and an endpoint. If so, it is considered healthy. It will print out the health result along with the error or the details regarding the service name and pod endpoint names and IPs.
6 |
7 | Example configuration:
8 | ```
9 | KUBERNETES_DISTRIBUTION_BINARY=kubectl
10 | CONTEXT=sandbox-cluster-1
11 | NAMESPACE=my-namespace
12 | ```
13 |
14 | ## Requirements
15 | - A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query.
16 |
17 |
18 | ## TODO
19 | - Add additional rbac and kubectl resources and use cases
20 | - Add additional troubleshooting tasks as use cases evolve
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.runwhen/generation-rules/k8s-istio-system-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - namespace
7 | matchRules:
8 | - type: pattern
9 | pattern: "istio-system"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: k8s-istio-system-health
14 | levelOfDetail: detailed
15 | qualifiers: ["cluster"]
16 | baseTemplateName: k8s-istio-system-health
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | - type: runbook
21 | templateName: k8s-istio-system-health-taskset.yaml
22 |
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.runwhen/templates/k8s-istio-system-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/Istio.svg
11 | alias: {{ cluster.name }} Istio System Health
12 | asMeasuredBy: ""
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Measures health of Istio system by checking istio proxy sidecar injection status, high memory and cpu usage, warnings and errors in logs, valid certificates, configuration and verify istio installation.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/README.md:
--------------------------------------------------------------------------------
1 | export AWS_ACCESS_KEY_ID=""
2 | export AWS_DEFAULT_LOCATION=""
3 | export AWS_SECRET_ACCESS_KEY=""
4 | export CONTEXT="istio-cluster"
5 | export RW_API_URL="papi.beta.runwhen.com"
6 | export RW_WORKSPACE=""
7 | export RW_PAT=""
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/terraform/bookinfo/fault-injection-details-v1.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: networking.istio.io/v1alpha3
2 | kind: VirtualService
3 | metadata:
4 | name: details
5 | spec:
6 | hosts:
7 | - details
8 | http:
9 | - fault:
10 | abort:
11 | httpStatus: 500
12 | percentage:
13 | value: 100
14 | route:
15 | - destination:
16 | host: details
17 | subset: v1
18 | - route:
19 | - destination:
20 | host: details
21 | subset: v1
22 | ---
23 | apiVersion: networking.istio.io/v1alpha3
24 | kind: DestinationRule
25 | metadata:
26 | name: details
27 | spec:
28 | host: details
29 | subsets:
30 | - name: v1
31 | labels:
32 | version: v1
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/terraform/faulty-gateway.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: networking.istio.io/v1beta1
2 | kind: Gateway
3 | metadata:
4 | name: faulty-gateway
5 | namespace: istio-system
6 | spec:
7 | selector:
8 | istio: ingressgateway
9 | servers:
10 | - port:
11 | number: 80
12 | name: http
13 | protocol: HTTP
14 | hosts:
15 | - "invalid-host.local" # Error: No VirtualService matches this host
16 |
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/terraform/kubeconfig-sa-token.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 | name: kubeconfig-sa-token
5 | namespace: kube-system
6 | annotations:
7 | kubernetes.io/service-account.name: kubeconfig-sa
8 | type: kubernetes.io/service-account-token
9 |
10 |
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/terraform/outputs.tf:
--------------------------------------------------------------------------------
1 | output "configure_kubectl" {
2 | description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
3 | value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
4 | }
5 |
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/terraform/variables.tf:
--------------------------------------------------------------------------------
1 | variable "vpc_cidr" {
2 | description = "The cidr for aws vpc"
3 | type = string
4 | default = "10.0.0.0/16"
5 | }
6 |
7 | variable "istio_version" {
8 | description = "Istio version"
9 | type = string
10 | default = "1.20.2"
11 | }
12 |
13 | variable "cluster_name" {
14 | description = "The name of the EKS cluster"
15 | type = string
16 | default = "istio-cluster"
17 | }
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/.test/terraform/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.3"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = ">= 5.34"
8 | }
9 | helm = {
10 | source = "hashicorp/helm"
11 | version = ">= 2.9"
12 | }
13 | kubernetes = {
14 | source = "hashicorp/kubernetes"
15 | version = ">= 2.20"
16 | }
17 | kubectl = {
18 | source = "gavinbunney/kubectl"
19 | version = ">= 1.14.0"
20 | }
21 | }
22 |
23 | # ## Used for end-to-end testing on project; update to suit your needs
24 | # backend "s3" {
25 | # bucket = "terraform-ssp-github-actions-state"
26 | # region = "us-west-2"
27 | # key = "e2e/istio/terraform.tfstate"
28 | # }
29 | }
30 |
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/controlplane_error_patterns.json:
--------------------------------------------------------------------------------
1 | {
2 | "warnings": [
3 | "upstream connect error or disconnect/reset before headers. reset reason: connection failure",
4 | "error:1408F10B:SSL routines:ssl3_get_record:wrong version number",
5 | "istio-proxy fails to start",
6 | "502 Bad Gateway",
7 | "istio-ingressgateway is running but has no listener on port"
8 | ],
9 | "errors": [
10 | "panic: runtime error",
11 | "Failed to create listener",
12 | "proxy exited with status",
13 | "Pilot push failed",
14 | "failed to reconcile state",
15 | "Error adding/updating listener"
16 | ]
17 | }
18 |
--------------------------------------------------------------------------------
/codebundles/k8s-istio-system-health/proxy_error_patterns.json:
--------------------------------------------------------------------------------
1 | {
2 | "warnings": [
3 | "JWT validation failed",
4 | "upstream connect error or disconnect/reset before headers",
5 | "TLS handshake error",
6 | "No healthy upstream",
7 | "Error adding/updating cluster",
8 | "Downstream connection terminated",
9 | "Stream removed with error",
10 | "connection terminated with error",
11 | "503 UH no_healthy_upstream",
12 | "503 UC upstream connect error",
13 | "504 DC downstream connection termination",
14 | "FI fault_filter_abort",
15 | "DNS resolution failed"
16 | ],
17 | "errors": [
18 | "Envoy proxy is NOT ready",
19 | "Unable to establish connection",
20 | "upstream connect error",
21 | "bad certificate",
22 | "remote error: tls",
23 | "no route configured",
24 | "Listener filter chain match failed",
25 | "Failed to bind listener"
26 | ]
27 | }
28 |
--------------------------------------------------------------------------------
/codebundles/k8s-jaeger-http-query/.runwhen/generation-rules/k8s-jaeger-http-query.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - service
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: "query"
12 | properties: [label-values]
13 | mode: substring
14 | - type: pattern
15 | pattern: "jaeger"
16 | properties: [label-values]
17 | mode: substring
18 | slxs:
19 | - baseName: jaeger-http
20 | levelOfDetail: detailed
21 | qualifiers: ["resource", "namespace", "cluster"]
22 | baseTemplateName: k8s-jaeger-http-query
23 | outputItems:
24 | - type: slx
25 | - type: runbook
26 | templateName: k8s-jaeger-http-query-taskset.yaml
27 |
--------------------------------------------------------------------------------
/codebundles/k8s-jaeger-http-query/.runwhen/templates/k8s-jaeger-http-query-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/jaeger_tracing.svg
11 | alias: Jaeger HTTP Query for Namespace {{match_resource.resource.metadata.namespace}}
12 | asMeasuredBy: None
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Jaeger instance {{match_resource.resource.metadata.name}} should report on HTTP traces related to ingested services.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-jenkins-healthcheck/.runwhen/generation-rules/k8s-jenkins-healthcheck.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - statefulset
7 | matchRules:
8 | - type: pattern
9 | pattern: "jenkins"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: jenkins-health
14 | levelOfDetail: detailed
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-jenkins-health
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-jenkins-health-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-loki-healthcheck/.runwhen/generation-rules/k8s-loki-healthcheck.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - statefulset
7 | matchRules:
8 | - type: pattern
9 | pattern: "loki"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: loki-hlthck
14 | qualifiers: ["resource", "namespace", "cluster"]
15 | baseTemplateName: k8s-loki-healthcheck
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-loki-healthcheck-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-loki-healthcheck/.runwhen/templates/k8s-loki-healthcheck-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/grafana-loki.svg
11 | alias: Loki Stack Health
12 | asMeasuredBy: The Loki stack is up, and healthy.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Loki's stack should be up, healthy with a up-to-date hash ring in the {{namespace.name}} namespace.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-loki-healthcheck/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Loki Healthcheck
2 |
3 | A set of tasks to query the state and health of a Loki deployment in Kubernetes.
4 |
5 | ## Tasks
6 | `Check Loki Ring API`
7 | `Check Loki API Ready`
8 |
9 | ## Configuration
10 |
11 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
12 |
13 | - `kubeconfig`: The kubeconfig secret containing access info for the cluster.
14 | - `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.
15 | - `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.
16 | - `CONTEXT`: The Kubernetes context to operate within.
17 | - `NAMESPACE`: The name of the namespace to search.
18 |
19 | ## Notes
20 |
21 | Please note that these checks require Kubernetes RBAC exec permissions for the service account used.
22 |
23 | ## TODO
24 | - [ ] Add documentation
25 | - [ ] Add more complex hash ring checks
26 | - [ ] Refine raised issues
--------------------------------------------------------------------------------
/codebundles/k8s-namespace-healthcheck/.runwhen/generation-rules/k8s-namespace-healthcheck.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - namespace
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: ns-health
14 | levelOfDetail: basic
15 | qualifiers: ["namespace", "cluster"]
16 | baseTemplateName: k8s-namespace-healthcheck
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | # - type: slo
21 | - type: runbook
22 | templateName: k8s-namespace-healthcheck-taskset.yaml
23 | - type: workflow
24 |
--------------------------------------------------------------------------------
/codebundles/k8s-namespace-healthcheck/.runwhen/templates/k8s-namespace-healthcheck-slo.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelObjective
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | codeBundle:
11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git
12 | pathToYaml: codebundles/slo-default/queries.yaml
13 | ref: main
14 | sloSpecType: simple-mwmb
15 | objective: 99
16 | threshold: 1
17 | operand: eq
--------------------------------------------------------------------------------
/codebundles/k8s-namespace-healthcheck/.runwhen/templates/k8s-namespace-healthcheck-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/ns.svg
11 | alias: {{namespace.name}} Namespace Health
12 | asMeasuredBy: Aggregate score based on Kubernetes API Server queries
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Overall health for {{namespace.name}} should be 1, 99% of the time.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-namespace-healthcheck/.runwhen/templates/k8s-namespace-healthcheck-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{ "Namespace Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{namespace.name}} Namespace SLI Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{namespace.name}} namespace health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{namespace.name}}-{{ "Namespace Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/k8s-otelcollector/.runwhen/generation-rules/k8s-otelcollector.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - deployment
7 | - daemonset
8 | - statefulset
9 | matchRules:
10 | - type: and
11 | matches:
12 | - type: pattern
13 | pattern: "opentelemetry-collector"
14 | properties: [label-values]
15 | mode: substring
16 | - type: pattern
17 | pattern: "col"
18 | properties: [name]
19 | mode: substring
20 | slxs:
21 | - baseName: k8s-otelcollector
22 | levelOfDetail: detailed
23 | qualifiers: ["resource", "namespace", "cluster"]
24 | baseTemplateName: k8s-otelcollector
25 | outputItems:
26 | - type: slx
27 | - type: runbook
28 | templateName: k8s-otelcollector-taskset.yaml
29 |
--------------------------------------------------------------------------------
/codebundles/k8s-otelcollector/.runwhen/templates/k8s-otelcollector-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/jaeger_tracing.svg
11 | alias: OTEL Collector Health for Namespace {{match_resource.resource.metadata.namespace}}
12 | asMeasuredBy: None
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: OTEL Collector {{match_resource.resource.metadata.name}} should not have large queues or error logs.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
26 |
--------------------------------------------------------------------------------
/codebundles/k8s-otelcollector/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/codebundles/k8s-otelcollector/otel_dropped_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Searches the last hour of logs from the collector's service for the word
4 | # "dropped". When matches exist, prints them (each with 20 lines of trailing
5 | # context) and exits 1; otherwise exits 0.
6 | #
7 | # ENV:
8 | # CONTEXT           kubectl context to run against
9 | # NAMESPACE         namespace containing the service
10 | # METRICS_PORT      (not referenced by this script)
11 | # WORKLOAD_NAME     (not referenced by this script)
12 | # WORKLOAD_SERVICE  service whose logs are inspected
13 | since=60m
14 | # Every expansion is quoted so a value containing whitespace or glob characters
15 | # reaches kubectl as a single argument instead of being split or expanded.
16 | output=$(kubectl --context "$CONTEXT" -n "$NAMESPACE" logs "service/$WORKLOAD_SERVICE" --since="$since" --all-containers=true | grep dropped -A 20)
17 | if [ -n "$output" ]; then
18 |     # echo -E keeps backslashes in the log text literal.
19 |     echo -E "Dropped Spans Found:"
20 |     echo -E "$output"
21 |     exit 1
22 | fi
23 | exit 0
--------------------------------------------------------------------------------
/codebundles/k8s-otelcollector/otel_error_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Searches the last hour of logs from the collector's service for the word
4 | # "error". When matches exist, prints them and exits 1; otherwise exits 0.
5 | #
6 | # ENV:
7 | # CONTEXT           kubectl context to run against
8 | # NAMESPACE         namespace containing the service
9 | # METRICS_PORT      (not referenced by this script)
10 | # WORKLOAD_NAME     (not referenced by this script)
11 | # WORKLOAD_SERVICE  service whose logs are inspected
12 | since=60m
13 | # Every expansion is quoted so a value containing whitespace or glob characters
14 | # reaches kubectl as a single argument instead of being split or expanded.
15 | output=$(kubectl --context "$CONTEXT" -n "$NAMESPACE" logs "service/$WORKLOAD_SERVICE" --since="$since" --all-containers=true | grep error)
16 | if [ -n "$output" ]; then
17 |     # echo -E keeps backslashes in the log text literal.
18 |     echo -E "Error(s) Found:"
19 |     echo -E "$output"
20 |     exit 1
21 | fi
22 | exit 0
--------------------------------------------------------------------------------
/codebundles/k8s-otelcollector/otel_metrics_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Fetches the collector's /metrics endpoint by running curl inside the workload,
4 | # prints every otelcol_exporter_queue_size series, and reports each series whose
5 | # value is greater than THRESHOLD. Exits 1 if any series exceeds the threshold,
6 | # 0 otherwise.
7 | #
8 | # ENV:
9 | # CONTEXT           kubectl context to run against
10 | # NAMESPACE         namespace containing the workload
11 | # METRICS_PORT      port serving /metrics on WORKLOAD_SERVICE
12 | # WORKLOAD_NAME     target passed to `kubectl exec`
13 | # WORKLOAD_SERVICE  host name curl connects to
14 |
15 | THRESHOLD=500
16 | rv=0
17 | # Expansions are quoted so values with whitespace or glob characters are passed
18 | # through as single arguments.
19 | metrics=$(kubectl --context "$CONTEXT" -n "$NAMESPACE" exec "$WORKLOAD_NAME" -- curl "$WORKLOAD_SERVICE:$METRICS_PORT/metrics")
20 | queued_spans=$(echo -E "$metrics" | grep "otelcol_exporter_queue_size{")
21 | while IFS= read -r line; do
22 |     # The here-string yields one empty line when nothing matched; skip it rather
23 |     # than feeding an empty value into the comparison below.
24 |     [ -z "$line" ] && continue
25 |     echo "$line"
26 |     value=$(echo "$line" | awk '{print $2}')
27 |     # Compare in awk instead of `[ -gt ]`: the shell test only accepts plain
28 |     # integers, while the Prometheus text format may render a sample as a float
29 |     # or in exponent notation (e.g. 1e+06), which would otherwise raise
30 |     # "integer expression expected" and never be flagged.
31 |     if awk -v value="$value" -v threshold="$THRESHOLD" 'BEGIN { exit !(value + 0 > threshold + 0) }'; then
32 |         echo "Error: queued spans ($value) exceeds threshold ($THRESHOLD)"
33 |         rv=1
34 |     fi
35 | done <<< "$queued_spans"
36 | exit $rv
--------------------------------------------------------------------------------
/codebundles/k8s-podresources-health/.runwhen/generation-rules/k8s-pod-resources.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - namespace
7 | matchRules:
8 | - type: pattern
9 | pattern: "."
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: pod-resources
14 | levelOfDetail: detailed
15 | qualifiers: ["namespace", "cluster"]
16 | baseTemplateName: k8s-pod-resources
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-pod-resources-taskset.yaml
--------------------------------------------------------------------------------
/codebundles/k8s-podresources-health/.runwhen/templates/k8s-pod-resources-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/pod.svg
11 | alias: {{namespace.name}} Pod Resources
12 | asMeasuredBy: Kubectl get and Kubectl Top
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Pods should have resources configured, and resource usage should not be exceeded.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-postgres-healthcheck/.runwhen/generation-rules/k8s-postgres-healthcheck-crunchy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - postgresclusters.postgres-operator.crunchydata.com
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: postgres-health
14 | qualifiers: ["resource", "namespace", "cluster"]
15 | baseTemplateName: k8s-postgres-healthcheck-crunchy
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | - type: runbook
21 | templateName: k8s-postgres-healthcheck-crunchy-taskset.yaml
22 |
--------------------------------------------------------------------------------
/codebundles/k8s-postgres-healthcheck/.runwhen/generation-rules/k8s-postgres-healthcheck-zalando.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - postgresqls.acid.zalan.do
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: postgres-health
14 | qualifiers: ["resource", "namespace", "cluster"]
15 | baseTemplateName: k8s-postgres-healthcheck-zalando
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | - type: runbook
21 | templateName: k8s-postgres-healthcheck-zalando-taskset.yaml
22 |
--------------------------------------------------------------------------------
/codebundles/k8s-postgres-healthcheck/.runwhen/templates/k8s-postgres-healthcheck-crunchy-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/CrunchyDataPrimaryIcon.png
11 | alias: {{match_resource.resource.metadata.name}} Postgres Health
12 | asMeasuredBy: Database is up and accepting connections.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Database should be available and accept connections 99.5% of the time.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
26 |
--------------------------------------------------------------------------------
/codebundles/k8s-postgres-healthcheck/.runwhen/templates/k8s-postgres-healthcheck-zalando-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/postgresql.svg
11 | alias: {{match_resource.resource.metadata.name}} Postgres Health
12 | asMeasuredBy: Database is up and accepting connections.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Database should be available and accept connections 99.5% of the time.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
26 |
--------------------------------------------------------------------------------
/codebundles/k8s-prometheus-healthcheck/.runwhen/generation-rules/k8s-prometheus-healthcheck.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - prometheuses.monitoring.coreos.com
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: kubeprom-hlthck
14 | qualifiers: ["resource", "namespace", "cluster"]
15 | baseTemplateName: k8s-prometheus-healthcheck
16 | levelOfDetail: detailed
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-prometheus-healthcheck-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-prometheus-healthcheck/.runwhen/templates/k8s-prometheus-healthcheck-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/prometheus_color.svg
11 | alias: Kubeprometheus Operator Health
12 | asMeasuredBy: The Kubeprometheus operator is healthy and its ServiceMonitors are functional.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: The Kubeprometheus operator should be healthy in the {{namespace.name}} namespace and its ServiceMonitors should be functional.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-pvc-healthcheck/.runwhen/generation-rules/k8s-pvc-healthcheck.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - persistentvolumeclaim
7 | matchRules:
8 | - type: pattern
9 | pattern: "."
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: pvc-health
14 | levelOfDetail: detailed
15 | qualifiers: ["namespace", "cluster"]
16 | baseTemplateName: k8s-pvc-healthcheck
17 | outputItems:
18 | - type: slx
19 | - type: sli
20 | - type: runbook
21 | templateName: k8s-pvc-healthcheck-taskset.yaml
22 | - type: workflow
--------------------------------------------------------------------------------
/codebundles/k8s-pvc-healthcheck/.runwhen/templates/k8s-pvc-healthcheck-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/pvc.svg
11 | alias: {{namespace.name}} PVC Healthcheck
12 | asMeasuredBy: Aggregate score based on unattached PVCs or PVCs with errors.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: PVCs should be bound and healthy.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-pvc-healthcheck/.runwhen/templates/k8s-pvc-healthcheck-workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: Workflow
3 | metadata:
4 | name: {{slx_name}}-{{namespace.name}}-{{ "PVC Alert Workflow" | replace(" ", "-") | lower }}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | fromActivities:
11 | - displayName: {{namespace.name}} Namespace PVC Alert Workflow
12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{namespace.name}} PVC health
13 | actions:
14 | - tasks:
15 | slx: {{slx_name.split('--')[1]}}
16 | persona: eager-edgar
17 | titles:
18 | - '*'
19 | sessionTTL: 20m
20 | match:
21 | activityVerbs:
22 | - SLI_ALERTS_STARTED
23 | slxs:
24 | - {{slx_name.split('--')[1]}}
25 | name: {{namespace.name}}-{{ "PVC Alert Workflow" | replace(" ", "-") | lower }}
--------------------------------------------------------------------------------
/codebundles/k8s-redis-healthcheck/.runwhen/generation-rules/k8s-redis-healthcheck.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - deployment
7 | matchRules:
8 | - type: pattern
9 | pattern: "redis"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: redis-health
14 | levelOfDetail: detailed
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-redis-health
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-redis-health-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-redis-healthcheck/.runwhen/templates/k8s-redis-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/redis-logo.svg
11 | alias: {{match_resource.resource.metadata.name}} Health
12 | asMeasuredBy: The ability to ping, read and write keys to the Redis service.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Redis endpoints should be responsive and in a healthy state.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-serviceaccount-check/.runwhen/generation-rules/k8s-serviceaccount-check.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - namespace
7 | matchRules:
8 | - type: and
9 | matches:
10 | - type: pattern
11 | pattern: ".+"
12 | properties: [name]
13 | mode: substring
14 | slxs:
15 | - baseName: sa-check
16 | qualifiers: ["namespace", "cluster"]
17 | baseTemplateName: k8s-serviceaccount-check
18 | levelOfDetail: detailed
19 | outputItems:
20 | - type: slx
21 | - type: runbook
22 | templateName: k8s-serviceaccount-check-taskset.yaml
23 |
--------------------------------------------------------------------------------
/codebundles/k8s-serviceaccount-check/.runwhen/templates/k8s-serviceaccount-check-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | labels:
5 | slx: {{slx_name}}
6 | workspace: {{workspace.short_name}}
7 | name: {{slx_name}}
8 | annotations:
9 | {% include "common-annotations.yaml" %}
10 | spec:
11 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/sa.svg
12 | alias: {{namespace.name}} Service Account Check
13 | asMeasuredBy: A temporary curl pod using a namespaced service account to interact with the API server.
14 | configProvided:
15 | - name: OBJECT_NAME
16 | value: {{match_resource.resource.metadata.name}}
17 | owners:
18 | - {{workspace.owner_email}}
19 | statement: Pods should be able to contact the Kubernetes API server.
20 | additionalContext:
21 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
22 | qualified_name: "{{ match_resource.qualified_name }}"
23 | tags:
24 | {% include "kubernetes-tags.yaml" ignore missing %}
25 | - name: access
26 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-statefulset-healthcheck/.runwhen/generation-rules/k8s-statefulset-health.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: GenerationRules
3 | spec:
4 | generationRules:
5 | - resourceTypes:
6 | - statefulset
7 | matchRules:
8 | - type: pattern
9 | pattern: ".+"
10 | properties: [name]
11 | mode: substring
12 | slxs:
13 | - baseName: ss-health
14 | levelOfDetail: detailed
15 | qualifiers: ["resource", "namespace", "cluster"]
16 | baseTemplateName: k8s-statefulset-health
17 | outputItems:
18 | - type: slx
19 | - type: runbook
20 | templateName: k8s-statefulset-health-taskset.yaml
21 |
--------------------------------------------------------------------------------
/codebundles/k8s-statefulset-healthcheck/.runwhen/templates/k8s-statefulset-health-slx.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: runwhen.com/v1
2 | kind: ServiceLevelX
3 | metadata:
4 | name: {{slx_name}}
5 | labels:
6 | {% include "common-labels.yaml" %}
7 | annotations:
8 | {% include "common-annotations.yaml" %}
9 | spec:
10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/sts.svg
11 | alias: {{match_resource.resource.metadata.name}} StatefulSet Health
12 | asMeasuredBy: The Running state of desired pods.
13 | configProvided:
14 | - name: OBJECT_NAME
15 | value: {{match_resource.resource.metadata.name}}
16 | owners:
17 | - {{workspace.owner_email}}
18 | statement: Statefulset {{match_resource.resource.metadata.name}} should be in a healthy state.
19 | additionalContext:
20 | {% include "kubernetes-hierarchy.yaml" ignore missing %}
21 | qualified_name: "{{ match_resource.qualified_name }}"
22 | tags:
23 | {% include "kubernetes-tags.yaml" ignore missing %}
24 | - name: access
25 | value: read-only
--------------------------------------------------------------------------------
/codebundles/k8s-tail-logs-dynamic/meta.yaml:
--------------------------------------------------------------------------------
1 | commands: []
2 |
--------------------------------------------------------------------------------
/libraries/.docs/Suggest.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.NextSteps.Suggest
4 |
5 | Utility library for suggesting next steps based on a static troubleshooting yaml database
6 |
7 | See https://github.com/seatgeek/thefuzz
8 |
9 | Scope: Global
10 |
11 |
--------------------------------------------------------------------------------
/libraries/.docs/_test_parsers.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.K8sApplications.\_test\_parsers
4 |
5 |
--------------------------------------------------------------------------------
/libraries/.docs/k8s_helper.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.K8sHelper.k8s\_helper
4 |
5 |
6 |
7 | #### get\_related\_resource\_recommendations
8 |
9 | ```python
10 | def get_related_resource_recommendations(k8s_object)
11 | ```
12 |
13 | Parse a Kubernetes object JSON for specific annotations or labels and return recommendations.
14 |
15 | **Arguments**:
16 |
17 | - `k8s_object` _dict_ - The Kubernetes object JSON.
18 |
19 |
20 | **Returns**:
21 |
22 | - `str` - Recommendations based on the object's annotations or labels.
23 |
24 |
25 |
26 | #### sanitize\_messages
27 |
28 | ```python
29 | def sanitize_messages(input_string)
30 | ```
31 |
32 | Sanitize the message string by replacing characters that can't be processed into json issue details.
33 |
34 | **Arguments**:
35 |
36 | - input_string: The string to be sanitized.
37 |
38 |
39 | **Returns**:
40 |
41 | - The sanitized string.
42 |
43 |
--------------------------------------------------------------------------------
/libraries/.docs/local_process.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.CLI.local\_process
4 |
5 | TODO: should be incorporated into platform behaviour
6 | Acts as interoperable layer between ShellRequest/Response and local processes - hacky
7 |
8 |
--------------------------------------------------------------------------------
/libraries/.docs/migrations_inspector.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.K8sApplications.migrations\_inspector
4 |
5 |
--------------------------------------------------------------------------------
/libraries/.docs/parsers.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.K8sApplications.parsers
4 |
5 |
6 |
7 | ## StackTraceData Objects
8 |
9 | ```python
10 | @dataclass
11 | class StackTraceData()
12 | ```
13 |
14 |
15 |
16 | #### line\_nums
17 |
18 | line numbers associated with exceptions per file
19 |
20 |
21 |
22 | ## BaseStackTraceParse Objects
23 |
24 | ```python
25 | class BaseStackTraceParse()
26 | ```
27 |
28 | Base class for stacktrace parsing functions.
29 | Should be stateless so it can be used as a utility class.
30 |
31 | Note that the default behavior assumes python stack traces, and inheritors can override for other languages.
32 |
33 |
--------------------------------------------------------------------------------
/libraries/.docs/postgres_helper.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.CLI.postgres\_helper
4 |
5 |
--------------------------------------------------------------------------------
/libraries/.docs/repository.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # libraries.RW.K8sApplications.repository
4 |
5 |
--------------------------------------------------------------------------------
/libraries/Jenkins/__init__.py:
--------------------------------------------------------------------------------
1 | from .jenkins import *
2 |
--------------------------------------------------------------------------------
/libraries/RW/CLI/__init__.py:
--------------------------------------------------------------------------------
1 | from .CLI import *
2 | from .postgres_helper import k8s_postgres_query, get_password, get_user
3 |
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/__init__.py:
--------------------------------------------------------------------------------
1 | from .k8s_applications import *
2 |
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/migrations_inspector.py:
--------------------------------------------------------------------------------
1 | # determines migration library/tool and then fetches migration table info
2 |
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/no_stacktraces_report.jinja2:
--------------------------------------------------------------------------------
1 | # Stack Trace Report
2 |
3 | Report Created At: {{ data.timestamp }}
4 |
5 | ## Summary
6 | **Total Stack Traces:** {{ data.stacktraces|length }}
7 |
8 |
9 | **No stacktraces were found!**
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pytest --log-cli-level=DEBUG _test_parsers.py
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/test_data/golang.log:
--------------------------------------------------------------------------------
1 | rpc error: code = Unavailable desc = connection error: desc = "transport: Error while dialing dial tcp 10.107.253.212:7070: connect: connection refused"
2 | could not retrieve cart
3 | main.(*frontendServer).homeHandler
4 | /src/handlers.go:69
5 | net/http.HandlerFunc.ServeHTTP
6 | /usr/local/go/src/net/http/server.go:2109
7 | github.com/gorilla/mux.(*Router).ServeHTTP
8 | /go/pkg/mod/github.com/gorilla/mux@v1.8.0/mux.go:210
9 | main.(*logHandler).ServeHTTP
10 | /src/middleware.go:82
11 | main.ensureSessionID.func1
12 | /src/middleware.go:109
13 | net/http.HandlerFunc.ServeHTTP
14 | /usr/local/go/src/net/http/server.go:2109
15 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp.(*Handler).ServeHTTP
16 | /go/pkg/mod/go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp@v0.37.0/handler.go:210
17 | net/http.serverHandler.ServeHTTP
18 | /usr/local/go/src/net/http/server.go:2947
19 | net/http.(*conn).serve
20 | /usr/local/go/src/net/http/server.go:1991
21 | runtime.goexit
22 | /usr/local/go/src/runtime/asm_amd64.s:1594
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/test_data/java.log:
--------------------------------------------------------------------------------
1 | Exception in thread "main" java.lang.IndexOutOfBoundsException: Index 5 out of bounds for length 3
2 | at java.util.ArrayList.rangeCheck(ArrayList.java:659)
3 | at java.util.ArrayList.get(ArrayList.java:435)
4 | at MainKt.main(Main.kt:6)
--------------------------------------------------------------------------------
/libraries/RW/K8sApplications/test_data/node.log:
--------------------------------------------------------------------------------
1 | Error: ENOENT: no such file or directory, open 'missing-file.txt'
2 | at Object.openSync (fs.js:462:3)
3 | at Object.readFileSync (fs.js:364:35)
4 | at handleRequest (/server.js:15:18)
5 | at Layer.handle [as handle_request] (/node_modules/express/lib/router/layer.js:95:5)
6 | at next (/node_modules/express/lib/router/route.js:144:13)
7 | at Route.dispatch (/node_modules/express/lib/router/route.js:114:3)
8 | at Layer.handle [as handle_request] (/node_modules/express/lib/router/layer.js:95:5)
9 | at /node_modules/express/lib/router/index.js:284:15
10 | at Function.process_params (/node_modules/express/lib/router/index.js:346:12)
11 | at next (/node_modules/express/lib/router/index.js:280:10)
--------------------------------------------------------------------------------
/libraries/RW/K8sHelper/__init__.py:
--------------------------------------------------------------------------------
1 | from .k8s_helper import *
2 |
--------------------------------------------------------------------------------
/libraries/RW/K8sLog/__init__.py:
--------------------------------------------------------------------------------
1 | from .k8s_log import K8sLog
2 |
3 | __version__ = "1.0.0"
--------------------------------------------------------------------------------
/libraries/RW/NextSteps/__init__.py:
--------------------------------------------------------------------------------
1 | from .Suggest import *
2 |
--------------------------------------------------------------------------------
/libraries/RW/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This line is required so that we can have RW.Core in one directory
3 | and the other RW libs in other directories
4 | See - https://packaging.python.org/en/latest/guides/packaging-namespace-packages/
5 | """
6 | __path__ = __import__("pkgutil").extend_path(__path__, __name__)
--------------------------------------------------------------------------------
/libraries/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This line is required so that we can have RW.Core in one directory
3 | and the other RW libs in other directories
4 | See - https://packaging.python.org/en/latest/guides/packaging-namespace-packages/
5 | """
6 | __path__ = __import__("pkgutil").extend_path(__path__, __name__)
7 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=61.2",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
8 | [project]
9 | name = "rw-cli-keywords"
10 | description = "A set of RunWhen published CLI keywords and python libraries for interacting with APIs using CLIs"
11 | readme = "README.md"
12 | license = { text = "Apache License 2.0" }
13 | authors = [
14 | { name = "RunWhen", email = "info@runwhen.com" }
15 | ]
16 | classifiers = [
17 | "Programming Language :: Python :: 3",
18 | "License :: OSI Approved :: Apache Software License"
19 | ]
20 | # We declare these fields as dynamic because they come from external files
21 | dynamic = ["version", "dependencies"]
22 |
23 | [tool.setuptools.packages.find]
24 | where = ["libraries"]
25 |
26 | # Dynamically read the version from the VERSION file
27 | # and the dependencies from requirements.txt.
28 | [tool.setuptools.dynamic]
29 | version = { file = "VERSION" }
30 | dependencies = { file = "requirements.txt" }
31 |
32 | [project.urls]
33 | homepage = "https://github.com/runwhen-contrib/rw-cli-codecollection"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | robotframework>=4.1.2
2 | jmespath>=1.0.1
3 | python-dateutil>=2.9.0
4 | requests>=2.31.0
5 | thefuzz>=0.20.0
6 | pyyaml>=6.0.1
7 | jinja2>=3.1.4
8 | tabulate>=0.9.0
--------------------------------------------------------------------------------