├── .devcontainer.json ├── .gitbook.yaml ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ └── codebundle.md ├── config.yaml ├── queries │ ├── addDiscussionComment.graphql │ ├── createDiscussion.graphql │ ├── deleteDiscussion.graphql │ ├── getComments.graphql │ ├── getDiscussion.graphql │ └── searchDiscussions.graphql ├── scripts │ ├── index-config.yaml │ ├── index.py │ ├── meta.py │ ├── pydoc2md.sh │ ├── reference_scores.json │ ├── semver-it.sh │ ├── task_analysis.json │ └── update_titles.py └── workflows │ ├── build-push.yaml │ ├── pypi.yaml │ ├── release.yaml │ ├── score_manual.yaml │ ├── score_pr.yaml │ └── semver.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── Introduction.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SUMMARY.md ├── VERSION ├── codebundles ├── aws-cloudwatch-overused-ec2 │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── aws-eks-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── aws-eks-health.yaml │ │ └── templates │ │ │ ├── aws-eks-health-sli.yaml │ │ │ ├── aws-eks-health-slx.yaml │ │ │ └── aws-eks-health-taskset.yaml │ ├── README.md │ ├── auth.sh │ ├── check_eks_cluster_health.sh │ ├── check_eks_fargate_cluster_health_status.sh │ ├── list_eks_fargate_metrics.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── aws-eks-node-reboot │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── aws-elasticache-redis-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── aws-elasticache-redis-health.yaml │ │ └── templates │ │ │ ├── aws-elasticache-redis-health-sli.yaml │ │ │ ├── aws-elasticache-redis-health-slx.yaml │ │ │ └── aws-elasticache-redis-health-taskset.yaml │ ├── README.md │ ├── analyze_aws_elasticache_redis_metrics.sh │ ├── auth.sh │ ├── meta.yaml │ ├── monitor_redis_performance.sh │ ├── redis_status_scan.sh │ ├── runbook.robot │ ├── sli.robot │ └── validate_aws_elasticache_redis_config.py ├── aws-lambda-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── aws-lambda-health.yaml │ │ └── 
templates │ │ │ ├── aws-lambda-health-sli.yaml │ │ │ ├── aws-lambda-health-slx.yaml │ │ │ └── aws-lambda-health-taskset.yaml │ ├── README.md │ ├── analyze_lambda_invocation_errors.sh │ ├── auth.sh │ ├── list_lambda_runtimes.sh │ ├── meta.yaml │ ├── monitor_aws_lambda_performance_metrics.sh │ ├── runbook.robot │ └── sli.robot ├── aws-s3-bucket-storage-report │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── aws-s3-bucket-storage-report.yaml │ │ └── templates │ │ │ ├── aws-s3-bucket-storage-report-slx.yaml │ │ │ └── aws-s3-bucket-storage-report-taskset.yaml │ ├── README.md │ ├── auth.sh │ ├── check_aws_s3_bucket_storage_utilization.sh │ ├── meta.yaml │ └── runbook.robot ├── azure-acr-image-sync │ ├── README.md │ ├── acr_sync_images.sh │ ├── check_for_image_updates.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── azure-adf-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-adf-health.yaml │ │ └── templates │ │ │ ├── azure-adf-health-sli.yaml │ │ │ ├── azure-adf-health-slx.yaml │ │ │ ├── azure-adf-health-taskset.yaml │ │ │ └── azure-adf-health-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── fail-sim-pipeline.json │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── adf_details.sh │ ├── data_volume_audit.sh │ ├── error_patterns.json │ ├── error_trend.sh │ ├── failed_pipeline.sh │ ├── long_pipeline_runs.sh │ ├── resource_health.sh │ ├── runbook.robot │ └── sli.robot ├── azure-aks-triage │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-aks-triage.yaml │ │ └── templates │ │ │ ├── azure-aks-triage-sli.yaml │ │ │ ├── azure-aks-triage-slx.yaml │ │ │ ├── azure-aks-triage-taskset.yaml │ │ │ └── azure-aks-triage-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── 
provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── aks_activities.sh │ ├── aks_cluster_health.sh │ ├── aks_network.sh │ ├── aks_resource_health.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── azure-apim-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-apim-health.yaml │ │ └── templates │ │ │ ├── azure-apim-health-sli.yaml │ │ │ ├── azure-apim-health-slx.yaml │ │ │ ├── azure-apim-health-taskset.yaml │ │ │ └── azure-apim-health-workflow.yaml │ ├── .test │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── apim_diagnostic_logs.sh │ ├── apim_metrics.sh │ ├── apim_policies.sh │ ├── apim_resource_health.sh │ ├── check_apim_ssl_certs.sh │ ├── gather_apim_resource_information.sh │ ├── inspect_apim_dependencies.sh │ ├── runbook.robot │ ├── sli.robot │ └── verify_apim_policies.sh ├── azure-appgateway-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-appgateway-health.yaml │ │ └── templates │ │ │ ├── azure-appgateway-health-sli.yaml │ │ │ ├── azure-appgateway-health-slx.yaml │ │ │ ├── azure-appgateway-health-taskset.yaml │ │ │ └── azure-appgateway-health-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── app_gateway_backend_health.sh │ ├── app_gateway_config_health.sh │ ├── app_gateway_log_analytics.sh │ ├── app_gateway_log_errors.sh │ ├── app_gateway_metrics.sh │ ├── app_gateway_related_resources.sh │ ├── app_gateway_resource_health.sh │ ├── app_gateway_ssl_certs.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── azure-appservice-functionapp-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-appservice-function-health.yaml │ │ └── 
templates │ │ │ ├── azure-appservice-function-health-sli.yaml │ │ │ ├── azure-appservice-function-health-slx.yaml │ │ │ ├── azure-appservice-function-health-taskset.yaml │ │ │ └── azure-appservice-function-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── appservice_activities.sh │ ├── appservice_config_health.sh │ ├── appservice_deployment_health.sh │ ├── appservice_health_metric.sh │ ├── appservice_log_analysis.sh │ ├── appservice_logs.sh │ ├── appservice_plan_utilization_health.sh │ ├── appservice_resource_health.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── azure-appservice-webapp-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-appservice-webapp-health.yaml │ │ └── templates │ │ │ ├── azure-appservice-webapp-health-sli.yaml │ │ │ ├── azure-appservice-webapp-health-slx.yaml │ │ │ ├── azure-appservice-webapp-health-taskset.yaml │ │ │ └── azure-appservice-webapp-health-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── appservice_activities.sh │ ├── appservice_config_health.sh │ ├── appservice_deployment_health.sh │ ├── appservice_health_metric.sh │ ├── appservice_log_analysis.sh │ ├── appservice_logs.sh │ ├── appservice_metric_health.sh │ ├── appservice_resource_health.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── azure-appservice-webapp-ops │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-appservice-webapp-ops.yaml │ │ └── templates │ │ │ ├── azure-appservice-webapp-ops-slx.yaml │ │ │ └── azure-appservice-webapp-ops-taskset.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── 
README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── appservice_logs.sh │ ├── appservice_plan_scaledown.sh │ ├── appservice_plan_scaleup.sh │ ├── appservice_redeploy.sh │ ├── appservice_restart.sh │ ├── appservice_scale_in.sh │ ├── appservice_scale_out.sh │ ├── appservice_slot_swap.sh │ ├── meta.yaml │ └── runbook.robot ├── azure-kv-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-kv-health.yaml │ │ └── templates │ │ │ ├── azure-kv-health-sli.yaml │ │ │ ├── azure-kv-health-slx.yaml │ │ │ ├── azure-kv-health-taskset.yaml │ │ │ └── azure-kv-health-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── availability.sh │ ├── expiry-checks.sh │ ├── kv_config.sh │ ├── kv_resource_health.sh │ ├── log.sh │ ├── performance_metrics.sh │ ├── runbook.robot │ └── sli.robot ├── azure-loadbalancer-triage │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── az-lb-health.yaml │ │ └── templates │ │ │ ├── az-lb-health-slx.yaml │ │ │ └── az-lb-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── azure-servicebus-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-servicebus-health.yaml │ │ └── templates │ │ │ ├── az-servicebus-health-slx.yaml │ │ │ └── az-servicebus-health-taskset.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ ├── configure_security_test.sh │ │ ├── generate_log_activity.sh │ │ ├── generate_traffic.sh │ │ ├── inject_test_messages.sh │ │ ├── setup_connectivity_test.sh │ │ └── terraform │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── main.tf.dr │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ ├── service_bus_alerts_check.sh │ ├── 
service_bus_capacity.sh │ ├── service_bus_config_health.sh │ ├── service_bus_connectivity_test.sh │ ├── service_bus_disaster_recovery.sh │ ├── service_bus_log_analytics.sh │ ├── service_bus_metrics.sh │ ├── service_bus_queue_health.sh │ ├── service_bus_related_resources.sh │ ├── service_bus_resource_health.sh │ ├── service_bus_security_audit.sh │ ├── service_bus_topic_health.sh │ └── sli.robot ├── azure-vmss-triage │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── azure-vmss-triage.yaml │ │ └── templates │ │ │ ├── azure-vmss-triage-sli.yaml │ │ │ ├── azure-vmss-triage-slx.yaml │ │ │ └── azure-vmss-triage-taskset.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── README.md │ │ │ ├── Taskfile.yaml │ │ │ ├── backend.tf │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ ├── terraform.tfvars │ │ │ └── vars.tf │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ ├── sli.robot │ ├── vmss_activities.sh │ ├── vmss_config.sh │ └── vmss_metrics.sh ├── curl-gmp-kong-ingress-inspection │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── curl-gmp-nginx-ingress-inspection │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── curl-http-ok │ ├── .runwhen │ │ ├── generation-rules │ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls.yaml │ │ │ ├── http-ok-tls.yaml │ │ │ └── http-ok.yaml │ │ └── templates │ │ │ ├── http-ok-sli.yaml │ │ │ ├── http-ok-slo.yaml │ │ │ ├── http-ok-slx.yaml │ │ │ ├── http-ok-taskset.yaml │ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls-sli.yaml │ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls-slx.yaml │ │ │ ├── http-ok-tls-aks-public-loadbalancer-ext-dns-tls-taskset.yaml │ │ │ ├── http-ok-tls-sli.yaml │ │ │ ├── http-ok-tls-slo.yaml │ │ │ ├── http-ok-tls-slx.yaml │ │ │ └── http-ok-tls-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── gcloud-log-inspection │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── gcloud-node-preempt │ ├── .runwhen │ │ ├── 
generation-rules │ │ │ └── gcloud-node-preempt.yaml │ │ └── templates │ │ │ ├── gcloud-node-preempt-sli.yaml │ │ │ ├── gcloud-node-preempt-slo.yaml │ │ │ ├── gcloud-node-preempt-slx.yaml │ │ │ └── gcloud-node-preempt-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── gcp-bucket-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── gcp-bucket-health.yaml │ │ └── templates │ │ │ ├── gcp-bucket-health-sli.yaml │ │ │ ├── gcp-bucket-health-slo.yaml │ │ │ ├── gcp-bucket-health-slx.yaml │ │ │ └── gcp-bucket-health-taskset.yaml │ ├── README.md │ ├── bucket_details.sh │ ├── bucket_ops_costs.sh │ ├── bucket_size.sh │ ├── check_security.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── gcp-cloud-function-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── gcp-cloud-function-health.yaml │ │ └── templates │ │ │ ├── gcp-cloud-function-health-sli.yaml │ │ │ ├── gcp-cloud-function-health-slo.yaml │ │ │ ├── gcp-cloud-function-health-slx.yaml │ │ │ └── gcp-cloud-function-health-taskset.yaml │ ├── README.md │ ├── cloud_functions_next_steps.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── gh-actions-artifact-analysis │ ├── README.md │ ├── gh_actions_artifact_analysis.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── gh-actions-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── gh-actions-health.yaml │ │ └── templates │ │ │ ├── gh-actions-health-sli.yaml │ │ │ ├── gh-actions-health-slx.yaml │ │ │ └── gh-actions-health-taskset.yaml │ ├── README.md │ ├── calculate_org_sli.sh │ ├── calculate_performance_sli.sh │ ├── calculate_rate_limit_sli.sh │ ├── calculate_runner_sli.sh │ ├── calculate_security_sli.sh │ ├── calculate_workflow_sli.sh │ ├── check_billing_usage.sh │ ├── check_long_running_workflows.sh │ ├── check_org_workflow_health.sh │ ├── check_rate_limits.sh │ ├── check_repo_health_summary.sh │ ├── check_runner_health.sh │ ├── check_security_workflows.sh │ ├── check_workflow_failures.sh │ ├── meta.yaml │ ├── 
runbook.robot │ └── sli.robot ├── gke-cluster-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── gke-cluster-health.yaml │ │ └── templates │ │ │ ├── gke-cluster-health-sli.yaml │ │ │ ├── gke-cluster-health-slx.yaml │ │ │ ├── gke-cluster-health-taskset.yaml │ │ │ └── gke-cluster-health-workflow.yaml │ ├── .test │ │ ├── README.md │ │ └── Taskfile.yaml │ ├── README.md │ ├── cluster_health.sh │ ├── cluster_operations.sh │ ├── gcp_recommendations.sh │ ├── gke_node_size.py │ ├── node_pool_health.sh │ ├── quota_check.sh │ ├── runbook.robot │ ├── sa_check.sh │ └── sli.robot ├── jenkins-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── jenkins-instance-health.yaml │ │ └── templates │ │ │ ├── jenkins-instance-health-sli.yaml │ │ │ ├── jenkins-instance-health-slx.yaml │ │ │ └── jenkins-instance-health-taskset.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── Taskfile.yaml │ │ │ ├── create_jenkins_token.sh │ │ │ ├── failed-job.xml │ │ │ ├── failed-pipeline.xml │ │ │ ├── long-running-job.xml │ │ │ ├── main.tf │ │ │ ├── provider.tf │ │ │ └── python-docker-pipeline.xml │ ├── README.md │ ├── error_patterns.json │ ├── failed_build_logs.sh │ ├── long_running_builds.sh │ ├── runbook.robot │ └── sli.robot ├── k8s-app-troubleshoot │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-app-troubleshoot.yaml │ │ └── templates │ │ │ ├── k8s-app-troubleshoot-sli.yaml │ │ │ ├── k8s-app-troubleshoot-slo.yaml │ │ │ ├── k8s-app-troubleshoot-slx.yaml │ │ │ └── k8s-app-troubleshoot-taskset.yaml │ ├── README.md │ ├── env_check.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── k8s-application-log-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ ├── k8s-deployment-logs-health.yaml │ │ │ └── k8s-statefulset-logs-health.yaml │ │ └── templates │ │ │ ├── k8s-deployment-logs-health-sli.yaml │ │ │ ├── k8s-deployment-logs-health-slx.yaml │ │ │ ├── k8s-deployment-logs-health-taskset.yaml │ │ │ ├── k8s-deployment-logs-health-workflow.yaml │ │ │ ├── 
k8s-ss-logs-health-sli.yaml │ │ │ ├── k8s-ss-logs-health-slx.yaml │ │ │ ├── k8s-ss-logs-taskset.yaml │ │ │ └── k8s-statefulset-logs-health-workflow.yaml │ ├── .test │ │ └── Taskfile.yaml │ ├── README.md │ ├── error_patterns.json │ ├── get_pod_logs_for_workload.sh │ ├── ignore_patterns.json │ ├── meta.yaml │ ├── runbook.robot │ ├── scan_application_restarts.sh │ ├── scan_auth_failures.sh │ ├── scan_connection_failures.sh │ ├── scan_error_logs.sh │ ├── scan_log_anomalies.sh │ ├── scan_logs.py │ ├── scan_null_pointer_exceptions.sh │ ├── scan_resource_warnings.sh │ ├── scan_service_dependency_failures.sh │ ├── scan_stack_traces.sh │ ├── scan_timeout_errors.sh │ ├── sli.robot │ └── summarize.py ├── k8s-argocd-application-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-argocd-application-health.yaml │ │ └── templates │ │ │ ├── k8s-argocd-application-health-cli-taskset.yaml │ │ │ └── k8s-argocd-application-health-slx.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-argocd-helm-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-argocd-helm-health.yaml │ │ └── templates │ │ │ ├── k8s-argocd-helm-health-slx.yaml │ │ │ └── k8s-argocd-helm-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-artifactory-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-artifactory.yaml │ │ └── templates │ │ │ ├── k8s-artifactory-healthcheck-slx.yaml │ │ │ └── k8s-artifactory-healthcheck-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-certmanager-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-certmanager-certificates-health.yaml │ │ └── templates │ │ │ ├── k8s-certmanager-certificate-health-sli.yaml │ │ │ ├── k8s-certmanager-certificate-health-slo.yaml │ │ │ ├── k8s-certmanager-certificate-health-slx.yaml │ │ │ ├── k8s-certmanager-certificate-health-taskset.yaml │ │ │ └── k8s-certmanager-certificate-health-workflow.yaml │ ├── README.md │ ├── certificate_next_steps.sh │ ├── 
meta.yaml │ ├── runbook.robot │ └── sli.robot ├── k8s-chaos-flux │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-chaos-namespace │ ├── README.md │ ├── auth.sh │ ├── change_service_port.sh │ ├── change_service_selector.sh │ ├── delete_random_pods.sh │ ├── drain_node.sh │ ├── expand_tmp.sh │ ├── meta.yaml │ ├── oomkill_pod.sh │ └── runbook.robot ├── k8s-chaos-workload │ ├── README.md │ ├── auth.sh │ ├── change_service_port.sh │ ├── change_service_selector.sh │ ├── expand_tmp.sh │ ├── kill_workload_pod.sh │ ├── meta.yaml │ ├── oomkill_workload_pod.sh │ └── runbook.robot ├── k8s-cluster-node-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-cluster-node-health.yaml │ │ └── templates │ │ │ ├── k8s-cluster-node-health-sli.yaml │ │ │ ├── k8s-cluster-node-health-slx.yaml │ │ │ ├── k8s-cluster-node-health-taskset.yaml │ │ │ └── k8s-cluster-node-health-workflow.yaml │ ├── README.md │ ├── meta.yaml │ ├── node_restart_check.sh │ ├── runbook.robot │ └── sli.robot ├── k8s-cluster-resource-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-cluster-resource-health.yaml │ │ └── templates │ │ │ ├── k8s-cluster-resource-health-sli.yaml │ │ │ ├── k8s-cluster-resource-health-slx.yaml │ │ │ └── k8s-cluster-resource-health-taskset.yaml │ ├── README.md │ ├── get_high_use_nodes.sh │ ├── meta.yaml │ ├── overlimit_check.sh │ ├── pods_impacting_high_use_nodes.sh │ ├── runbook.robot │ └── sli.robot ├── k8s-daemonset-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-daemonset-health.yaml │ │ └── templates │ │ │ ├── k8s-daemonset-health-slx.yaml │ │ │ └── k8s-daemonset-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ ├── validate_probes.sh │ ├── workload_issues.sh │ └── workload_next_steps.sh ├── k8s-deployment-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-deployment-health.yaml │ │ └── templates │ │ │ ├── k8s-deployment-health-slx.yaml │ │ │ └── k8s-deployment-health-taskset.yaml │ ├── .test │ │ └── 
Taskfile.yaml │ ├── README.md │ ├── check_replicaset.sh │ ├── container_restarts.sh │ ├── deployment_logs.sh │ ├── event_anomalies.sh │ ├── meta.yaml │ ├── runbook.robot │ ├── validate_probes.sh │ ├── workload_issues.sh │ └── workload_next_steps.sh ├── k8s-deployment-ops │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-deployment-ops.yaml │ │ └── templates │ │ │ ├── k8s-deployment-ops-slx.yaml │ │ │ └── k8s-deployment-ops-taskset.yaml │ ├── .test │ │ └── Taskfile.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-fluxcd-helm-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-flux-helm-health.yaml │ │ └── templates │ │ │ ├── k8s-flux-helm-health-slx.yaml │ │ │ └── k8s-flux-helm-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-fluxcd-kustomization-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-flux-kustomization-health.yaml │ │ └── templates │ │ │ ├── k8s-flux-kustomize-health-sli.yaml │ │ │ ├── k8s-flux-kustomize-health-slx.yaml │ │ │ └── k8s-flux-kustomize-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ ├── sli.robot │ └── workload_next_steps.sh ├── k8s-fluxcd-reconcile │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-fluxcd-reconcile.yaml │ │ └── templates │ │ │ ├── k8s-fluxcd-reconcile-sli.yaml │ │ │ ├── k8s-fluxcd-reconcile-slo.yaml │ │ │ ├── k8s-fluxcd-reconcile-slx.yaml │ │ │ └── k8s-fluxcd-reconcile-taskset.yaml │ ├── README.md │ ├── flux_reconcile_report.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── k8s-gitops-gh-remediate │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-gitops-gh-remediate.yaml │ │ └── templates │ │ │ ├── k8s-gitops-gh-remediate-slx.yaml │ │ │ └── k8s-gitops-gh-remediate-taskset.yaml │ ├── .test │ │ └── README.md │ ├── README.md │ ├── meta.yaml │ ├── pvc_utilization_check.sh │ ├── resource_quota_check.sh │ ├── runbook.robot │ ├── update_github_manifests.sh │ ├── validate_all_probes.sh │ └── vpa_recommendations.sh ├── 
k8s-image-check │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-image-health.yaml │ │ └── templates │ │ │ ├── k8s-image-check-slx.yaml │ │ │ └── k8s-image-check-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-ingress-gce-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-ingress-gce-healthcheck │ │ └── templates │ │ │ ├── k8s-ingress-gce-healthcheck-slx.yaml │ │ │ └── k8s-ingress-gce-healthcheck-taskset.yaml │ ├── README.md │ ├── check_gce_ingress_objects.sh │ ├── meta.yaml │ └── runbook.robot ├── k8s-ingress-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-ingress-health .yaml │ │ └── templates │ │ │ ├── k8s-ingress-healthcheck-slx.yaml │ │ │ └── k8s-ingress-healthcheck-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-istio-system-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-istio-system-health.yaml │ │ └── templates │ │ │ ├── k8s-istio-system-health-sli.yaml │ │ │ ├── k8s-istio-system-health-slx.yaml │ │ │ └── k8s-istio-system-health-taskset.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── terraform │ │ │ ├── Taskfile.yaml │ │ │ ├── book-info.yaml │ │ │ ├── bookinfo-gateway.yaml │ │ │ ├── bookinfo │ │ │ ├── bookinfo.yaml │ │ │ └── fault-injection-details-v1.yaml │ │ │ ├── dr-bookinfo.yaml │ │ │ ├── faulty-gateway.yaml │ │ │ ├── kubeconfig-sa-token.yaml │ │ │ ├── main.tf │ │ │ ├── outputs.tf │ │ │ ├── provider.tf │ │ │ ├── standard-install.yaml │ │ │ ├── variables.tf │ │ │ └── versions.tf │ ├── README.md │ ├── analyze_istio_configurations.sh │ ├── check_istio_injection.sh │ ├── controlplane_error_patterns.json │ ├── istio_controlplane_logs.sh │ ├── istio_installation_verify.sh │ ├── istio_mtls_check.sh │ ├── istio_proxy_logs.sh │ ├── istio_sidecar_injection_report.sh │ ├── istio_sidecar_resource_usage.sh │ ├── proxy_error_patterns.json │ ├── runbook.robot │ └── sli.robot ├── k8s-jaeger-http-query │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── 
k8s-jaeger-http-query.yaml │ │ └── templates │ │ │ ├── k8s-jaeger-http-query-slx.yaml │ │ │ └── k8s-jaeger-http-query-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── query_jaeger_http_errors.sh │ └── runbook.robot ├── k8s-jenkins-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-jenkins-healthcheck.yaml │ │ └── templates │ │ │ ├── k8s-jenkins-health-slx.yaml │ │ │ └── k8s-jenkins-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-labeledpods-healthcheck │ └── sli.robot ├── k8s-loki-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-loki-healthcheck.yaml │ │ └── templates │ │ │ ├── k8s-loki-healthcheck-slx.yaml │ │ │ └── k8s-loki-healthcheck-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-namespace-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-namespace-healthcheck.yaml │ │ └── templates │ │ │ ├── k8s-namespace-healthcheck-sli.yaml │ │ │ ├── k8s-namespace-healthcheck-slo.yaml │ │ │ ├── k8s-namespace-healthcheck-slx.yaml │ │ │ ├── k8s-namespace-healthcheck-taskset.yaml │ │ │ └── k8s-namespace-healthcheck-workflow.yaml │ ├── README.md │ ├── container_restarts.sh │ ├── find_resource_owners.sh │ ├── meta.yaml │ ├── resource_quota_check.sh │ ├── runbook.robot │ ├── sli.robot │ ├── workload_issues.sh │ └── workload_next_steps.sh ├── k8s-otelcollector │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-otelcollector.yaml │ │ └── templates │ │ │ ├── k8s-otelcollector-slx.yaml │ │ │ └── k8s-otelcollector-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── otel_dropped_check.sh │ ├── otel_error_check.sh │ ├── otel_metrics_check.sh │ └── runbook.robot ├── k8s-podresources-health │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-pod-resources.yaml │ │ └── templates │ │ │ ├── k8s-pod-resources-slx.yaml │ │ │ └── k8s-pod-resources-taskset.yaml │ ├── find_resource_owners.sh │ ├── identify_resource_contrained_pods.sh │ ├── meta.yaml │ ├── runbook.robot │ └── 
vpa_recommendations.sh ├── k8s-postgres-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ ├── k8s-postgres-healthcheck-crunchy.yaml │ │ │ └── k8s-postgres-healthcheck-zalando.yaml │ │ └── templates │ │ │ ├── k8s-postgres-healthcheck-crunchy-sli.yaml │ │ │ ├── k8s-postgres-healthcheck-crunchy-slx.yaml │ │ │ ├── k8s-postgres-healthcheck-crunchy-taskset.yaml │ │ │ ├── k8s-postgres-healthcheck-zalando-sli.yaml │ │ │ ├── k8s-postgres-healthcheck-zalando-slx.yaml │ │ │ └── k8s-postgres-healthcheck-zalando-taskset.yaml │ ├── backup_health.sh │ ├── config_health.sh │ ├── dbquery.sh │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── k8s-prometheus-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-prometheus-healthcheck.yaml │ │ └── templates │ │ │ ├── k8s-prometheus-healthcheck-slx.yaml │ │ │ └── k8s-prometheus-healthcheck-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ └── validate_servicemonitors.sh ├── k8s-pvc-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-pvc-healthcheck.yaml │ │ └── templates │ │ │ ├── k8s-pvc-healthcheck-sli.yaml │ │ │ ├── k8s-pvc-healthcheck-slx.yaml │ │ │ ├── k8s-pvc-healthcheck-taskset.yaml │ │ │ └── k8s-pvc-healthcheck-workflow.yaml │ ├── .test │ │ ├── README.md │ │ ├── Taskfile.yaml │ │ └── kubernetes │ │ │ └── mainfest.yaml │ ├── README.md │ ├── meta.yaml │ ├── pvc_utilization_check.sh │ ├── runbook.robot │ ├── sli.robot │ └── storage_next_steps.sh ├── k8s-redis-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-redis-healthcheck.yaml │ │ └── templates │ │ │ ├── k8s-redis-health-slx.yaml │ │ │ └── k8s-redis-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-restart-resource │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-serviceaccount-check │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-serviceaccount-check.yaml │ │ └── templates │ │ │ ├── k8s-serviceaccount-check-slx.yaml │ │ │ └── 
k8s-serviceaccount-check-taskset.yaml │ ├── README.md │ ├── meta.yaml │ └── runbook.robot ├── k8s-statefulset-healthcheck │ ├── .runwhen │ │ ├── generation-rules │ │ │ └── k8s-statefulset-health.yaml │ │ └── templates │ │ │ ├── k8s-statefulset-health-slx.yaml │ │ │ └── k8s-statefulset-health-taskset.yaml │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ ├── validate_probes.sh │ ├── workload_issues.sh │ └── workload_next_steps.sh ├── k8s-tail-logs-dynamic │ ├── README.md │ ├── meta.yaml │ ├── runbook.robot │ └── sli.robot ├── k8s-vault-healthcheck │ ├── README.md │ ├── meta.yaml │ └── runbook.robot └── terraform-cloud-workspace-lock-check │ ├── meta.yaml │ └── runbook.robot ├── extras └── lnav │ └── formats │ └── http_logrus_custom.json ├── interactive_console_output.xml ├── libraries ├── .docs │ ├── CLI.md │ ├── Suggest.md │ ├── _test_parsers.md │ ├── cli_utils.md │ ├── json_parser.md │ ├── k8s_applications.md │ ├── k8s_helper.md │ ├── local_process.md │ ├── migrations_inspector.md │ ├── parsers.md │ ├── postgres_helper.md │ ├── repository.md │ └── stdout_parser.md ├── Jenkins │ ├── __init__.py │ └── jenkins.py ├── RW │ ├── CLI │ │ ├── CLI.py │ │ ├── __init__.py │ │ ├── cli_utils.py │ │ ├── json_parser.py │ │ ├── local_process.py │ │ ├── postgres_helper.py │ │ └── stdout_parser.py │ ├── K8sApplications │ │ ├── __init__.py │ │ ├── _test_parsers.py │ │ ├── k8s_applications.py │ │ ├── migrations_inspector.py │ │ ├── no_stacktraces_report.jinja2 │ │ ├── parsers.py │ │ ├── repository.py │ │ ├── simple_stacktrace_report.jinja2 │ │ ├── test.sh │ │ └── test_data │ │ │ ├── djangojson.log │ │ │ ├── golang.log │ │ │ ├── java.log │ │ │ ├── node.log │ │ │ └── python.log │ ├── K8sHelper │ │ ├── __init__.py │ │ └── k8s_helper.py │ ├── K8sLog │ │ ├── __init__.py │ │ └── k8s_log.py │ ├── NextSteps │ │ ├── Kubernetes │ │ │ └── mapping.yaml │ │ ├── Suggest.py │ │ └── __init__.py │ └── __init__.py └── __init__.py ├── pyproject.toml ├── requirements.txt └── task_analysis.json 
/.gitbook.yaml: -------------------------------------------------------------------------------- 1 | root: ./ 2 | structure: 3 | readme: ./Introduction.md 4 | summary: ./SUMMARY.md -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is a comment. 2 | # Each line is a file pattern followed by one or more owners. 3 | # Read more: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners 4 | 5 | # These owners will be the default owners for everything in 6 | # the repo. Unless a later match takes precedence, 7 | # these owners will be requested for 8 | # review when someone opens a pull request. 9 | * @runwhen-contrib/runwhen-team 10 | 11 | # Order is important; the last matching pattern takes the most 12 | # precedence. 13 | 14 | # When someone opens a pull request that only 15 | # modifies JS files, only @js-owner and not the global 16 | # owner(s) will be requested for a review. 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/codebundle.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Codebundle 3 | about: The scope for a new codebundle 4 | title: "[Codebundle]" 5 | labels: 6 | assignees: '' 7 | --- 8 | 9 | **Scope** 10 | Describe the scope and purpose of the codebundle. Be specific about its functionality and what it aims to achieve. 11 | 12 | **Requested By (Optional)** 13 | - N/A 14 | 15 | **Related Integrations** 16 | Example: 17 | - Kubernetes 18 | - Nginx 19 | 20 | **Definition of Done** 21 | - [ ] Codebundle Implementation accomplishes scope (or provide a justification for why it may differ) 22 | - [ ] Attach a screenshot of the codebundle pushing a metric locally (if an SLI) or showing a report (if a taskset). 
23 | - [ ] Attach a screenshot of the SLI or Taskset running in a workspace. 24 | 25 | **Optional Comments** 26 | If you have any further requests, or comments regarding this codebundle, add them here. -------------------------------------------------------------------------------- /.github/queries/addDiscussionComment.graphql: -------------------------------------------------------------------------------- 1 | mutation AddDiscussionComment($discussion_id: ID!) { 2 | addDiscussionComment(input: { discussionId: $discussion_id, body: "Possibly outdated or obsolete. Please review." }) { 3 | comment { 4 | id 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /.github/queries/createDiscussion.graphql: -------------------------------------------------------------------------------- 1 | mutation CreateDiscussion($repo_id: ID!, $codebundle: String!, $discussion_body: String!, $category_id: ID!) { 2 | createDiscussion(input: { repositoryId: $repo_id, title: $codebundle, body: $discussion_body, categoryId: $category_id }) { 3 | discussion { 4 | id 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /.github/queries/deleteDiscussion.graphql: -------------------------------------------------------------------------------- 1 | mutation DeleteDiscussion($discussion_id: ID!) { 2 | deleteDiscussion(input: { id: $discussion_id }) { 3 | clientMutationId 4 | } 5 | } -------------------------------------------------------------------------------- /.github/queries/getComments.graphql: -------------------------------------------------------------------------------- 1 | query GetComments($discussion_id: ID!) { 2 | node(id: $discussion_id) { 3 | ... 
on Discussion { 4 | comments(first: 100) { 5 | edges { 6 | node { 7 | body 8 | } 9 | } 10 | } 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /.github/queries/getDiscussion.graphql: -------------------------------------------------------------------------------- 1 | query GetDiscussion($discussion_id: ID!) { 2 | discussion:node(id: $discussion_id) { 3 | ... on Discussion { 4 | repository { 5 | id 6 | } 7 | category { 8 | id 9 | } 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /.github/queries/searchDiscussions.graphql: -------------------------------------------------------------------------------- 1 | query SearchDiscussions($searchQuery: String!) { 2 | search(query: $searchQuery, type: REPOSITORY, first: 100) { 3 | edges { 4 | node { 5 | ... on Repository { 6 | discussions(first: 100) { 7 | nodes { 8 | id 9 | title 10 | } 11 | } 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /.github/scripts/index-config.yaml: -------------------------------------------------------------------------------- 1 | # in the repos, the key specifies the name of the temp directory 2 | repos: 3 | rw-cli-codecollection: https://github.com/runwhen-contrib/rw-cli-codecollection.git 4 | robot_file_pattern: 5 | codebundles: .robot -------------------------------------------------------------------------------- /.github/scripts/pydoc2md.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ====================================================================================== 3 | # Synopsis: a script for generating markdown files from python docstrings in a chosen directory 4 | # Usage: pydoc2md.sh <src_dir> <md_dir> -- md_dir is prepended verbatim to each output filename, so pass it with a trailing slash 5 | function main (){ 6 | src_dir=$1 7 | md_dir=$2 8 | pyfiles=$(find "$src_dir" -name "*.py" | grep -v "__init__") 9 | echo "Generating documentation for files:" 10 | echo "$pyfiles" 11 | 
for pyfile in $pyfiles; do 12 | module_path=$(echo "$pyfile" | sed -e 's|/|.|g' -e 's|\.py$||') 13 | markdown_path="${pyfile%.py}.md" 14 | markdown_filename=$(basename "$markdown_path") 15 | pydoc-markdown -m "$module_path" > "$md_dir$markdown_filename" 16 | done 17 | } 18 | main "$@" -------------------------------------------------------------------------------- /.github/scripts/reference_scores.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "task": "Check EC2 Health", 4 | "score": 1, 5 | "reasoning": "Lacks specificity about what is being checked and where." 6 | }, 7 | { 8 | "task": "Check For Overutilized EC2 Instances", 9 | "score": 2, 10 | "reasoning": "Provides more detail about 'what' is being checked, but lacks a location." 11 | }, 12 | { 13 | "task": "Check For Overutilized EC2 Instances in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", 14 | "score": 5, 15 | "reasoning": "Fully detailed, including both what (overutilized EC2 instances) and where (specific AWS region and account)." 
16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | # .github/release.yml 2 | # see https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes#configuring-automatically-generated-release-notes 3 | 4 | changelog: 5 | exclude: 6 | labels: 7 | - ignore-for-release 8 | authors: 9 | - octocat 10 | - github-actions 11 | categories: 12 | - title: Breaking Changes 🛠 13 | labels: 14 | - Semver-Major 15 | - breaking-change 16 | - title: Exciting New Features 🎉 17 | labels: 18 | - Semver-Minor 19 | - enhancement 20 | - title: Other Changes 21 | labels: 22 | - "*" -------------------------------------------------------------------------------- /.github/workflows/score_pr.yaml: -------------------------------------------------------------------------------- 1 | name: Score CodeCollection 2 | on: 3 | workflow_dispatch: 4 | pull_request: 5 | paths: 6 | - "codebundles/**" 7 | - ".github/workflows/score.yaml" 8 | - "!src/VERSION" 9 | 10 | permissions: 11 | contents: write 12 | pull-requests: write 13 | 14 | jobs: 15 | score-codebundles: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Check out the repo 19 | uses: actions/checkout@v3 20 | with: 21 | fetch-depth: 0 22 | 23 | - name: Set Git user 24 | run: | 25 | git config user.name "github-actions[bot]" 26 | git config user.email "github-actions[bot]@users.noreply.github.com" 27 | 28 | - uses: runwhen-contrib/github-actions/codecollection-score@main 29 | with: 30 | directory: . 
31 | apply_suggestions: true 32 | only_changed: true 33 | env: 34 | GH_TOKEN: ${{ github.token }} 35 | -------------------------------------------------------------------------------- /Introduction.md: -------------------------------------------------------------------------------- 1 | # RunWhen CLI CodeCollection 2 | Welcome to the documentation for the `rw-cli-codecollection` which contains codebundles specialized for CLI-based actions. There are 2 key sections: 3 | 4 | * Codebundles: Contains information on configuring and running the codebundles in this codecollection. 5 | * Keywords: Contains documentation for authors looking to use the keywords implemented in this codecollection for their own codebundles. 6 | 7 | > Note: keywords from this codecollection can be installed via pip from [pypi](https://pypi.org/project/runwhen-cli-keywords/) -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the top-level files frequently used in a package 2 | include README.md 3 | include VERSION 4 | include requirements.txt 5 | include LICENSE 6 | 7 | # Include everything within the RW directory 8 | graft RW 9 | 10 | # Exclude common unwanted patterns 11 | exclude *.py[cod] 12 | exclude __pycache__ 13 | exclude .DS_Store 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 |
4 | 5 | Join Slack 6 | 7 |

8 | 9 | # CodeCollection Registry 10 | To explore all CodeCollections and tasks, please visit the [CodeCollection Registry](https://registry.runwhen.com/). 11 | 12 | [![Explore CodeCollection Registry](https://storage.googleapis.com/runwhen-nonprod-shared-images/screenshots/registry.png)](https://registry.runwhen.com) 13 | 14 | ## RunWhen CLI Codecollection 15 | This repository is **one of many** CodeCollections that is used with the [RunWhen Platform](https://www.runwhen.com) and [RunWhen Local](https://docs.runwhen.com/public/v/runwhen-local). It contains CodeBundles that are maintained by the RunWhen team and perform health, operational, and troubleshooting tasks. 16 | 17 | Please see the **[contributing](CONTRIBUTING.md)** and **[code of conduct](CODE_OF_CONDUCT.md)** for details on adding your contributions to this project. -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.24 2 | -------------------------------------------------------------------------------- /codebundles/aws-cloudwatch-overused-ec2/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/aws-eks-health/.runwhen/generation-rules/aws-eks-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: aws 5 | generationRules: 6 | - resourceTypes: 7 | - aws_eks_clusters 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: aws-eks-health 15 | qualifiers: ["resource"] 16 | baseTemplateName: aws-eks-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: 
aws-eks-health-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/aws-eks-health/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fail fast: the access key, secret key, and region must all be set, otherwise print an error and exit 1 4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then 5 | echo "AWS credentials or region not set. Please set the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION environment variables." 6 | exit 1 7 | fi 8 | 9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs 10 | if [[ -n $AWS_ROLE_ARN ]]; then 11 | sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") 12 | AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId') 13 | AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey') 14 | AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken') 15 | export AWS_ACCESS_KEY_ID 16 | export AWS_SECRET_ACCESS_KEY 17 | export AWS_SESSION_TOKEN 18 | fi 19 | -------------------------------------------------------------------------------- /codebundles/aws-eks-health/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/aws-eks-node-reboot/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/aws-elasticache-redis-health/.runwhen/generation-rules/aws-elasticache-redis-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: aws 5 | generationRules: 6 | - resourceTypes: 7 | # - 
aws_elasticache_serverless_caches 8 | - aws_elasticache_clusters 9 | matchRules: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | slxs: 15 | - baseName: aws-elasticache-redis-health 16 | qualifiers: ["resource"] 17 | baseTemplateName: aws-elasticache-redis-health 18 | levelOfDetail: basic 19 | outputItems: 20 | - type: slx 21 | - type: sli 22 | - type: runbook 23 | templateName: aws-elasticache-redis-health-taskset.yaml 24 | -------------------------------------------------------------------------------- /codebundles/aws-elasticache-redis-health/.runwhen/templates/aws-elasticache-redis-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/elasticache.png 11 | alias: AWS Elasticache Caches in {{match_resource.resource.region}} 12 | asMeasuredBy: The number of unavailable Elasticache serverless caches in {{match_resource.resource.region}} 13 | configProvided: 14 | - name: SLX_PLACEHOLDER 15 | value: SLX_PLACEHOLDER 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: The number of unavailable Elasticache serverless caches should be 0. 
19 | additionalContext: 20 | {% include "aws.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "aws-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/aws-elasticache-redis-health/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fail fast: the access key, secret key, and region must all be set, otherwise print an error and exit 1 4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then 5 | echo "AWS credentials or region not set. Please set the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION environment variables." 6 | exit 1 7 | fi 8 | 9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs 10 | if [[ -n $AWS_ROLE_ARN ]]; then 11 | sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") 12 | AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId') 13 | AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey') 14 | AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken') 15 | export AWS_ACCESS_KEY_ID 16 | export AWS_SECRET_ACCESS_KEY 17 | export AWS_SESSION_TOKEN 18 | fi 19 | -------------------------------------------------------------------------------- /codebundles/aws-elasticache-redis-health/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/aws-lambda-health/.runwhen/generation-rules/aws-lambda-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: aws 5 | generationRules: 6 | - resourceTypes: 7 | - aws_lambda_functions 8 | matchRules: 9 | - 
type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: aws-lambda-health 15 | qualifiers: ["resource"] 16 | baseTemplateName: aws-lambda-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: aws-lambda-health-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/aws-lambda-health/.runwhen/templates/aws-lambda-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/lambda.png 11 | alias: AWS Lambda Health For Region {{match_resource.resource.region}} 12 | asMeasuredBy: The number of Failed AWS Lambdas in region {{match_resource.resource.region}} 13 | configProvided: 14 | - name: SLX_PLACEHOLDER 15 | value: SLX_PLACEHOLDER 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: The total count of failing Lambdas should be 0. 19 | additionalContext: 20 | {% include "aws.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "aws-tags.yaml" ignore missing %} 24 | - name: service 25 | value: lambda 26 | - name: access 27 | value: read-only -------------------------------------------------------------------------------- /codebundles/aws-lambda-health/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fail fast: the access key, secret key, and region must all be set, otherwise print an error and exit 1 4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then 5 | echo "AWS credentials or region not set. 
Please set the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION environment variables." 6 | exit 1 7 | fi 8 | 9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs 10 | if [[ -n $AWS_ROLE_ARN ]]; then 11 | sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") 12 | AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId') 13 | AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey') 14 | AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken') 15 | export AWS_ACCESS_KEY_ID 16 | export AWS_SECRET_ACCESS_KEY 17 | export AWS_SESSION_TOKEN 18 | fi 19 | -------------------------------------------------------------------------------- /codebundles/aws-lambda-health/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/aws-s3-bucket-storage-report/.runwhen/generation-rules/aws-s3-bucket-storage-report.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: aws 5 | generationRules: 6 | - resourceTypes: 7 | - aws_s3_buckets 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: aws-s3-bucket-storage-report 15 | qualifiers: ["resource"] 16 | baseTemplateName: aws-s3-bucket-storage-report 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: runbook 21 | templateName: aws-s3-bucket-storage-report-taskset.yaml 22 | -------------------------------------------------------------------------------- /codebundles/aws-s3-bucket-storage-report/.runwhen/templates/aws-s3-bucket-storage-report-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 
runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/s3.png 11 | alias: AWS S3 in {{match_resource.resource.region}} 12 | asMeasuredBy: Availability of S3 in {{match_resource.resource.region}} 13 | configProvided: 14 | - name: SLX_PLACEHOLDER 15 | value: SLX_PLACEHOLDER 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: S3 buckets in {{match_resource.resource.region}} should be available. 19 | additionalContext: 20 | {% include "aws.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "aws-tags.yaml" ignore missing %} 24 | - name: category 25 | value: storage 26 | - name: access 27 | value: read-only -------------------------------------------------------------------------------- /codebundles/aws-s3-bucket-storage-report/README.md: -------------------------------------------------------------------------------- 1 | # aws-s3-bucket-storage-report CodeBundle 2 | ### Tags:`AWS`, `S3 Bucket`, `Storage Issue` 3 | ## CodeBundle Objective: 4 | Outputs the current usage values of all S3 buckets in a given AWS region, and the number of objects stored in them. 5 | 6 | ## CodeBundle Inputs: 7 | 8 | export AWS_REGION="PLACEHOLDER" 9 | export AWS_ACCESS_KEY_ID="PLACEHOLDER" 10 | export AWS_SECRET_ACCESS_KEY="PLACEHOLDER" 11 | 12 | 13 | ## CodeBundle Tasks: 14 | ### `Check AWS S3 Bucket Storage Utilization` 15 | #### Tags:`Amazon Web Services`, `AWS S3`, `Bucket Storage` 16 | ### Task Documentation: 17 | This script checks and displays the storage utilization of a specified AWS S3 bucket. It uses the AWS CLI to list all objects in the bucket recursively, displaying the results in a human-readable format and providing a summary of the total storage used. 
18 | #### Usage Example: 19 | `./check_aws_s3_bucket_storage_utilization.sh` 20 | -------------------------------------------------------------------------------- /codebundles/aws-s3-bucket-storage-report/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fail fast: the access key, secret key, and region must all be set, otherwise print an error and exit 1 4 | if [[ -z $AWS_ACCESS_KEY_ID || -z $AWS_SECRET_ACCESS_KEY || -z $AWS_REGION ]]; then 5 | echo "AWS credentials or region not set. Please set the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION environment variables." 6 | exit 1 7 | fi 8 | 9 | # if AWS_ROLE_ARN then assume the role using sts and override the pre-existing key ENVs 10 | if [[ -n $AWS_ROLE_ARN ]]; then 11 | sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession") 12 | AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId') 13 | AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey') 14 | AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken') 15 | export AWS_ACCESS_KEY_ID 16 | export AWS_SECRET_ACCESS_KEY 17 | export AWS_SESSION_TOKEN 18 | fi 19 | -------------------------------------------------------------------------------- /codebundles/aws-s3-bucket-storage-report/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/azure-acr-image-sync/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.runwhen/generation-rules/azure-adf-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | 
generationRules: 6 | - resourceTypes: 7 | - azure_datafactory_factories 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-adf-health 15 | qualifiers: ["resource_group"] 16 | baseTemplateName: azure-adf-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: azure-adf-health-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.runwhen/templates/azure-adf-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "Azure Data Factory SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{ match_resource.resource_group.name }} Azure Data Factory SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{ match_resource.resource_group.name }} Azure Data Factory health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{ match_resource.resource_group.name }}-{{ "Azure Data Factory SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.test/README.md: -------------------------------------------------------------------------------- 1 | # Azure Data Factory Health 2 | This codebundle runs a suite of metrics checks for Data Factory in Azure. 
It identifies: 3 | - 4 | 5 | ## Configuration 6 | 7 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 8 | 9 | - `AZ_USERNAME`: Service principal's client ID 10 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration 11 | - `AZ_TENANT`: The Azure tenancy ID 12 | - `AZ_SUBSCRIPTION`: The Azure subscription ID 13 | 14 | ## Testing 15 | See the .test directory for infrastructure test code. 16 | 17 | ## Notes 18 | 19 | This codebundle assumes the service principal authentication flow -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.test/terraform/fail-sim-pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "activities": [ 4 | { 5 | "name": "FailStep", 6 | "type": "Fail", 7 | "typeProperties": { 8 | "message": "Simulated failure for monitoring.", 9 | "errorCode": 500 10 | } 11 | } 12 | ] 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | tls = { 8 | source = "hashicorp/tls" 9 | version = "~> 4.0" 10 | } 11 | azapi = { 12 | source = "azure/azapi" 13 | version = "2.3.0" 14 | } 15 | } 16 | } 17 | 18 | # Configure the Microsoft Azure Provider 19 | provider "azurerm" { 20 | features {} 21 | } 22 | 23 | provider "azuread" {} 24 | 
provider "tls" {} 25 | provider "azapi" {} 26 | -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-data-factory-health" 2 | name = "adf-hlth" 3 | location = "Canada Central" 4 | table_name = "dbo.NonExistentTable" 5 | tags = { 6 | "env" : "test", 7 | "lifecycle" : "deleteme", 8 | "product" : "runwhen" 9 | } -------------------------------------------------------------------------------- /codebundles/azure-adf-health/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "name" { 6 | type = string 7 | } 8 | 9 | variable "location" { 10 | type = string 11 | default = "East US" 12 | } 13 | 14 | variable "tags" { 15 | type = map(string) 16 | } 17 | 18 | variable "sp_principal_id" { 19 | type = string 20 | } 21 | 22 | variable "tenant_id" { 23 | type = string 24 | } 25 | 26 | # Schema-qualified table name; terraform.tfvars overrides this default with dbo.NonExistentTable 27 | variable "table_name" { 28 | type = string 29 | default = "dbo.CustomerTransactions" 30 | } -------------------------------------------------------------------------------- /codebundles/azure-adf-health/README.md: -------------------------------------------------------------------------------- 1 | # Azure Data Factory Health 2 | This codebundle runs a suite of metrics checks for Data Factory in Azure. It identifies: 3 | - Check Azure Data Factory Availability 4 | 5 | ## Configuration 6 | 7 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 8 | 9 | - `AZURE_SUBSCRIPTION_ID`: The Azure subscription ID 10 | - `AZURE_RESOURCE_GROUP`: The Azure Resource Group 11 | 12 | ## Testing 13 | See the .test directory for infrastructure test code. 
14 | 15 | ## Notes 16 | 17 | This codebundle assumes the service principal authentication flow -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.runwhen/generation-rules/azure-aks-triage.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_containerservice_managed_clusters 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-aks-triage 15 | qualifiers: [resource, resource_group] 16 | baseTemplateName: azure-aks-triage 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: azure-aks-triage-taskset.yaml 23 | - type: workflow -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.runwhen/templates/azure-aks-triage-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/containers/10023-icon-service-Kubernetes-Services.svg 11 | alias: {{match_resource.resource.name}} AKS Health 12 | asMeasuredBy: "As scored by upstream Azure resource health, critical or error activities, and configuration health. 1=Healthy, 0=Unhealthy" 13 | owners: 14 | - {{ workspace.owner_email }} 15 | statement: AKS Clusters should be in a healthy state. 
16 | additionalContext: 17 | {% include "azure-hierarchy.yaml" ignore missing %} 18 | qualified_name: "{{ match_resource.qualified_name }}" 19 | tags: 20 | {% include "azure-tags.yaml" ignore missing %} 21 | - name: platform 22 | value: azure 23 | - name: service 24 | value: aks 25 | - name: access 26 | value: read-only -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.runwhen/templates/azure-aks-triage-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "AKS SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.name}} AKS SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} AKS health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.name}}-{{ "AKS SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = 
"hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the Microsoft Azure Provider 11 | provider "azurerm" { 12 | features {} 13 | } 14 | 15 | provider "azuread" {} -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-aks" 2 | cluster_name = "aks-cl-1" 3 | location = "East US" 4 | tags = { 5 | "env" : "test", 6 | "lifecycle" : "deleteme", 7 | "product" : "runwhen" 8 | } -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "East US" 8 | } 9 | 10 | variable "cluster_name" { 11 | type = string 12 | default = "aks-cl-1" 13 | } 14 | 15 | variable "tags" { 16 | type = map(string) 17 | } 18 | 19 | variable "sp_principal_id" { 20 | type = string 21 | } 22 | 23 | variable "tenant_id" { 24 | type = string 25 | } -------------------------------------------------------------------------------- /codebundles/azure-aks-triage/README.md: -------------------------------------------------------------------------------- 1 | # Azure AKS Cluster Triage 2 | This CodeBundle checks for AKS Cluster Health based on how Azure is reporting resource health, network configuration recommendations, activities that have occurred, and provisioning status of resources. 3 | 4 | ## Configuration 5 | 6 | The SLI & TaskSet require initialization to import necessary secrets, services, and user variables. 
The following variables should be set: 7 | - `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in 8 | - `AKS_CLUSTER`: The name of the AKS Cluster in the resource group to target with checks 9 | - `TIME_PERIOD_MINUTES`: The time window, in minutes, to look back for activities and events which may indicate issues. 10 | 11 | ## Notes 12 | 13 | This codebundle assumes the service principal authentication flow which is handled from the import secret Keyword. 14 | 15 | 16 | ## TODO 17 | - [ ] Add documentation -------------------------------------------------------------------------------- /codebundles/azure-apim-health/.runwhen/generation-rules/azure-apim-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_apimanagement_service 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-appgw-health 15 | qualifiers: ["resource", "resource_group"] 16 | baseTemplateName: azure-apim-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: azure-apim-health-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/azure-apim-health/.runwhen/templates/azure-apim-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "APIM SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.name}} APIM SLI Alert Workflow 12 | description: Start RunSession 
with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} APIM health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.name}}-{{ "APIM SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-apim-health/.test/terraform/README.md: -------------------------------------------------------------------------------- 1 | ## Infrastructure 2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory) 3 | 4 | ## Usage 5 | 6 | ### State management 7 | State is managed locally with `terraform.tfstate` and is gitignored. 
8 | 9 | ### Auth 10 | az login --use-device-code 11 | 12 | ### Requirements 13 | The following vars must exist: 14 | 15 | ``` 16 | export ARM_SUBSCRIPTION_ID=[] 17 | export AZ_TENANT_ID=[] 18 | export AZ_CLIENT_SECRET=[] 19 | export AZ_CLIENT_ID=[] 20 | export AZ_SECRET_ID=[] 21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) 22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID 23 | export TF_VAR_tenant_id=$AZ_TENANT_ID -------------------------------------------------------------------------------- /codebundles/azure-apim-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-apim-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~>3.0" 6 | } 7 | } 8 | required_version = ">=1.0" 9 | } 10 | 11 | provider "azurerm" { 12 | features {} 13 | } 14 | 15 | # Pull subscription info from the current CLI session 16 | data "azurerm_subscription" "current" {} 17 | 18 | # Pull tenant and user details from the current CLI session 19 | data "azurerm_client_config" "current" {} -------------------------------------------------------------------------------- /codebundles/azure-apim-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-apim-health" 2 | location = "Canada Central" 3 | tags = { 4 | "env" : "test", 5 | "lifecycle" : "deleteme", 6 | "product" : "runwhen" 7 | } 8 | codebundle = "azure-apim-health" 9 | codecollection = "rw-cli-codecollection" -------------------------------------------------------------------------------- 
/codebundles/azure-apim-health/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | description = "Name of the resource group to create/use." 4 | } 5 | 6 | variable "location" { 7 | type = string 8 | description = "Azure location for all resources." 9 | } 10 | 11 | variable "tags" { 12 | type = map(string) 13 | description = "Tags to apply to resources." 14 | default = {} 15 | } 16 | 17 | variable "sp_principal_id" { 18 | type = string 19 | description = "Client (service principal) ID with access to the resources." 20 | } 21 | 22 | variable "codebundle" { 23 | type = string 24 | description = "Base name for your resources." 25 | default = "example-bundle" 26 | } 27 | 28 | 29 | -------------------------------------------------------------------------------- /codebundles/azure-apim-health/README.md: -------------------------------------------------------------------------------- 1 | 2 | az login --use-device-code 3 | ## Test 1 4 | export APP_SERVICE_NAME=azure-apim-health-f1 5 | export AZ_RESOURCE_GROUP=azure-apim-health 6 | export APIM_NAME=azure-apim-health-apim 7 | export AZURE_RESOURCE_SUBSCRIPTION_ID=$ARM_SUBSCRIPTION_ID 8 | export AZURE_CONFIG_DIR=/var/tmp/runwhen/azure-apim-health/runbook.robot/.azure 9 | az login --use-device-code -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.runwhen/generation-rules/azure-appgateway-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_network_application_gateways 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-appgw-health 15 | qualifiers: ["resource", "resource_group"] 16 | 
baseTemplateName: azure-appgateway-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: azure-appgateway-health-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.runwhen/templates/azure-appgateway-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "App Gateway SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.name}} App Gateway SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} App Gateway health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.name}}-{{ "App Gateway SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.test/README.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | # Infrastructure Setup 4 | The terraform directory contains infrastructure used for testing. This infrastructure will build 2 app gateways, app services, app service plans. One of the app services is healthy, the other has a bad image. 
5 | 6 | # Local Development Testing 7 | 8 | 9 | ro sli.robot 10 | ro runbook.robot 11 | -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | tls = { 8 | source = "hashicorp/tls" 9 | version = "~> 4.0" 10 | } 11 | } 12 | } 13 | 14 | # Configure the Microsoft Azure Provider 15 | provider "azurerm" { 16 | features {} 17 | } 18 | 19 | provider "azuread" {} 20 | provider "tls" {} 21 | -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-appgateway-health" 2 | location = "Canada Central" 3 | tags = { 4 | "env" : "test", 5 | "lifecycle" : "deleteme", 6 | "product" : "runwhen" 7 | } -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "East US" 8 | } 9 | 10 | variable "tags" { 11 | type = map(string) 12 | } 13 | 14 | variable "sp_principal_id" { 15 | type = string 16 | } 17 | 18 | variable "tenant_id" { 19 | type = string 20 | } 21 | 22 | # Password for .pfx files (self-signed) 23 | 
variable "ssl_cert_password" { 24 | type = string 25 | default = "P@ssw0rd123!" 26 | } -------------------------------------------------------------------------------- /codebundles/azure-appgateway-health/README.md: -------------------------------------------------------------------------------- 1 | # Azure Application Gateway Health 2 | Checks key metrics for Azure Application Gateways and queries the health status of backend pools used by the gateway. 3 | 4 | 5 | ## Configuration 6 | 7 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 8 | 9 | - `AZ_USERNAME`: Service principal's client ID 10 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration 11 | - `AZ_TENANT`: The Azure tenancy ID 12 | - `AZ_SUBSCRIPTION`: The Azure subscription ID 13 | - `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in 14 | - `APPGATEWAY`: The name of the application gateway in the resource group to target with checks 15 | 16 | ## Notes 17 | 18 | This codebundle assumes the service principal authentication flow. 
19 | 20 | ## TODO 21 | - [ ] config best practices check 22 | - [ ] Add documentation -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.runwhen/generation-rules/azure-appservice-function-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_appservice_web_apps 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | - type: pattern 14 | pattern: "^functionapp(?:,.*)?$" 15 | properties: [kind] 16 | mode: substring 17 | slxs: 18 | - baseName: az-appsvc-function-health 19 | qualifiers: ["resource", "resource_group"] 20 | baseTemplateName: azure-appservice-function-health 21 | levelOfDetail: detailed 22 | outputItems: 23 | - type: slx 24 | - type: sli 25 | - type: runbook 26 | templateName: azure-appservice-function-health-taskset.yaml 27 | - type: workflow -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.runwhen/templates/azure-appservice-function-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "AppService Function App SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.name}} AppService Function App SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} AppService Function App health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 
| - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.name}}-{{ "AppService Function App SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.test/README.md: -------------------------------------------------------------------------------- 1 | 2 | az login --use-device-code 3 | ## Test 1 4 | export APP_SERVICE_NAME=azure-appservice-triage-b1 5 | export AZ_RESOURCE_GROUP=azure-appservice-triage 6 | 7 | ## Test 2 8 | export APP_SERVICE_NAME=azure-appservice-triage-f1 9 | export AZ_RESOURCE_GROUP=azure-appservice-triage -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.test/terraform/README.md: -------------------------------------------------------------------------------- 1 | ## Infrastructure 2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory) 3 | 4 | ## Usage 5 | 6 | ### State management 7 | State is managed locally with `terraform.tfstate` and is gitignored. 
8 | 9 | ### Auth 10 | az login --use-device-code 11 | 12 | ### Requirements 13 | The following vars must exist: 14 | 15 | ``` 16 | export ARM_SUBSCRIPTION_ID=[] 17 | export AZ_TENANT_ID=[] 18 | export AZ_CLIENT_SECRET=[] 19 | export AZ_CLIENT_ID=[] 20 | export AZ_SECRET_ID=[] 21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) 22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID 23 | export TF_VAR_tenant_id=$AZ_TENANT_ID -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | random = { 8 | source = "hashicorp/random" 9 | version = "~> 3.5" 10 | } 11 | } 12 | } 13 | 14 | # Configure the Microsoft Azure Provider 15 | provider "azurerm" { 16 | features {} 17 | } 18 | 19 | provider "azuread" {} -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-apps-func-hlth" 2 | location = "Canada Central" 3 | tags = { 4 | "env" : "test", 5 | "lifecycle" : "deleteme", 6 | "product" : "runwhen" 7 | } 8 | codebundle = "azure-apps-func-hlth" 9 | codecollection = "rw-cli-codecollection" 10 | -------------------------------------------------------------------------------- 
/codebundles/azure-appservice-functionapp-health/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "Canada Central" 8 | } 9 | 10 | variable "tags" { 11 | type = map(string) 12 | } 13 | 14 | variable "sp_principal_id" { 15 | type = string 16 | } 17 | 18 | variable "tenant_id" { 19 | type = string 20 | } 21 | 22 | variable "codebundle" { 23 | type = string 24 | } 25 | 26 | variable "codecollection" { 27 | type = string 28 | } -------------------------------------------------------------------------------- /codebundles/azure-appservice-functionapp-health/README.md: -------------------------------------------------------------------------------- 1 | # Azure App Service Triage 2 | Checks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found. 3 | 4 | ## Configuration 5 | 6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 7 | 8 | export APPSERVICE 9 | export AZ_RESOURCE_GROUP 10 | 11 | ## Notes 12 | 13 | This codebundle assumes the service principal authentication flow. 
14 | 15 | ## TODO 16 | - [ ] look for notable activities in list 17 | - [ ] config best practices check 18 | - [ ] Add documentation -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.runwhen/generation-rules/azure-appservice-webapp-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_appservice_web_apps 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | - type: pattern 14 | pattern: "^app(?:,.*)?$" 15 | properties: [kind] 16 | mode: substring 17 | slxs: 18 | - baseName: az-appsvc-web-health 19 | qualifiers: ["resource", "resource_group"] 20 | baseTemplateName: azure-appservice-webapp-health 21 | levelOfDetail: detailed 22 | outputItems: 23 | - type: slx 24 | - type: sli 25 | - type: runbook 26 | templateName: azure-appservice-webapp-health-taskset.yaml 27 | - type: workflow -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.runwhen/templates/azure-appservice-webapp-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "AppService Webapp SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.name}} AppService Webapp SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.name}} AppService WebApp health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | 
titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.name}}-{{ "AppService Webapp SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.test/README.md: -------------------------------------------------------------------------------- 1 | 2 | az login --use-device-code 3 | ## Test 1 4 | export APP_SERVICE_NAME=azure-appservice-triage-b1 5 | export AZ_RESOURCE_GROUP=azure-appservice-triage 6 | 7 | ## Test 2 8 | export APP_SERVICE_NAME=azure-appservice-triage-f1 9 | export AZ_RESOURCE_GROUP=azure-appservice-triage -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.test/terraform/README.md: -------------------------------------------------------------------------------- 1 | ## Infrastructure 2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory) 3 | 4 | ## Usage 5 | 6 | ### State management 7 | State is managed locally with `terraform.tfstate` and is gitignored. 
8 | 9 | ### Auth 10 | az login --use-device-code 11 | 12 | ### Requirements 13 | The following vars must exist: 14 | 15 | ``` 16 | export ARM_SUBSCRIPTION_ID=[] 17 | export AZ_TENANT_ID=[] 18 | export AZ_CLIENT_SECRET=[] 19 | export AZ_CLIENT_ID=[] 20 | export AZ_SECRET_ID=[] 21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) 22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID 23 | export TF_VAR_tenant_id=$AZ_TENANT_ID -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the Microsoft Azure Provider 11 | provider "azurerm" { 12 | features {} 13 | } 14 | 15 | provider "azuread" {} -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-appservice-triage" 2 | location = "Canada Central" 3 | tags = { 4 | "env" : "test", 5 | "lifecycle" : "deleteme", 6 | "product" : "runwhen" 7 | } 8 | codebundle = "azure-appservice-triage" 9 | codecollection = "rw-cli-codecollection" 10 | -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/.test/terraform/vars.tf: 
-------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "Canada Central" 8 | } 9 | 10 | variable "tags" { 11 | type = map(string) 12 | } 13 | 14 | variable "sp_principal_id" { 15 | type = string 16 | } 17 | 18 | variable "tenant_id" { 19 | type = string 20 | } 21 | 22 | variable "codebundle" { 23 | type = string 24 | } 25 | 26 | variable "codecollection" { 27 | type = string 28 | } -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-health/README.md: -------------------------------------------------------------------------------- 1 | # Azure App Service Triage 2 | Checks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found. 3 | 4 | ## Configuration 5 | 6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 7 | 8 | export APPSERVICE 9 | export AZ_RESOURCE_GROUP 10 | 11 | ## Notes 12 | 13 | This codebundle assumes the service principal authentication flow. 
14 | 15 | ## TODO 16 | - [ ] look for notable activities in list 17 | - [ ] config best practices check 18 | - [ ] Add documentation -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.runwhen/generation-rules/azure-appservice-webapp-ops.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: runwhen.com/v1 2 | # kind: GenerationRules 3 | # spec: 4 | # platform: azure 5 | # generationRules: 6 | # - resourceTypes: 7 | # - azure_appservice_web_apps 8 | # matchRules: 9 | # - type: pattern 10 | # pattern: ".+" 11 | # properties: [name] 12 | # mode: substring 13 | # - type: pattern 14 | # pattern: "^app(?:,.*)?$" 15 | # properties: [kind] 16 | # mode: substring 17 | # slxs: 18 | # - baseName: az-appsvc-webapp-ops 19 | # qualifiers: ["resource", "resource_group"] 20 | # baseTemplateName: azure-appservice-webapp-ops 21 | # levelOfDetail: basic 22 | # outputItems: 23 | # - type: slx 24 | # - type: runbook 25 | # templateName: azure-appservice-webapp-ops-taskset.yaml -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.test/README.md: -------------------------------------------------------------------------------- 1 | 2 | az login --use-device-code 3 | ## Test 1 4 | export APP_SERVICE_NAME=azure-appservice-triage-b1 5 | export AZ_RESOURCE_GROUP=azure-appservice-triage 6 | 7 | ## Test 2 8 | export APP_SERVICE_NAME=azure-appservice-triage-f1 9 | export AZ_RESOURCE_GROUP=azure-appservice-triage -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.test/terraform/README.md: -------------------------------------------------------------------------------- 1 | ## Infrastructure 2 | This will build out a simple linux web app service set in a dedicated resource group, and enables the configured SP to own those resources, which 
will be needed when testing discovery of this with RunWhen Local (through the Taskfile in the parent directory) 3 | 4 | ## Usage 5 | 6 | ### State management 7 | State is managed locally with `terraform.tfstate` and is gitignored. 8 | 9 | ### Auth 10 | az login --use-device-code 11 | 12 | ### Requirements 13 | The following vars must exist: 14 | 15 | ``` 16 | export ARM_SUBSCRIPTION_ID=[] 17 | export AZ_TENANT_ID=[] 18 | export AZ_CLIENT_SECRET=[] 19 | export AZ_CLIENT_ID=[] 20 | export AZ_SECRET_ID=[] 21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) 22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID 23 | export TF_VAR_tenant_id=$AZ_TENANT_ID -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the Microsoft Azure Provider 11 | provider "azurerm" { 12 | features {} 13 | } 14 | 15 | provider "azuread" {} -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-appservice-ops" 2 | location = "Canada Central" 3 | tags = { 4 | "env" : "test", 5 | "lifecycle" : "deleteme", 6 | "product" : "runwhen" 7 | } 8 | codebundle = "azure-appservice-ops" 9 | codecollection = "rw-cli-codecollection" 
-------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "Canada Central" 8 | } 9 | 10 | variable "tags" { 11 | type = map(string) 12 | } 13 | 14 | variable "sp_principal_id" { 15 | type = string 16 | } 17 | 18 | variable "tenant_id" { 19 | type = string 20 | } 21 | 22 | variable "codebundle" { 23 | type = string 24 | } 25 | 26 | variable "codecollection" { 27 | type = string 28 | } -------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` 3 | - Checks whether the plan supports deployment slots (Standard or Premium tier). 4 | - Lists all available slots. 5 | - If SOURCE_SLOT and TARGET_SLOT are not provided, it attempts to figure them out automatically, assuming: 6 | - The “production” slot is the default slot with "isSlot": false. 7 | - The non-production slot(s) have "isSlot": true. 8 | - If exactly one non-production slot exists, we set source to that slot and target to "production". 9 | - If there are multiple non-production slots, we fail unless the user specifies which ones to swap. 
-------------------------------------------------------------------------------- /codebundles/azure-appservice-webapp-ops/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/azure-kv-health/.runwhen/generation-rules/azure-kv-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_keyvault_keyvault 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-kv-health 15 | qualifiers: ["resource_group"] 16 | baseTemplateName: azure-kv-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: azure-kv-health-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/azure-kv-health/.runwhen/templates/azure-kv-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "Key Vault SLI Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{ match_resource.resource_group.name }} Key Vault SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{ match_resource.resource_group.name }} Key Vault health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - 
{{slx_name.split('--')[1]}} 25 | name: {{ match_resource.resource_group.name }}-{{ "Key Vault SLI Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/azure-kv-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-kv-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "4.18.0" 6 | } 7 | } 8 | } 9 | 10 | provider "azurerm" { 11 | features { 12 | key_vault { 13 | purge_soft_delete_on_destroy = true 14 | recover_soft_deleted_key_vaults = true 15 | } 16 | } 17 | } 18 | 19 | provider "azuread" {} -------------------------------------------------------------------------------- /codebundles/azure-kv-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-vm-triage" 2 | location = "East US" 3 | kv_name = "test-yoko" 4 | tags = { 5 | "env" : "test", 6 | "lifecycle" : "deleteme", 7 | "product" : "runwhen" 8 | } -------------------------------------------------------------------------------- /codebundles/azure-kv-health/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "East US" 8 | } 9 | 10 | 11 | variable "tags" { 12 | type = map(string) 13 | } 14 | 15 | variable "sp_principal_id" { 16 | type = string 17 | } 18 | 19 | variable "kv_name" { 20 | type = string 21 | } 
-------------------------------------------------------------------------------- /codebundles/azure-kv-health/README.md: -------------------------------------------------------------------------------- 1 | # Azure Key Vault Health 2 | This codebundle runs a suite of metrics checks for Key Vault in Azure. It runs the following checks: 3 | - Check Key Vault Availability 4 | - Check Key Vault Configuration 5 | - Check Expiring Key Vault Items (Keys, Secrets and Certificates) 6 | - Check Key Vault Logs for Issues 7 | - Check Key Vault Performance Metrics 8 | 9 | ## Configuration 10 | 11 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 12 | 13 | - `AZ_USERNAME`: Service principal's client ID 14 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration 15 | - `AZ_TENANT`: The Azure tenant ID 16 | - `AZ_SUBSCRIPTION`: The Azure subscription ID 17 | 18 | ## Testing 19 | See the .test directory for infrastructure test code. 
20 | 21 | ## Notes 22 | 23 | This codebundle assumes the service principal authentication flow -------------------------------------------------------------------------------- /codebundles/azure-kv-health/availability.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Prints, as JSON on stdout, the latest hourly average of the Availability metric 4 | # for every Key Vault in a resource group: 5 | # {"metrics":[{"kv_name":"<vault>","percentage":"<value or N/A>"}]} 6 | # 7 | # Environment: 8 | # AZURE_RESOURCE_SUBSCRIPTION_ID - subscription passed to the az commands 9 | # AZURE_RESOURCE_GROUP - resource group whose Key Vaults are listed 10 | 11 | subscription_id="$AZURE_RESOURCE_SUBSCRIPTION_ID" 12 | resource_group="$AZURE_RESOURCE_GROUP" 13 | 14 | # The window start is loop-invariant: compute it once and pass it quoted as a single argument (date -d is GNU date syntax). 15 | start_time=$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ) 16 | 17 | # Read vault names one per line instead of word-splitting and glob-expanding an unquoted command substitution. 18 | mapfile -t vaults < <(az keyvault list -g "$resource_group" --subscription "$subscription_id" --query "[].name" -o tsv) 19 | 20 | json_output='{"metrics":[' 21 | first=true 22 | 23 | for kv in "${vaults[@]}"; do 24 | availability=$(az monitor metrics list \ 25 | --resource "/subscriptions/$subscription_id/resourceGroups/$resource_group/providers/Microsoft.KeyVault/vaults/$kv" \ 26 | --metric Availability \ 27 | --aggregation average \ 28 | --interval PT1H \ 29 | --query "value[0].timeseries[0].data[-1].average" \ 30 | --start-time "$start_time" \ 31 | --output tsv) 32 | 33 | # Default to N/A if no data is returned 34 | availability=${availability:-"N/A"} 35 | 36 | # Append to JSON array, comma-separating every element after the first 37 | if [ "$first" = true ]; then 38 | first=false 39 | else 40 | json_output+=',' 41 | fi 42 | json_output+="{\"kv_name\":\"$kv\",\"percentage\":\"$availability\"}" 43 | done 44 | 45 | json_output+=']}' 46 | echo "$json_output" -------------------------------------------------------------------------------- /codebundles/azure-loadbalancer-triage/.runwhen/generation-rules/az-lb-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_network_load_balancers 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-lb-health 15 | levelOfDetail: basic 16 | qualifiers: [resource, resource_group] 17 | baseTemplateName: 
az-lb-health 18 | outputItems: 19 | - type: slx 20 | - type: runbook 21 | templateName: az-lb-health-taskset.yaml 22 | -------------------------------------------------------------------------------- /codebundles/azure-loadbalancer-triage/.runwhen/templates/az-lb-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/networking/10062-icon-service-Load-Balancers.svg 11 | alias: {{match_resource.name}} Azure Load Balancer Health 12 | asMeasuredBy: "Querying the Azure Load Balancer health for incidents or critical events." 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Ensure Azure Network Load Balancers are healthy. 19 | additionalContext: 20 | {% include "azure-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "azure-tags.yaml" ignore missing %} 24 | - name: service 25 | value: loadbalancer 26 | - name: access 27 | value: read-only -------------------------------------------------------------------------------- /codebundles/azure-loadbalancer-triage/README.md: -------------------------------------------------------------------------------- 1 | # Azure LoadBalancer Triage 2 | 3 | Queries the activity logs of internal loadbalancers (AKS ingress) objects in Azure and optionally inspects internal AKS ingress objects if available. 4 | 5 | ## Tasks 6 | `Health Check Internal Azure Load Balancer` 7 | 8 | ## Configuration 9 | The TaskSet requires initialization to import necessary secrets, services, and user variables. 
The following variables should be set: 10 | 11 | - `AZ_USERNAME`: Azure service account username secret used to authenticate. 12 | - `AZ_CLIENT_SECRET`: Azure service account client secret used to authenticate. 13 | - `AZ_TENANT`: Azure tenant ID used to authenticate to. 14 | - `AZ_HISTORY_RANGE`: The history range to inspect for incidents in the activity log, in hours. Defaults to 24 hours. 15 | 16 | ## Requirements 17 | - A kubeconfig with appropriate RBAC permissions to perform the desired command. 18 | 19 | ## TODO 20 | - [ ] Refine issues raised 21 | - [ ] Array support for issues 22 | - [ ] Look at cross az/kubectl for better triage 23 | - [ ] Add additional documentation. 24 | 25 | -------------------------------------------------------------------------------- /codebundles/azure-loadbalancer-triage/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/azure-servicebus-health/.runwhen/generation-rules/azure-servicebus-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_servicebus_namespaces 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-servicebus-health 15 | qualifiers: ["resource", "resource_group"] 16 | baseTemplateName: az-servicebus-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: az-servicebus-health-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/azure-servicebus-health/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | 
backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-servicebus-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | version = "~> 4.7.0" 6 | } 7 | tls = { 8 | source = "hashicorp/tls" 9 | version = "~> 4.0" 10 | } 11 | } 12 | } 13 | 14 | # Configure the Microsoft Azure Provider 15 | provider "azurerm" { 16 | features {} 17 | } 18 | 19 | provider "azuread" {} 20 | provider "tls" {} 21 | -------------------------------------------------------------------------------- /codebundles/azure-servicebus-health/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-servicebus-health" 2 | location = "Canada Central" 3 | secondary_location = "East US" 4 | tags = { 5 | "env" : "test", 6 | "lifecycle" : "deleteme", 7 | "product" : "runwhen" 8 | } -------------------------------------------------------------------------------- /codebundles/azure-servicebus-health/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "East US" 8 | } 9 | 10 | variable "secondary_location" { 11 | type = string 12 | default = "East US" 13 | } 14 | 15 | variable "tags" { 16 | type = map(string) 17 | } 18 | 19 | variable "sp_principal_id" { 20 | type = string 21 | } 22 | 23 | variable "tenant_id" { 24 | type = string 25 | } 26 | 27 | # Password for .pfx files (self-signed) 28 | variable "ssl_cert_password" { 29 | type = string 30 | default = "P@ssw0rd123!" 
31 | } 32 | -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.runwhen/generation-rules/azure-vmss-triage.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: azure 5 | generationRules: 6 | - resourceTypes: 7 | - azure_compute_virtual_machine_scale_sets 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: az-vmss-triage 15 | qualifiers: ["resource", "resource_group"] 16 | baseTemplateName: azure-vmss-triage 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: azure-vmss-triage-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.runwhen/templates/azure-vmss-triage-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/compute/10034-icon-service-VM-Scale-Sets.svg 11 | alias: Azure VM Scale Set {{match_resource.resource.name}} 12 | asMeasuredBy: Composite health score of resources & activities. 13 | configProvided: 14 | - name: SLX_PLACEHOLDER 15 | value: SLX_PLACEHOLDER 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: VM Scale Set should be available. 
19 | additionalContext: 20 | {% include "azure-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "azure-tags.yaml" ignore missing %} 24 | - name: service 25 | value: vmss 26 | - name: access 27 | value: read-only -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.test/terraform/README.md: -------------------------------------------------------------------------------- 1 | ## Infrastructure 2 | This builds out a simple VM scale set in a dedicated resource group and enables the configured SP to own those resources, which is needed when testing discovery of this infrastructure with RunWhen Local (through the Taskfile in the parent directory). 3 | 4 | ## Usage 5 | 6 | ### State management 7 | State is managed locally with `terraform.tfstate` and is gitignored. 8 | 9 | ### Auth 10 | az login --use-device-code 11 | 12 | ### Requirements 13 | The following vars must exist: 14 | 15 | ``` 16 | export ARM_SUBSCRIPTION_ID=[] 17 | export AZ_TENANT_ID=[] 18 | export AZ_CLIENT_SECRET=[] 19 | export AZ_CLIENT_ID=[] 20 | export AZ_SECRET_ID=[] 21 | export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) 22 | export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID 23 | export TF_VAR_tenant_id=$AZ_TENANT_ID 24 | ``` -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.test/terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" { 3 | path = "terraform.tfstate" 4 | } 5 | } -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | source = "hashicorp/azurerm" 5 | } 6 | } 7 | } 8 
| 9 | # Configure the Microsoft Azure Provider 10 | provider "azurerm" { 11 | features {} 12 | } 13 | 14 | provider "azuread" {} -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.test/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | resource_group = "azure-vm-triage" 2 | location = "East US" 3 | tags = { 4 | "env" : "test", 5 | "lifecycle" : "deleteme", 6 | "product" : "runwhen" 7 | } -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/.test/terraform/vars.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group" { 2 | type = string 3 | } 4 | 5 | variable "location" { 6 | type = string 7 | default = "East US" 8 | } 9 | 10 | 11 | variable "tags" { 12 | type = map(string) 13 | } 14 | 15 | variable "sp_principal_id" { 16 | type = string 17 | } -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/README.md: -------------------------------------------------------------------------------- 1 | # Azure Virtual Machine Scale Set Triage 2 | This codebundle runs a suite of metrics checks for a VM Scale Set in Azure. It fetches activities and the current configuration which is added to a report for review at that point in time. 3 | 4 | ## Configuration 5 | 6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. 
The following variables should be set: 7 | 8 | - `AZ_USERNAME`: Service principal's client ID 9 | - `AZ_SECRET_VALUE`: The credential secret value from the app registration 10 | - `AZ_TENANT`: The Azure tenancy ID 11 | - `AZ_SUBSCRIPTION`: The Azure subscription ID 12 | - `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in 13 | - `VMSCALESET`: The name of the VM Scale Set in the resource group to target with checks 14 | 15 | ## Notes 16 | 17 | This codebundle assumes the service principal authentication flow 18 | 19 | ## TODO 20 | - [ ] remote exec functionality 21 | - [ ] look for notable activities in list 22 | - [ ] config best practices check 23 | - [ ] Add documentation -------------------------------------------------------------------------------- /codebundles/azure-vmss-triage/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/generation-rules/http-ok-tls.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - ingress 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | - type: pattern 15 | pattern: ".+" 16 | properties: [spec/tls/hosts] 17 | mode: substring 18 | slxs: 19 | - baseName: http-ok-tls-test 20 | qualifiers: ["resource", "namespace", "cluster"] 21 | baseTemplateName: http-ok-tls 22 | levelOfDetail: basic 23 | outputItems: 24 | - type: slx 25 | - type: sli 26 | - type: slo 27 | - type: runbook 28 | templateName: http-ok-tls-taskset.yaml 29 | -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/generation-rules/http-ok.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - ingress 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | - type: not 15 | predicate: 16 | type: exists 17 | path: spec/tls/hosts 18 | slxs: 19 | - baseName: http-ok 20 | qualifiers: ["resource", "namespace", "cluster"] 21 | baseTemplateName: http-ok 22 | levelOfDetail: basic 23 | outputItems: 24 | - type: slx 25 | - type: sli 26 | - type: slo 27 | - type: runbook 28 | templateName: http-ok-taskset.yaml 29 | -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 1 17 | operand: eq -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | alias: {{match_resource.resource.metadata.name}} Public HTTP OK 11 | asMeasuredBy: HTTP 200 returned within the desired latency. 
12 | configProvided: 13 | - name: OBJECT_NAME 14 | value: {{match_resource.resource.metadata.name}} 15 | icon: Cloud 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: HTTP Ingress resources should respond with HTTP 200 in under 1s 19 | additionalContext: 20 | namespace: "{{match_resource.resource.metadata.namespace}}" 21 | labelMap: "{{match_resource.resource.metadata.labels}}" 22 | cluster: "{{ cluster.name }}" 23 | context: "{{ cluster.context }}" -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: {{default_location}} 11 | codeBundle: 12 | {% if repo_url %} 13 | repoUrl: {{repo_url}} 14 | {% else %} 15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 16 | {% endif %} 17 | {% if ref %} 18 | ref: {{ref}} 19 | {% else %} 20 | ref: main 21 | {% endif %} 22 | pathToRobot: codebundles/curl-http-ok/runbook.robot 23 | configProvided: 24 | - name: URL 25 | value: http://{{match_resource.resource.spec.rules[0].host}} 26 | - name: TARGET_LATENCY 27 | value: '1.2' 28 | - name: DESIRED_RESPONSE_CODE 29 | value: '200' 30 | - name: OWNER_DETAILS 31 | value: '{"name":"{{match_resource.resource.metadata.name}}", "kind":"Ingress","namespace":"{{match_resource.resource.metadata.namespace}}"}' 32 | secretsProvided: [] 33 | -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-aks-public-loadbalancer-ext-dns-tls-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | 
name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | alias: {{match_resource.resource.metadata.name}} Public HTTP OK (Azure LB) 11 | asMeasuredBy: HTTP 200 returned within the desired latency. 12 | configProvided: 13 | - name: OBJECT_NAME 14 | value: {{match_resource.resource.metadata.name}} 15 | icon: Cloud 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: HTTP AKS LoadBalancer and Service resources should respond with HTTP 200 in under 1s 19 | additionalContext: 20 | namespace: "{{match_resource.resource.metadata.namespace}}" 21 | labelMap: "{{match_resource.resource.metadata.labels}}" 22 | cluster: "{{ cluster.name }}" 23 | context: "{{ cluster.context }}" -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 1 17 | operand: eq -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | alias: {{match_resource.resource.metadata.name}} Public HTTP OK 11 | 
asMeasuredBy: HTTP 200 returned within the desired latency. 12 | configProvided: 13 | - name: OBJECT_NAME 14 | value: {{match_resource.resource.metadata.name}} 15 | icon: Cloud 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: HTTP Ingress resources should respond with HTTP 200 in under 1.2s 19 | additionalContext: 20 | namespace: "{{match_resource.resource.metadata.namespace}}" 21 | labelMap: "{{match_resource.resource.metadata.labels}}" 22 | cluster: "{{ cluster.name }}" 23 | context: "{{ cluster.context }}" -------------------------------------------------------------------------------- /codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: {{default_location}} 11 | codeBundle: 12 | {% if repo_url %} 13 | repoUrl: {{repo_url}} 14 | {% else %} 15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 16 | {% endif %} 17 | {% if ref %} 18 | ref: {{ref}} 19 | {% else %} 20 | ref: main 21 | {% endif %} 22 | pathToRobot: codebundles/curl-http-ok/runbook.robot 23 | configProvided: 24 | - name: URL 25 | value: https://{{match_resource.resource.spec.tls[0].hosts[0]}} 26 | - name: TARGET_LATENCY 27 | value: '1.2' 28 | - name: DESIRED_RESPONSE_CODE 29 | value: '200' 30 | - name: OWNER_DETAILS 31 | value: '{"name":"{{match_resource.resource.metadata.name}}", "kind":"Ingress","namespace":"{{match_resource.resource.metadata.namespace}}"}' 32 | secretsProvided: [] 33 | -------------------------------------------------------------------------------- /codebundles/gcloud-log-inspection/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | 
-------------------------------------------------------------------------------- /codebundles/gcloud-node-preempt/.runwhen/generation-rules/gcloud-node-preempt.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: gcp 5 | generationRules: 6 | - resourceTypes: 7 | - gcp_compute_instances 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [scheduling/preemptible] 12 | mode: substring 13 | slxs: 14 | - baseName: node-preempt 15 | qualifiers: ["project"] 16 | baseTemplateName: gcloud-node-preempt 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: slo 22 | - type: runbook 23 | templateName: gcloud-node-preempt-taskset.yaml 24 | -------------------------------------------------------------------------------- /codebundles/gcloud-node-preempt/.runwhen/templates/gcloud-node-preempt-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 1 17 | operand: lt -------------------------------------------------------------------------------- /codebundles/gcloud-node-preempt/.runwhen/templates/gcloud-node-preempt-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: 
{{default_location}} 11 | description: Counts the total number of nodes undergoing a preempt event. 12 | codeBundle: 13 | {% if repo_url %} 14 | repoUrl: {{repo_url}} 15 | {% else %} 16 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 17 | {% endif %} 18 | {% if ref %} 19 | ref: {{ref}} 20 | {% else %} 21 | ref: main 22 | {% endif %} 23 | pathToRobot: codebundles/gcloud-node-preempt/runbook.robot 24 | intervalStrategy: intermezzo 25 | intervalSeconds: 300 26 | configProvided: 27 | - name: GCP_PROJECT_ID 28 | value: {{match_resource.resource.project_id}} 29 | - name: AGE 30 | value: '30' 31 | secretsProvided: 32 | - name: gcp_credentials_json 33 | workspaceKey: {{custom.gcp_ops_suite_sa}} -------------------------------------------------------------------------------- /codebundles/gcloud-node-preempt/README.md: -------------------------------------------------------------------------------- 1 | # gcloud Node Preempt List 2 | This code checks if any GCP (Google Cloud Platform) nodes have an active preempt operation. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information. 3 | 4 | 5 | ## SLI 6 | The SLI lists all preempt node operations that have a status that does not match "DONE", counts the total nodes in this state, and pushes the metric. 
7 | 8 | ## TaskSet 9 | The Taskset lists all preempt node operations that have a status that does not match "DONE" and returns the following details in json format: 10 | 11 | - startTime 12 | - targetLink 13 | - statusMessage 14 | - progress 15 | - zone 16 | - selfLink 17 | 18 | 19 | ## Requirements 20 | The following permissions are required on the GCP service account used with the gcloud utility: 21 | 22 | - 'compute.globalOperations.list' -------------------------------------------------------------------------------- /codebundles/gcp-bucket-health/.runwhen/generation-rules/gcp-bucket-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: gcp 5 | generationRules: 6 | - resourceTypes: 7 | - gcp_storage_buckets 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: gcp-bucket-health 15 | qualifiers: ["project"] 16 | baseTemplateName: gcp-bucket-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: slo 22 | - type: runbook 23 | templateName: gcp-bucket-health-taskset.yaml 24 | -------------------------------------------------------------------------------- /codebundles/gcp-bucket-health/.runwhen/templates/gcp-bucket-health-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 1 17 | operand: eq 
-------------------------------------------------------------------------------- /codebundles/gcp-bucket-health/README.md: -------------------------------------------------------------------------------- 1 | # GCP Bucket Health 2 | This code checks if any GCP (Google Cloud Platform) buckets are unhealthy, focusing on: 3 | - Utilization (with a user-defined threshold for issue/alert generation) 4 | - Security Configuration (with a user-defined threshold on when to generate issues/alerts for publicly accessible buckets) 5 | 6 | 7 | ## SLI 8 | The SLI: 9 | - counts the number of buckets whose utilization is above the user-defined threshold 10 | - counts the number of publicly accessible buckets above the user-defined threshold 11 | 12 | ## TaskSet 13 | The TaskSet provides the following tasks: 14 | 15 | - Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}` 16 | - Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report 17 | - Check GCP Bucket Security Configuration for `${PROJECT_IDS}` 18 | 19 | ## Requirements 20 | The following roles are useful on the GCP service account used with the gcloud utility: 21 | 22 | - Viewer 23 | - Security Reviewer 24 | 25 | ## TODO 26 | Update required GCP SA permissions. 
-------------------------------------------------------------------------------- /codebundles/gcp-cloud-function-health/.runwhen/generation-rules/gcp-cloud-function-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: gcp 5 | generationRules: 6 | - resourceTypes: 7 | - gcp_functions_functions 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: gcp-function-health 15 | qualifiers: ["project"] 16 | baseTemplateName: gcp-cloud-function-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: slo 22 | - type: runbook 23 | templateName: gcp-cloud-function-health-taskset.yaml 24 | -------------------------------------------------------------------------------- /codebundles/gcp-cloud-function-health/.runwhen/templates/gcp-cloud-function-health-sli.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelIndicator 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | displayUnitsLong: Number 11 | displayUnitsShort: '#' 12 | locations: 13 | - {{default_location}} 14 | description: Measures ____ 15 | codeBundle: 16 | {% if repo_url %} 17 | repoUrl: {{repo_url}} 18 | {% else %} 19 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 20 | {% endif %} 21 | {% if ref %} 22 | ref: {{ref}} 23 | {% else %} 24 | ref: main 25 | {% endif %} 26 | pathToRobot: codebundles/gcp-cloud-function-health/sli.robot 27 | intervalStrategy: intermezzo 28 | intervalSeconds: 300 29 | configProvided: 30 | - name: GCP_PROJECT_ID 31 | value: {{match_resource.resource.project_id}} 32 | secretsProvided: 33 | - name: gcp_credentials_json 34 | workspaceKey: 
{{custom.gcp_ops_suite_sa}} -------------------------------------------------------------------------------- /codebundles/gcp-cloud-function-health/.runwhen/templates/gcp-cloud-function-health-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 0 17 | operand: eq -------------------------------------------------------------------------------- /codebundles/gcp-cloud-function-health/.runwhen/templates/gcp-cloud-function-health-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: {{default_location}} 11 | description: Runs tasks to triage unhealthy GCP Cloud Functions 12 | codeBundle: 13 | {% if repo_url %} 14 | repoUrl: {{repo_url}} 15 | {% else %} 16 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 17 | {% endif %} 18 | {% if ref %} 19 | ref: {{ref}} 20 | {% else %} 21 | ref: main 22 | {% endif %} 23 | pathToRobot: codebundles/gcp-cloud-function-health/runbook.robot 24 | intervalStrategy: intermezzo 25 | intervalSeconds: 300 26 | configProvided: 27 | - name: GCP_PROJECT_ID 28 | value: {{match_resource.resource.project_id}} 29 | secretsProvided: 30 | - name: gcp_credentials_json 31 | workspaceKey: {{custom.gcp_ops_suite_sa}} -------------------------------------------------------------------------------- 
/codebundles/gcp-cloud-function-health/README.md: -------------------------------------------------------------------------------- 1 | # GCP Cloud Function Health 2 | This code checks if any GCP (Google Cloud Platform) cloud functions are unhealthy. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information. 3 | 4 | > Note: Only cloud functions v1 is supported at this time for automatic discovery with the RunWhen Local Discovery Process. The tasks will support either generation. 5 | 6 | ## SLI 7 | The SLI counts the number of cloud functions that are in a "FAILED" state and pushes the metric. 8 | 9 | ## TaskSet 10 | The TaskSet provides the following tasks: 11 | 12 | - List Unhealthy Cloud Functions in GCP Project 13 | - Get Error Logs for Unhealthy Cloud Functions in GCP Project 14 | 15 | ## Requirements 16 | The following permissions are required on the GCP service account used with the gcloud utility: 17 | 18 | - `cloudfunctions.functions.get` 19 | - `cloudfunctions.functions.list` -------------------------------------------------------------------------------- /codebundles/gh-actions-artifact-analysis/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Artifact Analysis 2 | This codebundle is highly configurable and integrates with GitHub Actions and workflow artifacts. It downloads a specified artifact from the last workflow run and analyzes the artifact with a user provided command (typically using linux / bash tools like jq). 3 | 4 | ## SLI 5 | This SLI downloads the artifact from the latest run of the GitHub Actions workflow, runs the analysis command (which must result in a metric), and pushes the metric to the RunWhen Platform. 6 | 7 | ## TaskSet 8 | This TaskSet downloads the artifact from the latest GitHub Actions workflow run, executes the analysis command and adds the details to the report.
It can also generate Issues if: 9 | - a user specified string is found in the report output 10 | - the latest run didn't complete successfully 11 | - the latest run is older than the desired time period ($PERIOD_HOURS) 12 | -------------------------------------------------------------------------------- /codebundles/gh-actions-artifact-analysis/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/gh-actions-health/.runwhen/generation-rules/gh-actions-health.yaml: -------------------------------------------------------------------------------- 1 | # We do not currently support github as a platform type, but this is the template for how we would do it if we did. 2 | # apiVersion: runwhen.com/v1 3 | # kind: GenerationRules 4 | # spec: 5 | # platform: github 6 | # generationRules: 7 | # - resourceTypes: 8 | # - github_organizations 9 | # - github_repository 10 | # matchRules: 11 | # - type: pattern 12 | # pattern: ".+" 13 | # properties: [name] 14 | # mode: substring 15 | # slxs: 16 | # - baseName: gh-actions-health 17 | # qualifiers: ["resource"] 18 | # baseTemplateName: gh-actions-health 19 | # levelOfDetail: basic 20 | # outputItems: 21 | # - type: slx 22 | # - type: sli 23 | # - type: runbook 24 | # templateName: gh-actions-health-taskset.yaml -------------------------------------------------------------------------------- /codebundles/gh-actions-health/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] -------------------------------------------------------------------------------- /codebundles/gke-cluster-health/.runwhen/generation-rules/gke-cluster-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: gcp 5 | generationRules: 6 | - 
resourceTypes: 7 | - gcp_container_clusters 8 | matchRules: 9 | - type: pattern 10 | pattern: ".+" 11 | properties: [name] 12 | mode: substring 13 | slxs: 14 | - baseName: gke-cluster-health 15 | qualifiers: ["project"] 16 | baseTemplateName: gke-cluster-health 17 | levelOfDetail: basic 18 | outputItems: 19 | - type: slx 20 | - type: sli 21 | - type: runbook 22 | templateName: gke-cluster-health-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/gke-cluster-health/.runwhen/templates/gke-cluster-health-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: {{default_location}} 11 | description: Runs tasks validate GKE Cluster health 12 | codeBundle: 13 | {% if repo_url %} 14 | repoUrl: {{repo_url}} 15 | {% else %} 16 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 17 | {% endif %} 18 | {% if ref %} 19 | ref: {{ref}} 20 | {% else %} 21 | ref: main 22 | {% endif %} 23 | pathToRobot: codebundles/gke-cluster-health/runbook.robot 24 | configProvided: 25 | - name: GCP_PROJECT_ID 26 | value: {{match_resource.resource.project_id}} 27 | secretsProvided: 28 | - name: gcp_credentials_json 29 | workspaceKey: {{custom.gcp_ops_suite_sa}} -------------------------------------------------------------------------------- /codebundles/gke-cluster-health/.runwhen/templates/gke-cluster-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "GKE Health Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include 
"common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.project_id}} GKE Health Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for GKE clusters in {{match_resource.resource.project_id}} 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.project_id}}-{{ "GKE Health Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/gke-cluster-health/.test/README.md: -------------------------------------------------------------------------------- 1 | export GCP_PROJECT_ID="" 2 | export RW_FROM_FILE='{"gcp_credentials_json":"/home/runwhen/codecollection/auth/svc.json"}' 3 | export CLOUDSDK_CORE_PROJECT=$GCP_PROJECT_ID 4 | -------------------------------------------------------------------------------- /codebundles/jenkins-health/.runwhen/generation-rules/jenkins-instance-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: aws 5 | generationRules: 6 | - resourceTypes: 7 | - ec2_instance 8 | matchRules: 9 | - type: and 10 | matches: 11 | - type: pattern 12 | pattern: "jenkins-server" 13 | properties: [tag-values] 14 | mode: substring 15 | - type: pattern 16 | pattern: "running" 17 | properties: [state/Name] 18 | mode: substring 19 | 20 | slxs: 21 | - baseName: jenkins-instance-health 22 | levelOfDetail: detailed 23 | qualifiers: ["resource"] 24 | baseTemplateName: jenkins-instance-health 25 | outputItems: 26 | - type: slx 27 | - type: sli 28 | - type: runbook 29 | templateName: jenkins-instance-health-taskset.yaml 30 | 
-------------------------------------------------------------------------------- /codebundles/jenkins-health/.runwhen/templates/jenkins-instance-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/Jenkins_logo.svg 11 | alias: {{match_resource.resource.tags.Name}} Instance Health 12 | # alias: Jenkins Health 13 | asMeasuredBy: The number of failed Jenkins builds. 14 | configProvided: 15 | - name: SLX_PLACEHOLDER 16 | value: SLX_PLACEHOLDER 17 | owners: 18 | - {{workspace.owner_email}} 19 | statement: The number of failed Jenkins builds should be zero. 20 | additionalContext: [] -------------------------------------------------------------------------------- /codebundles/jenkins-health/.runwhen/templates/jenkins-instance-health-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: {{default_location}} 11 | codeBundle: 12 | {% if repo_url %} 13 | repoUrl: {{repo_url}} 14 | {% else %} 15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 16 | {% endif %} 17 | {% if ref %} 18 | ref: {{ref}} 19 | {% else %} 20 | ref: main 21 | {% endif %} 22 | pathToRobot: codebundles/jenkins-health/runbook.robot 23 | configProvided: 24 | - name: JENKINS_URL 25 | value: {{custom.jenkins_url}} 26 | secretsProvided: 27 | - name: JENKINS_USERNAME 28 | workspaceKey: {{custom.jenkins_username}} 29 | - name: JENKINS_TOKEN 30 | workspaceKey: {{custom.jenkins_token}} 31 | 
-------------------------------------------------------------------------------- /codebundles/jenkins-health/.test/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = "us-west-2" # Replace with your desired region 3 | } -------------------------------------------------------------------------------- /codebundles/jenkins-health/README.md: -------------------------------------------------------------------------------- 1 | # AWS Jenkins Health 2 | 3 | This CodeBundle monitors and evaluates the health of Jenkins using the Jenkins REST API. 4 | 5 | ## SLI 6 | The SLI produces a score of 0 (bad), 1 (good), or a value in between. This score is generated by capturing the following: 7 | - Check if Jenkins instance is reachable and responding (endpoint) 8 | - Check For Failed Build Logs in Jenkins 9 | - Check For Long Running Builds in Jenkins 10 | - Check For Long Queued Builds in Jenkins 11 | - Check Jenkins Executor Utilization 12 | 13 | ## TaskSet 14 | Similar to the SLI, but produces a report on the specific Jenkins APIs and raises issues for each Jenkins check that requires attention. 15 | 16 | ## Required Configuration 17 | 18 | ``` 19 | export JENKINS_URL="" 20 | export JENKINS_USERNAME="" 21 | export JENKINS_TOKEN="" 22 | ``` 23 | 24 | ## Testing 25 | See the `.test` directory for infrastructure test code.
-------------------------------------------------------------------------------- /codebundles/k8s-app-troubleshoot/.runwhen/templates/k8s-app-troubleshoot-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 1 17 | operand: eq -------------------------------------------------------------------------------- /codebundles/k8s-app-troubleshoot/.runwhen/templates/k8s-app-troubleshoot-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/gcp/cloud_load_balancing/cloud_load_balancing.svg 11 | alias: {{match_resource.resource.metadata.name}} Application Error Monitor 12 | asMeasuredBy: The number of errors and parsable exceptions in the application logs. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: The application should not be throwing exceptions. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/.runwhen/generation-rules/k8s-deployment-logs-health.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: runwhen.com/v1 2 | # kind: GenerationRules 3 | # spec: 4 | # generationRules: 5 | # - resourceTypes: 6 | # - deployment 7 | # matchRules: 8 | # - type: pattern 9 | # pattern: ".+" 10 | # properties: [name] 11 | # mode: substring 12 | # slxs: 13 | # - baseName: depl-logs-health 14 | # levelOfDetail: detailed 15 | # qualifiers: ["resource", "namespace", "cluster"] 16 | # baseTemplateName: k8s-deployment-logs-health 17 | # outputItems: 18 | # - type: slx 19 | # # - type: sli 20 | # - type: runbook 21 | # templateName: k8s-deployment-logs-health-taskset.yaml 22 | # # - type: workflow 23 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/.runwhen/generation-rules/k8s-statefulset-logs-health.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: runwhen.com/v1 2 | # kind: GenerationRules 3 | # spec: 4 | # generationRules: 5 | # - resourceTypes: 6 | # - statefulSet 7 | # matchRules: 8 | # - type: pattern 9 | # pattern: ".+" 10 | # properties: [name] 11 | # mode: substring 12 | # slxs: 13 | # - baseName: ss-logs-health 14 | # levelOfDetail: detailed 15 | # qualifiers: ["resource", "namespace", "cluster"] 16 | # baseTemplateName: k8s-ss-logs-health 17 | # outputItems: 18 | # - type: slx 19 | # # - type: sli 20 | # - type: runbook 21 | # templateName: k8s-ss-logs-health-taskset.yaml 22 | 
-------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/.runwhen/templates/k8s-deployment-logs-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "Deployment Log Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.metadata.name}} Deployment Log SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.metadata.name}} deployment log health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.metadata.name}}-{{ "Deployment Log Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/.runwhen/templates/k8s-ss-logs-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg 11 | alias: {{match_resource.resource.metadata.name}} StatefulSet Log Health Check 12 | asMeasuredBy: Error logs, stack traces, connection failures, etc. 
13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Pods for {{match_resource.resource.metadata.name}} StatefulSet should have error free logs. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/.runwhen/templates/k8s-statefulset-logs-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "StatefulSet Log Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.metadata.name}} StatefulSet Log SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{match_resource.resource.metadata.name}} statefulset log health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.metadata.name}}-{{ "StatefulSet Log Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/ignore_patterns.json: -------------------------------------------------------------------------------- 1 | { 2 | "patterns": [ 3 | { 4 | "match": "connection closed before message completed", 5 | "category": 
"Ignore", 6 | "explanation": "Normal closures" 7 | }, 8 | { 9 | "match": "server idle timeout", 10 | "category": "Ignore", 11 | "explanation": "Normal closures" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_application_restarts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_auth_failures.sh: -------------------------------------------------------------------------------- 1 | # #!/bin/bash 2 | 3 | python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_connection_failures.sh: -------------------------------------------------------------------------------- 1 | # #!/bin/bash 2 | 3 | CATEGORIES=${CATEGORIES} python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_error_logs.sh: -------------------------------------------------------------------------------- 1 | # #!/bin/bash 2 | 3 | python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_null_pointer_exceptions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python3 scan_logs.py 3 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_resource_warnings.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_service_dependency_failures.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-application-log-health/scan_timeout_errors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3 scan_logs.py 4 | -------------------------------------------------------------------------------- /codebundles/k8s-argocd-application-health/.runwhen/generation-rules/k8s-argocd-application-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - applications.argoproj.io 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: argocd-app 14 | qualifiers: ["resource", "namespace", "cluster"] 15 | baseTemplateName: k8s-argocd-application-health 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-argocd-application-health-cli-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-argocd-application-health/.runwhen/templates/k8s-argocd-application-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: 
https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/argo-icon-color.svg 11 | alias: {{match_resource.resource.metadata.name}} ArgoCD Application Health 12 | asMeasuredBy: The sync status of the ArgoCD application object. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Resources for {{match_resource.resource.metadata.namespace}} should be synced in a healthy state. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-argocd-application-health/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes ArgoCD Application Health 2 | This codebundle is used to help measure and troubleshoot the health of an ArgoCD managed application. 3 | 4 | ## TaskSet 5 | This taskset collects information and runs general troubleshooting checks against argocd application objects within a namespace. 
6 | 7 | Example configuration for an application in which the ArgoCD Application object resides in the same namespace as the resources themselves: 8 | ``` 9 | export DISTRIBUTION=Kubernetes 10 | export CONTEXT=cluster-1 11 | export APPLICATION=otel-demo 12 | export APPLICATION_TARGET_NAMESPACE=otel-demo 13 | export APPLICATION_APP_NAMESPACE=otel-demo 14 | export ERROR_PATTERN="Quota|Error|Exception" 15 | ``` 16 | 17 | ## TODO 18 | - [ ] Try support for list of applications in conjunction with single application 19 | - [ ] Add documentation 20 | - [ ] Add issues 21 | -------------------------------------------------------------------------------- /codebundles/k8s-argocd-helm-health/.runwhen/generation-rules/k8s-argocd-helm-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - applications.argoproj.io 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: argocd-helm 14 | qualifiers: ["resource", "namespace", "cluster"] 15 | baseTemplateName: k8s-argocd-helm-health 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-argocd-helm-health-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-argocd-helm-health/.runwhen/templates/k8s-argocd-helm-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/argo-icon-color.svg 11 | alias: {{match_resource.resource.metadata.name}} ArgoCD Helm Health 12 | 
asMeasuredBy: The sync status of the ArgoCD Helm releases. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Helm releases for {{match_resource.resource.metadata.namespace}} should be synced and versioned aligned. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-argocd-helm-health/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes ArgoCD Helm Health 2 | This codebundle is used to help measure and troubleshoot the health of ArgoCD-managed Helm deployments. 3 | 4 | ## TaskSet 5 | This taskset collects information and runs general troubleshooting checks against ArgoCD Helm application objects within a namespace.
6 | 7 | Example configuration for an application in which the ArgoCD Application object resides in the same namespace as the resources themselves: 8 | ``` 9 | export DISTRIBUTION=Kubernetes 10 | export CONTEXT=cluster-1 11 | export NAMESPACE=otel-demo 12 | export RESOURCE_NAME="applications.argoproj.io" 13 | ``` 14 | 15 | ## TODO 16 | - [ ] Try support for list of applications in conjunction with single application 17 | - [ ] Add documentation 18 | - [ ] Add issues 19 | -------------------------------------------------------------------------------- /codebundles/k8s-artifactory-health/.runwhen/generation-rules/k8s-artifactory.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - stateful_set 7 | matchRules: 8 | - type: pattern 9 | pattern: "artifactory" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: arti-health 14 | levelOfDetail: basic 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-artifactory-healthcheck 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-artifactory-healthcheck-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-artifactory-health/.runwhen/templates/k8s-artifactory-healthcheck-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/jfrog-icon.svg 11 | alias: {{namespace.name}} Artifactory Health 12 | asMeasuredBy: The availablity reported by the artifactory http endpoints. 
13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Artifactory services should be healthy and available. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-certmanager-healthcheck/.runwhen/generation-rules/k8s-certmanager-certificates-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - certificates.cert-manager.io 7 | matchRules: 8 | - type: pattern 9 | pattern: "." 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: cert-health 14 | qualifiers: ["namespace", "cluster"] 15 | baseTemplateName: k8s-certmanager-certificate-health 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | - type: slo 21 | - type: runbook 22 | templateName: k8s-certmanager-certificate-health-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/k8s-certmanager-healthcheck/.runwhen/templates/k8s-certmanager-certificate-health-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | 
sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 0 17 | operand: eq -------------------------------------------------------------------------------- /codebundles/k8s-certmanager-healthcheck/.runwhen/templates/k8s-certmanager-certificate-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/cert-manager.svg 11 | alias: {{namespace.name}} SSL Certificate Health 12 | asMeasuredBy: Certificates in an unready state 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: All certificates should be in a Ready state 99.5%. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only 26 | -------------------------------------------------------------------------------- /codebundles/k8s-certmanager-healthcheck/.runwhen/templates/k8s-certmanager-certificate-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "SSL Certificate Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{match_resource.resource.metadata.namespace}} SSL Certificate Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for 
{{namespace.name}} SSL Certificate Health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{match_resource.resource.metadata.namespace}}-{{ "SSL Certificate Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/k8s-chaos-flux/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-namespace/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Namespace Chaos Engineering 2 | 3 | This codebundle provides chaos injection for kubernetes namespaces 4 | 5 | ## Tasks 6 | 7 | `Test Namespace Highly Available` 8 | `Test Node Drain` 9 | `Mangle Service Selector` 10 | `Mangle Service Port` 11 | `Fill Pod Tmp` 12 | 13 | ## Configuration 14 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 15 | 16 | - `KUBECONFIG`: The kubeconfig secret containing access info for the cluster. 17 | - `CONTEXT`: The Kubernetes context to operate within. 18 | - `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces. 19 | 20 | 21 | ## Requirements 22 | - A kubeconfig with appropriate RBAC permissions to perform the desired command. 23 | 24 | ## TODO 25 | - [ ] Add additional documentation. 
26 | 27 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-namespace/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required kubectl environment variables set 4 | if [[ -z $CONTEXT || -z $KUBECONFIG ]]; then 5 | echo "Missing required environment variables for kubectl: CONTEXT, KUBECONFIG" 6 | exit 1 7 | fi 8 | if [[ -f $KUBECONFIG ]]; then 9 | cat "$KUBECONFIG" > /tmp/kubeconfig 10 | else 11 | echo "$KUBECONFIG" > /tmp/kubeconfig 12 | fi 13 | export KUBECONFIG="/tmp/kubeconfig" 14 | kubectl config set-context "$CONTEXT" > /dev/null -------------------------------------------------------------------------------- /codebundles/k8s-chaos-namespace/delete_random_pods.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Environment Variables 4 | # NAMESPACE 5 | # CONTEXT 6 | 7 | MAX_DELETIONS=10 8 | POD_NAMES=$(kubectl get --context $CONTEXT pods -oname -n $NAMESPACE) 9 | echo "Starting random pod deletions in namespace $NAMESPACE" 10 | deleted_count=0 11 | for pod_name in $POD_NAMES; do 12 | # Roll a 50/50 chance 13 | if (( RANDOM % 2 == 0 )); then 14 | # Delete the pod 15 | kubectl delete --context $CONTEXT $pod_name -n $NAMESPACE 16 | echo "Waiting between deletions..." 17 | sleep 3 18 | # Increment the deleted count 19 | ((deleted_count++)) 20 | fi 21 | # Check if we have deleted 10 pods 22 | if (( deleted_count >= MAX_DELETIONS )); then 23 | break 24 | fi 25 | done 26 | 27 | echo "Random deletions complete. 
Current Pod States:" 28 | kubectl get --context $CONTEXT pods -n $NAMESPACE 29 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-namespace/drain_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the service account has the necessary permissions 4 | if ! kubectl auth can-i create nodes; then 5 | echo "Insufficient permissions to make node changes." 6 | exit 1 7 | fi 8 | # Keep only nodes whose STATUS column is exactly "Ready" (a plain grep also matches NotReady and cordoned nodes) 9 | READYNODES=$(kubectl get nodes --no-headers | awk '$2 == "Ready" {print $1}') 10 | 11 | NODES=($READYNODES) 12 | # Bail out before the modulo below, which would divide by zero on an empty list 13 | if [ "${#NODES[@]}" -eq 0 ]; then 14 | echo "No suitable nodes found for draining." 15 | exit 1 16 | fi 17 | 18 | # Get random node 19 | RANDOM_INDEX=$((RANDOM % ${#NODES[@]})) 20 | RANDOM_NODE=${NODES[$RANDOM_INDEX]} 21 | 22 | # Drain the node 23 | kubectl drain "$RANDOM_NODE" --ignore-daemonsets 24 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-namespace/expand_tmp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Environment Variables: 4 | # NAMESPACE 5 | # CONTEXT 6 | 7 | # Find a random pod in the given namespace 8 | pod=$(kubectl get --context "$CONTEXT" pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | shuf -n 1) 9 | [ -n "$pod" ] || { echo "No pods found in namespace $NAMESPACE"; exit 1; } 10 | echo "Expanding /tmp of pod $pod in namespace $NAMESPACE" 11 | 12 | # Exec into the pod and create a file at /tmp/chaos 13 | kubectl exec --context "$CONTEXT" -n "$NAMESPACE" "$pod" -- touch /tmp/chaos 14 | 15 | # Write 1 GiB of zeros (1024 blocks of 1M) into /tmp/chaos to consume space in the container 16 | kubectl exec --context 
"$CONTEXT" -n "$NAMESPACE" "$pod" -- sh -c "dd if=/dev/zero of=/tmp/chaos bs=1M count=1024" 17 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-namespace/meta.yaml: 
-------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-workload/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Workload Chaos Engineering 2 | 3 | This codebundle provides chaos injection for a specific workload within a Kubernetes namespace. 4 | 5 | ## Configuration 6 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 7 | 8 | - `KUBECONFIG`: The kubeconfig secret containing access info for the cluster. 9 | - `CONTEXT`: The Kubernetes context to operate within. 10 | - `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces. 11 | - `WORKLOAD_NAME`: The specific workload to inject chaos experiments into. Eg: deployment/my-app 12 | 13 | 14 | ## Requirements 15 | - A kubeconfig with appropriate RBAC permissions to perform the desired command. 16 | 17 | ## TODO 18 | - [ ] Add additional documentation. 
19 | 20 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-workload/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required kubectl environment variables set 4 | if [[ -z $CONTEXT || -z $KUBECONFIG ]]; then 5 | echo "Missing required environment variables for kubectl: CONTEXT, KUBECONFIG" 6 | exit 1 7 | fi 8 | if [[ -f $KUBECONFIG ]]; then 9 | cat "$KUBECONFIG" > /tmp/kubeconfig 10 | else 11 | echo "$KUBECONFIG" > /tmp/kubeconfig 12 | fi 13 | export KUBECONFIG="/tmp/kubeconfig" 14 | kubectl config set-context "$CONTEXT" > /dev/null -------------------------------------------------------------------------------- /codebundles/k8s-chaos-workload/expand_tmp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Environment Variables: 4 | # NAMESPACE 5 | # CONTEXT 6 | # WORKLOAD_NAME 7 | 8 | selectors=$(kubectl get --context "$CONTEXT" -n "$NAMESPACE" "$WORKLOAD_NAME" -o jsonpath='{ .spec.selector.matchLabels }') 9 | selectors=$(echo "$selectors" | jq -r 'to_entries | map("\(.key)=\(.value)") | join(",")') 10 | echo "Fetching pods with label selector: $selectors" 11 | pods=$(kubectl get --context "$CONTEXT" pods -n "$NAMESPACE" -l "$selectors" -o jsonpath='{.items[*].metadata.name}') 12 | 13 | # Pick one random pod from those matched by the workload's label selector 14 | pod=$(echo "$pods" | tr ' ' '\n' | shuf -n 1) 15 | [ -n "$pod" ] || { echo "No pods found for $WORKLOAD_NAME in namespace $NAMESPACE"; exit 1; } 16 | echo "Expanding /tmp of pod $pod in namespace $NAMESPACE" 17 | 18 | # Exec into the pod and create a file at /tmp/chaos 19 | kubectl exec --context "$CONTEXT" -n "$NAMESPACE" "$pod" -- touch /tmp/chaos 20 | 21 | # Write 1 GiB of zeros (1024 blocks of 1M) into /tmp/chaos to consume space in the container 22 | kubectl exec --context "$CONTEXT" -n "$NAMESPACE" "$pod" -- sh -c "dd 
if=/dev/zero of=/tmp/chaos bs=1M count=1024" 23 | -------------------------------------------------------------------------------- 
/codebundles/k8s-chaos-workload/kill_workload_pod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Environment Variables 4 | # NAMESPACE 5 | # CONTEXT 6 | # WORKLOAD_NAME 7 | 8 | selectors=$(kubectl get --context "$CONTEXT" -n "$NAMESPACE" "$WORKLOAD_NAME" -o jsonpath='{ .spec.selector.matchLabels }') 9 | selectors=$(echo "$selectors" | jq -r 'to_entries | map("\(.key)=\(.value)") | join(",")') 10 | echo "Fetching pods with label selector: $selectors" 11 | pods=$(kubectl get --context "$CONTEXT" pods -n "$NAMESPACE" -l "$selectors" -o jsonpath='{.items[*].metadata.name}') 12 | 13 | MAX_DELETIONS=1 14 | echo "Killing a pod owned by $WORKLOAD_NAME in namespace $NAMESPACE" 15 | deleted_count=0 16 | for pod_name in $pods; do 17 | # Delete the pod 18 | kubectl delete --context "$CONTEXT" pod "$pod_name" -n "$NAMESPACE" 19 | # Increment the deleted count 20 | ((deleted_count++)) 21 | # Stop once MAX_DELETIONS pods have been deleted 22 | if (( deleted_count >= MAX_DELETIONS )); then 23 | break 24 | fi 25 | done 26 | 27 | echo "Deletions complete. 
Current Pod States:" 28 | kubectl get --context $CONTEXT pods -n $NAMESPACE 29 | -------------------------------------------------------------------------------- /codebundles/k8s-chaos-workload/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-cluster-node-health/.runwhen/generation-rules/k8s-cluster-node-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: kubernetes 5 | generationRules: 6 | - resourceTypes: 7 | - cluster 8 | matchRules: 9 | - type: and 10 | matches: 11 | - type: pattern 12 | pattern: ".+" 13 | properties: [name] 14 | mode: substring 15 | slxs: 16 | - baseName: cluster-node-health 17 | qualifiers: ["cluster"] 18 | baseTemplateName: k8s-cluster-node-health 19 | levelOfDetail: basic 20 | outputItems: 21 | - type: slx 22 | - type: sli 23 | - type: runbook 24 | templateName: k8s-cluster-node-health-taskset.yaml 25 | - type: workflow 26 | -------------------------------------------------------------------------------- /codebundles/k8s-cluster-node-health/.runwhen/templates/k8s-cluster-node-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/infrastructure_components/labeled/node.svg 11 | alias: {{ cluster.name }} Cluster Node Health 12 | asMeasuredBy: Node restarts, ready status, and other error or pressure conditions. 
13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{cluster.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Cluster nodes for {{cluster.context}} should be ready and available 100% of the time. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-cluster-node-health/.runwhen/templates/k8s-cluster-node-health-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "Node Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{ cluster.name }} Node Health SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{ cluster.name }} cluster node health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{cluster.name}}-{{ "Node Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/k8s-cluster-node-health/README.md: -------------------------------------------------------------------------------- 1 | # K8s Cluster Node Health 2 | 3 | ## SLI 4 | The Service Level Indicator will generate a score for the health of the nodes in the cluster. 
This is an aggregate score from the tasks, which currently include: 5 | - Check for Node Restarts in Cluster 6 | 7 | ## TaskSet 8 | ### Check for Node Restarts in Cluster 9 | Create a report of all nodes start/stop/preempts/removals in the cluster. This will generate an information issue since node starts/stops may be routine, but users may want to be aware that they are happening if their pods are temporarily affected. 10 | 11 | ## Requirements 12 | - Service account with permissions to: 13 | - get nodes 14 | - list nodes 15 | -------------------------------------------------------------------------------- /codebundles/k8s-cluster-resource-health/.runwhen/generation-rules/k8s-cluster-resource-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | platform: kubernetes 5 | generationRules: 6 | - resourceTypes: 7 | - cluster 8 | matchRules: 9 | - type: and 10 | matches: 11 | - type: pattern 12 | pattern: ".+" 13 | properties: [name] 14 | mode: substring 15 | slxs: 16 | - baseName: cluster-resource 17 | qualifiers: ["cluster"] 18 | baseTemplateName: k8s-cluster-resource-health 19 | levelOfDetail: basic 20 | outputItems: 21 | - type: slx 22 | - type: runbook 23 | templateName: k8s-cluster-resource-health-taskset.yaml 24 | -------------------------------------------------------------------------------- /codebundles/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes-icon-color.svg 11 | alias: {{ cluster.name }} Cluster Resource Health 12 | 
asMeasuredBy: Node cpu and memory utilization. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{cluster.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Cluster resources for {{cluster.context}} should be less than 90% utilization. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-cluster-resource-health/README.md: -------------------------------------------------------------------------------- 1 | # K8s Cluster Resource Health 2 | 3 | ## SLI 4 | The Service Level Indicator will count the number of nodes that are over 90% active utilization according to `kubectl top nodes`. 5 | 6 | ## TaskSet 7 | ### Identify High Utilization Nodes for Cluster 8 | Create a report of all nodes that are above 90% utilization. Raise issues for each node that is in this state. 9 | 10 | ### Identify Pods Causing High Node Utilization in Cluster 11 | This task identifies overutilized nodes and creates a report of each pod that is using more than its defined request. Since requests are what a cluster autoscaler uses to make decisions, this list should be used to increase the pod requests so that autoscalers can make better scaling decisions. 
12 | 13 | Raises an issue for each namespace 14 | 15 | 16 | ## Requirements 17 | - Service account with permissions to: 18 | - get nodes 19 | - list nodes 20 | - get/list nodes in api group "metrics.k8s.io" -------------------------------------------------------------------------------- /codebundles/k8s-daemonset-healthcheck/.runwhen/generation-rules/k8s-daemonset-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - daemonset 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: ds-health 14 | levelOfDetail: detailed 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-daemonset-health 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-daemonset-health-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-daemonset-healthcheck/.runwhen/templates/k8s-daemonset-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/ds.svg 11 | alias: {{match_resource.resource.metadata.name}} DaemonSet Health 12 | asMeasuredBy: The Running state of desired pods. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: DaemonSet {{match_resource.resource.metadata.name}} should be in a healthy state. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-deployment-healthcheck/.runwhen/generation-rules/k8s-deployment-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - deployment 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: depl-health 14 | levelOfDetail: detailed 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-deployment-health 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-deployment-health-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-deployment-ops/.runwhen/generation-rules/k8s-deployment-ops.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - deployment 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: depl-ops 14 | levelOfDetail: detailed 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-deployment-ops 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-deployment-ops-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-deployment-ops/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | 
-------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-helm-health/.runwhen/generation-rules/k8s-flux-helm-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - helmreleases.helm.toolkit.fluxcd.io 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | slxs: 15 | - baseName: flux-helm 16 | qualifiers: ["namespace", "cluster"] 17 | baseTemplateName: k8s-flux-helm-health 18 | levelOfDetail: detailed 19 | outputItems: 20 | - type: slx 21 | - type: runbook 22 | templateName: k8s-flux-helm-health-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-helm-health/.runwhen/templates/k8s-flux-helm-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/helm-icon-color.svg 11 | alias: {{namespace.name}} Helm Release Health 12 | asMeasuredBy: The reconciliation status of the helm release object. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Helm releases for {{namespace.name}} should be reconciled in a good state. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | 25 | - name: access 26 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-kustomization-health/.runwhen/generation-rules/k8s-flux-kustomization-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - kustomizations.kustomize.toolkit.fluxcd.io 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | slxs: 15 | - baseName: flux-kstmz 16 | qualifiers: ["namespace", "cluster"] 17 | baseTemplateName: k8s-flux-kustomize-health 18 | levelOfDetail: detailed 19 | outputItems: 20 | - type: slx 21 | - type: sli 22 | - type: runbook 23 | templateName: k8s-flux-kustomize-health-taskset.yaml 24 | -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-kustomization-health/.runwhen/templates/k8s-flux-kustomize-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/flux-icon-color.svg 11 | alias: {{namespace.name}} GitOps Flux Kustomization Health 12 | asMeasuredBy: The sync/ready status of the Flux Kustomization objects in namespace {{namespace.name}}. 
13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Flux Kustomizations for {{namespace.name}} should be synced and ready. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | 25 | - name: access 26 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-reconcile/.runwhen/generation-rules/k8s-fluxcd-reconcile.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - namespace 7 | matchRules: 8 | - type: pattern 9 | pattern: "flux-system" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: fluxcd-reconciliation 14 | levelOfDetail: basic 15 | qualifiers: ["namespace", "cluster"] 16 | baseTemplateName: k8s-fluxcd-reconcile 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | - type: slo 21 | - type: runbook 22 | templateName: k8s-fluxcd-reconcile-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-reconcile/.runwhen/templates/k8s-fluxcd-reconcile-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | threshold: 1 17 | operand: 
eq -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-reconcile/.runwhen/templates/k8s-fluxcd-reconcile-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/helm-icon-color.svg 11 | alias: FluxCD Reconciliation Health For {{cluster.name}} 12 | asMeasuredBy: The reconciliation loops for all of fluxcd in {{cluster.name}} 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: All FluxCD controllers should have no errors in their reconciliation loops 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-reconcile/.runwhen/templates/k8s-fluxcd-reconcile-taskset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Runbook 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | location: {{default_location}} 11 | codeBundle: 12 | {% if repo_url %} 13 | repoUrl: {{repo_url}} 14 | {% else %} 15 | repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git 16 | {% endif %} 17 | {% if ref %} 18 | ref: {{ref}} 19 | {% else %} 20 | ref: main 21 | {% endif %} 22 | pathToRobot: 
codebundles/k8s-fluxcd-reconcile/runbook.robot 23 | configProvided: 24 | - name: DISTRIBUTION 25 | value: {{custom.kubernetes_distribution}} 26 | - name: CONTEXT 27 | value: {{context}} 28 | secretsProvided: 29 | {% if wb_version %} 30 | {% include "kubernetes-auth.yaml" ignore missing %} 31 | {% else %} 32 | - name: kubeconfig 33 | workspaceKey: {{custom.kubeconfig_secret_name}} 34 | {% endif %} -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-reconcile/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes FluxCD Reconciliation Errors 2 | This codebundle measures the number of reconciliation errors in the fluxcd controllers and can generate a report of them. 3 | 4 | ## TaskSet 5 | This taskset generates a report containing a summary of logs for each controller and their error counts, ending with a total error count. 6 | 7 | Example configuration: 8 | ``` 9 | CONTEXT=sandbox-cluster-1 10 | ``` 11 | 12 | ## SLI 13 | The SLI can be used to monitor the overall health of the reconciliation loops for FluxCD and alert developers when a bad manifest has been provided. 14 | 15 | ## Requirements 16 | - A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query. 
17 | 18 | ## TODO 19 | - Add additional rbac and kubectl resources and use cases -------------------------------------------------------------------------------- /codebundles/k8s-fluxcd-reconcile/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-gitops-gh-remediate/.runwhen/templates/k8s-gitops-gh-remediate-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/github-mark.svg 11 | alias: {{match_resource.resource.metadata.name}} GitOps Configuration Remediations 12 | asMeasuredBy: "" 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Remediate resources in Namespace {{match_resource.resource.metadata.name}} managed in GitHub repositories. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-write -------------------------------------------------------------------------------- /codebundles/k8s-gitops-gh-remediate/.test/README.md: -------------------------------------------------------------------------------- 1 | export RW_FROM_FILE='{"kubeconfig":"/home/runwhen/codecollection/auth/kubeconfig"}' 2 | export github_token="" 3 | export NAMESPACE="online-boutique" 4 | export CONTEXT='sandbox-cluster-1' 5 | export KUBERNETES_DISTRIBUTION_BINARY="kubectl" -------------------------------------------------------------------------------- /codebundles/k8s-gitops-gh-remediate/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-image-check/.runwhen/generation-rules/k8s-image-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - namespace 7 | matchRules: 8 | - type: pattern 9 | pattern: "." 
10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: image-check 14 | levelOfDetail: detailed 15 | qualifiers: ["namespace", "cluster"] 16 | baseTemplateName: k8s-image-check 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-image-check-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-image-check/.runwhen/templates/k8s-image-check-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/container-registry.svg 11 | alias: {{namespace.name}} Image Check 12 | asMeasuredBy: Images & their tags running in the namespace for all containers in pods. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: List details about images running in the namespace. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-ingress-healthcheck/.runwhen/generation-rules/k8s-ingress-health .yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - ingress 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | slxs: 15 | - baseName: ingress-health 16 | qualifiers: ["namespace", "cluster"] 17 | baseTemplateName: k8s-ingress-healthcheck 18 | levelOfDetail: basic 19 | outputItems: 20 | - type: slx 21 | - type: runbook 22 | templateName: k8s-ingress-healthcheck-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/k8s-ingress-healthcheck/.runwhen/templates/k8s-ingress-healthcheck-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/ing.svg 11 | alias: {{namespace.name}} Ingress Health 12 | asMeasuredBy: Ingress objects with valid services and endpoints. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: All ingress objects should have services and endpoints backing them. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-ingress-healthcheck/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Ingress Healthcheck 2 | The `k8s-ingress-healthcheck` codebundle checks the health of ingress objects within a Namespace. 3 | 4 | ## Tasks 5 | `Fetch Ingress Object Health in Namespace` - This command will list every ingress object and determine whether it has a service and an endpoint. If so, it is considered healthy. It will print out the health result along with the error or the details regarding the service name and pod endpoint names and IPs. 6 | 7 | Example configuration: 8 | ``` 9 | KUBERNETES_DISTRIBUTION_BINARY=kubectl 10 | CONTEXT=sandbox-cluster-1 11 | NAMESPACE=my-namespace 12 | ``` 13 | 14 | ## Requirements 15 | - A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query. 
16 | 17 | 18 | ## TODO 19 | - Add additional rbac and kubectl resources and use cases 20 | - Add additional troubleshooting tasks as use cases evolve -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.runwhen/generation-rules/k8s-istio-system-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - namespace 7 | matchRules: 8 | - type: pattern 9 | pattern: "istio-system" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: k8s-istio-system-health 14 | levelOfDetail: detailed 15 | qualifiers: ["cluster"] 16 | baseTemplateName: k8s-istio-system-health 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | - type: runbook 21 | templateName: k8s-istio-system-health-taskset.yaml 22 | -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.runwhen/templates/k8s-istio-system-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/Istio.svg 11 | alias: {{ cluster.name }} Istio System Health 12 | asMeasuredBy: "" 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Measures health of Istio system by checking istio proxy sidecar injection status, high memory and cpu usage, warnings and errors in logs, valid certificates, configuration and verify istio installation. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/README.md: -------------------------------------------------------------------------------- 1 | export AWS_ACCESS_KEY_ID="" 2 | export AWS_DEFAULT_LOCATION="" 3 | export AWS_SECRET_ACCESS_KEY="" 4 | export CONTEXT="istio-cluster" 5 | export RW_API_URL="papi.beta.runwhen.com" 6 | export RW_WORKSPACE="" 7 | export RW_PAT="" -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/terraform/bookinfo/fault-injection-details-v1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: VirtualService 3 | metadata: 4 | name: details 5 | spec: 6 | hosts: 7 | - details 8 | http: 9 | - fault: 10 | abort: 11 | httpStatus: 500 12 | percentage: 13 | value: 100 14 | route: 15 | - destination: 16 | host: details 17 | subset: v1 18 | - route: 19 | - destination: 20 | host: details 21 | subset: v1 22 | --- 23 | apiVersion: networking.istio.io/v1alpha3 24 | kind: DestinationRule 25 | metadata: 26 | name: details 27 | spec: 28 | host: details 29 | subsets: 30 | - name: v1 31 | labels: 32 | version: v1 -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/terraform/faulty-gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: Gateway 3 | metadata: 4 | name: faulty-gateway 5 | namespace: istio-system 6 | spec: 7 | selector: 8 | istio: ingressgateway 9 | servers: 10 | - port: 11 | number: 80 12 | name: 
http 13 | protocol: HTTP 14 | hosts: 15 | - "invalid-host.local" # Error: No VirtualService matches this host 16 | -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/terraform/kubeconfig-sa-token.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: kubeconfig-sa-token 5 | namespace: kube-system 6 | annotations: 7 | kubernetes.io/service-account.name: kubeconfig-sa 8 | type: kubernetes.io/service-account-token 9 | 10 | -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "configure_kubectl" { 2 | description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" 3 | value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" 4 | } 5 | -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "vpc_cidr" { 2 | description = "The cidr for aws vpc" 3 | type = string 4 | default = "10.0.0.0/16" 5 | } 6 | 7 | variable "istio_version" { 8 | description = "Istio version" 9 | type = string 10 | default = "1.20.2" 11 | } 12 | 13 | variable "cluster_name" { 14 | description = "The name of the EKS cluster" 15 | type = string 16 | default = "istio-cluster" 17 | } -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/.test/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.3" 3 | 4 
| required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 5.34" 8 | } 9 | helm = { 10 | source = "hashicorp/helm" 11 | version = ">= 2.9" 12 | } 13 | kubernetes = { 14 | source = "hashicorp/kubernetes" 15 | version = ">= 2.20" 16 | } 17 | kubectl = { 18 | source = "gavinbunney/kubectl" 19 | version = ">= 1.14.0" 20 | } 21 | } 22 | 23 | # ## Used for end-to-end testing on project; update to suit your needs 24 | # backend "s3" { 25 | # bucket = "terraform-ssp-github-actions-state" 26 | # region = "us-west-2" 27 | # key = "e2e/istio/terraform.tfstate" 28 | # } 29 | } 30 | -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/controlplane_error_patterns.json: -------------------------------------------------------------------------------- 1 | { 2 | "warnings": [ 3 | "upstream connect error or disconnect/reset before headers. reset reason: connection failure", 4 | "error:1408F10B:SSL routines:ssl3_get_record:wrong version number", 5 | "istio-proxy fails to start", 6 | "502 Bad Gateway", 7 | "istio-ingressgateway is running but has no listener on port" 8 | ], 9 | "errors": [ 10 | "panic: runtime error", 11 | "Failed to create listener", 12 | "proxy exited with status", 13 | "Pilot push failed", 14 | "failed to reconcile state", 15 | "Error adding/updating listener" 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /codebundles/k8s-istio-system-health/proxy_error_patterns.json: -------------------------------------------------------------------------------- 1 | { 2 | "warnings": [ 3 | "JWT validation failed", 4 | "upstream connect error or disconnect/reset before headers", 5 | "TLS handshake error", 6 | "No healthy upstream", 7 | "Error adding/updating cluster", 8 | "Downstream connection terminated", 9 | "Stream removed with error", 10 | "connection terminated with error", 11 | "503 UH no_healthy_upstream", 12 | "503 UC upstream 
connect error", 13 | "504 DC downstream connection termination", 14 | "FI fault_filter_abort", 15 | "DNS resolution failed" 16 | ], 17 | "errors": [ 18 | "Envoy proxy is NOT ready", 19 | "Unable to establish connection", 20 | "upstream connect error", 21 | "bad certificate", 22 | "remote error: tls", 23 | "no route configured", 24 | "Listener filter chain match failed", 25 | "Failed to bind listener" 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /codebundles/k8s-jaeger-http-query/.runwhen/generation-rules/k8s-jaeger-http-query.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - service 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: "query" 12 | properties: [label-values] 13 | mode: substring 14 | - type: pattern 15 | pattern: "jaeger" 16 | properties: [label-values] 17 | mode: substring 18 | slxs: 19 | - baseName: jaeger-http 20 | levelOfDetail: detailed 21 | qualifiers: ["resource", "namespace", "cluster"] 22 | baseTemplateName: k8s-jaeger-http-query 23 | outputItems: 24 | - type: slx 25 | - type: runbook 26 | templateName: k8s-jaeger-http-query-taskset.yaml 27 | -------------------------------------------------------------------------------- /codebundles/k8s-jaeger-http-query/.runwhen/templates/k8s-jaeger-http-query-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/jaeger_tracing.svg 11 | alias: Jaeger HTTP Query for Namespace {{match_resource.resource.metadata.namespace}} 12 | asMeasuredBy: None 13 
| configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Jaeger instance {{match_resource.resource.metadata.name}} should report on HTTP traces related to ingested services. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-jenkins-healthcheck/.runwhen/generation-rules/k8s-jenkins-healthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - statefulset 7 | matchRules: 8 | - type: pattern 9 | pattern: "jenkins" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: jenkins-health 14 | levelOfDetail: detailed 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-jenkins-health 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-jenkins-health-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-loki-healthcheck/.runwhen/generation-rules/k8s-loki-healthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - statefulset 7 | matchRules: 8 | - type: pattern 9 | pattern: "loki" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: loki-hlthck 14 | qualifiers: ["resource", "namespace", "cluster"] 15 | baseTemplateName: k8s-loki-healthcheck 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 
| templateName: k8s-loki-healthcheck-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-loki-healthcheck/.runwhen/templates/k8s-loki-healthcheck-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/grafana-loki.svg 11 | alias: Loki Stack Health 12 | asMeasuredBy: The Loki stack is up, and healthy. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Loki's stack should be up, healthy with a up-to-date hash ring in the {{namespace.name}} namespace. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-loki-healthcheck/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Loki Healthcheck 2 | 3 | A set of tasks to query the state and health of a Loki deployment in Kubernetes. 4 | 5 | ## Tasks 6 | `Check Loki Ring API` 7 | `Check Loki API Ready` 8 | 9 | ## Configuration 10 | 11 | The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: 12 | 13 | - `kubeconfig`: The kubeconfig secret containing access info for the cluster. 14 | - `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`. 
15 | - `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`. 16 | - `CONTEXT`: The Kubernetes context to operate within. 17 | - `NAMESPACE`: The name of the namespace to search. 18 | 19 | ## Notes 20 | 21 | Please note that these checks require Kubernetes RBAC exec permissions for the service account used. 22 | 23 | ## TODO 24 | - [ ] Add documentation 25 | - [ ] Add more complex hash ring checks 26 | - [ ] Refine raised issues -------------------------------------------------------------------------------- /codebundles/k8s-namespace-healthcheck/.runwhen/generation-rules/k8s-namespace-healthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - namespace 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: ns-health 14 | levelOfDetail: basic 15 | qualifiers: ["namespace", "cluster"] 16 | baseTemplateName: k8s-namespace-healthcheck 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | # - type: slo 21 | - type: runbook 22 | templateName: k8s-namespace-healthcheck-taskset.yaml 23 | - type: workflow 24 | -------------------------------------------------------------------------------- /codebundles/k8s-namespace-healthcheck/.runwhen/templates/k8s-namespace-healthcheck-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | codeBundle: 11 | repoUrl: https://github.com/runwhen-contrib/rw-public-codecollection.git 12 | pathToYaml: codebundles/slo-default/queries.yaml 13 | ref: main 14 | sloSpecType: simple-mwmb 15 | objective: 99 16 | 
threshold: 1 17 | operand: eq -------------------------------------------------------------------------------- /codebundles/k8s-namespace-healthcheck/.runwhen/templates/k8s-namespace-healthcheck-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/ns.svg 11 | alias: {{namespace.name}} Namespace Health 12 | asMeasuredBy: Aggregate score based on Kubernetes API Server queries 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Overall health for {{namespace.name}} should be 1, 99% of the time. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-namespace-healthcheck/.runwhen/templates/k8s-namespace-healthcheck-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{ "Namespace Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{namespace.name}} Namespace SLI Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{namespace.name}} namespace health 13 | actions: 14 | - tasks: 15 | slx: 
{{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{namespace.name}}-{{ "Namespace Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/k8s-otelcollector/.runwhen/generation-rules/k8s-otelcollector.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - deployment 7 | - daemonset 8 | - statefulset 9 | matchRules: 10 | - type: and 11 | matches: 12 | - type: pattern 13 | pattern: "opentelemetry-collector" 14 | properties: [label-values] 15 | mode: substring 16 | - type: pattern 17 | pattern: "col" 18 | properties: [name] 19 | mode: substring 20 | slxs: 21 | - baseName: k8s-otelcollector 22 | levelOfDetail: detailed 23 | qualifiers: ["resource", "namespace", "cluster"] 24 | baseTemplateName: k8s-otelcollector 25 | outputItems: 26 | - type: slx 27 | - type: runbook 28 | templateName: k8s-otelcollector-taskset.yaml 29 | -------------------------------------------------------------------------------- /codebundles/k8s-otelcollector/.runwhen/templates/k8s-otelcollector-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/jaeger_tracing.svg 11 | alias: OTEL Collector Health for Namespace {{match_resource.resource.metadata.namespace}} 12 | asMeasuredBy: None 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 
16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: OTEL Collector {{match_resource.resource.metadata.name}} should not have large queues or error logs. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only 26 | -------------------------------------------------------------------------------- /codebundles/k8s-otelcollector/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /codebundles/k8s-otelcollector/otel_dropped_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Scans the last 60m of logs from service/$WORKLOAD_SERVICE for the word "dropped". 4 | # Prints each match with 20 lines of trailing context and exits 1 when anything matches; exits 0 otherwise. 5 | # NOTE: only the captured output is inspected, so a failing kubectl call yields empty output and exit 0. 6 | # ENV: 7 | # CONTEXT 8 | # NAMESPACE 9 | # METRICS_PORT 10 | # WORKLOAD_NAME 11 | # WORKLOAD_SERVICE 12 | since=60m 13 | # Expansions are quoted so values containing spaces or glob characters reach kubectl as single arguments. 14 | output=$(kubectl --context "$CONTEXT" -n "$NAMESPACE" logs "service/$WORKLOAD_SERVICE" --since="$since" --all-containers=true | grep dropped -A 20) 15 | if [ -n "$output" ]; then 16 | echo -E "Dropped Spans Found:" 17 | echo -E "$output" 18 | exit 1 19 | fi 20 | exit 0 -------------------------------------------------------------------------------- /codebundles/k8s-otelcollector/otel_error_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Scans the last 60m of logs from service/$WORKLOAD_SERVICE for lines containing "error" (case-sensitive). 4 | # Prints the matching lines and exits 1 when anything matches; exits 0 otherwise. 5 | # NOTE: only the captured output is inspected, so a failing kubectl call yields empty output and exit 0. 6 | # ENV: 7 | # CONTEXT 8 | # NAMESPACE 9 | # METRICS_PORT 10 | # WORKLOAD_NAME 11 | # WORKLOAD_SERVICE 12 | since=60m 13 | # Expansions are quoted so values containing spaces or glob characters reach kubectl as single arguments. 14 | output=$(kubectl --context "$CONTEXT" -n "$NAMESPACE" logs "service/$WORKLOAD_SERVICE" --since="$since" --all-containers=true | grep error) 15 | if [ -n "$output" ]; then 16 | echo -E "Error(s) Found:" 17 | echo -E "$output" 18 | exit 1 19 | fi 20 | exit 0 -------------------------------------------------------------------------------- 
/codebundles/k8s-otelcollector/otel_metrics_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fetches /metrics from the collector (kubectl exec + curl) and checks every otelcol_exporter_queue_size sample. 4 | # Echoes each sample; exits 1 when any sample value exceeds THRESHOLD, otherwise exits 0. 5 | # ENV: 6 | # CONTEXT 7 | # NAMESPACE 8 | # METRICS_PORT 9 | # WORKLOAD_NAME 10 | # WORKLOAD_SERVICE 11 | 12 | THRESHOLD=500 13 | rv=0 14 | # Expansions are quoted so values containing spaces or glob characters are passed as single arguments. 15 | metrics=$(kubectl --context "$CONTEXT" -n "$NAMESPACE" exec "$WORKLOAD_NAME" -- curl "$WORKLOAD_SERVICE:$METRICS_PORT/metrics") 16 | queued_spans=$(echo -E "$metrics" | grep "otelcol_exporter_queue_size{") 17 | while IFS= read -r line; do 18 | # The here-string still yields one empty line when grep matched nothing; skip it instead of comparing "". 19 | [ -z "$line" ] && continue 20 | echo "$line" 21 | value=$(echo "$line" | awk '{print $2}') 22 | # Compare numerically in awk: sample values may be floats or exponent notation (e.g. 1e+06), which [ -gt ] rejects. 23 | if awk -v v="$value" -v t="$THRESHOLD" 'BEGIN { exit !(v + 0 > t + 0) }'; then 24 | echo "Error: queued spans ($value) exceeds threshold ($THRESHOLD)" 25 | rv=1 26 | fi 27 | done <<< "$queued_spans" 28 | exit $rv -------------------------------------------------------------------------------- /codebundles/k8s-podresources-health/.runwhen/generation-rules/k8s-pod-resources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - namespace 7 | matchRules: 8 | - type: pattern 9 | pattern: "." 
10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: pod-resources 14 | levelOfDetail: detailed 15 | qualifiers: ["namespace", "cluster"] 16 | baseTemplateName: k8s-pod-resources 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-pod-resources-taskset.yaml -------------------------------------------------------------------------------- /codebundles/k8s-podresources-health/.runwhen/templates/k8s-pod-resources-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/pod.svg 11 | alias: {{namespace.name}} Pod Resources 12 | asMeasuredBy: Kubectl get and Kubectl Top 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Pods should have resources configured, and resource usage should not be exceeded. 
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-postgres-healthcheck/.runwhen/generation-rules/k8s-postgres-healthcheck-crunchy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - postgresclusters.postgres-operator.crunchydata.com 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: postgres-health 14 | qualifiers: ["resource", "namespace", "cluster"] 15 | baseTemplateName: k8s-postgres-healthcheck-crunchy 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | - type: runbook 21 | templateName: k8s-postgres-healthcheck-crunchy-taskset.yaml 22 | -------------------------------------------------------------------------------- /codebundles/k8s-postgres-healthcheck/.runwhen/generation-rules/k8s-postgres-healthcheck-zalando.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - postgresqls.acid.zalan.do 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: postgres-health 14 | qualifiers: ["resource", "namespace", "cluster"] 15 | baseTemplateName: k8s-postgres-healthcheck-zalando 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | - type: runbook 21 | templateName: k8s-postgres-healthcheck-zalando-taskset.yaml 22 | 
-------------------------------------------------------------------------------- /codebundles/k8s-postgres-healthcheck/.runwhen/templates/k8s-postgres-healthcheck-crunchy-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/CrunchyDataPrimaryIcon.png 11 | alias: {{match_resource.resource.metadata.name}} Postgres Health 12 | asMeasuredBy: Database is up and accepting connections. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Database should be available and accept connections 99.5% of the time. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only 26 | -------------------------------------------------------------------------------- /codebundles/k8s-postgres-healthcheck/.runwhen/templates/k8s-postgres-healthcheck-zalando-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/postgresql.svg 11 | alias: {{match_resource.resource.metadata.name}} Postgres Health 12 | asMeasuredBy: Database is up and accepting connections. 
13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Database should be available and accept connections 99.5% of the time. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only 26 | -------------------------------------------------------------------------------- /codebundles/k8s-prometheus-healthcheck/.runwhen/generation-rules/k8s-prometheus-healthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - prometheuses.monitoring.coreos.com 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: kubeprom-hlthck 14 | qualifiers: ["resource", "namespace", "cluster"] 15 | baseTemplateName: k8s-prometheus-healthcheck 16 | levelOfDetail: detailed 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-prometheus-healthcheck-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-prometheus-healthcheck/.runwhen/templates/k8s-prometheus-healthcheck-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/prometheus_color.svg 11 | alias: Kubeprometheus Operator Health 12 | asMeasuredBy: The Kubeprometheus operator is healthy and its ServiceMonitors 
are functional. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: The Kubeprometheus operator should be healthy in the {{namespace.name}} namespace and its ServiceMonitors are functional. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-pvc-healthcheck/.runwhen/generation-rules/k8s-pvc-healthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - persistentvolumeclaim 7 | matchRules: 8 | - type: pattern 9 | pattern: "." 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: pvc-health 14 | levelOfDetail: detailed 15 | qualifiers: ["namespace", "cluster"] 16 | baseTemplateName: k8s-pvc-healthcheck 17 | outputItems: 18 | - type: slx 19 | - type: sli 20 | - type: runbook 21 | templateName: k8s-pvc-healthcheck-taskset.yaml 22 | - type: workflow -------------------------------------------------------------------------------- /codebundles/k8s-pvc-healthcheck/.runwhen/templates/k8s-pvc-healthcheck-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/pvc.svg 11 | alias: {{namespace.name}} PVC Healthcheck 12 | asMeasuredBy: Aggregate score 
based on unattached PVCs or PVCs with errors. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: PVCs should be bound and healthy. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-pvc-healthcheck/.runwhen/templates/k8s-pvc-healthcheck-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: Workflow 3 | metadata: 4 | name: {{slx_name}}-{{namespace.name}}-{{ "PVC Alert Workflow" | replace(" ", "-") | lower }} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | fromActivities: 11 | - displayName: {{namespace.name}} Namespace PVC Alert Workflow 12 | description: Start RunSession with Eager Edgar when SLI is alerting for {{namespace.name}} PVC health 13 | actions: 14 | - tasks: 15 | slx: {{slx_name.split('--')[1]}} 16 | persona: eager-edgar 17 | titles: 18 | - '*' 19 | sessionTTL: 20m 20 | match: 21 | activityVerbs: 22 | - SLI_ALERTS_STARTED 23 | slxs: 24 | - {{slx_name.split('--')[1]}} 25 | name: {{namespace.name}}-{{ "PVC Alert Workflow" | replace(" ", "-") | lower }} -------------------------------------------------------------------------------- /codebundles/k8s-redis-healthcheck/.runwhen/generation-rules/k8s-redis-healthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - deployment 7 | matchRules: 8 | - type: pattern 9 | pattern: "redis" 10 |
properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: redis-health 14 | levelOfDetail: detailed 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-redis-health 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-redis-health-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-redis-healthcheck/.runwhen/templates/k8s-redis-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/redis-logo.svg 11 | alias: {{match_resource.resource.metadata.name}} Health 12 | asMeasuredBy: The ability to ping, read and write keys to the Redis service. 13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Redis endpoints should be responsive and in a healthy state.
19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-serviceaccount-check/.runwhen/generation-rules/k8s-serviceaccount-check.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - namespace 7 | matchRules: 8 | - type: and 9 | matches: 10 | - type: pattern 11 | pattern: ".+" 12 | properties: [name] 13 | mode: substring 14 | slxs: 15 | - baseName: sa-check 16 | qualifiers: ["namespace", "cluster"] 17 | baseTemplateName: k8s-serviceaccount-check 18 | levelOfDetail: detailed 19 | outputItems: 20 | - type: slx 21 | - type: runbook 22 | templateName: k8s-serviceaccount-check-taskset.yaml 23 | -------------------------------------------------------------------------------- /codebundles/k8s-serviceaccount-check/.runwhen/templates/k8s-serviceaccount-check-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | labels: 5 | slx: {{slx_name}} 6 | workspace: {{workspace.short_name}} 7 | name: {{slx_name}} 8 | annotations: 9 | {% include "common-annotations.yaml" %} 10 | spec: 11 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/sa.svg 12 | alias: {{namespace.name}} Service Account Check 13 | asMeasuredBy: A temporary curl pod using a namespaced service account to interact with the API server. 
14 | configProvided: 15 | - name: OBJECT_NAME 16 | value: {{match_resource.resource.metadata.name}} 17 | owners: 18 | - {{workspace.owner_email}} 19 | statement: Pods should be able to contact the Kubernetes API server. 20 | additionalContext: 21 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 22 | qualified_name: "{{ match_resource.qualified_name }}" 23 | tags: 24 | {% include "kubernetes-tags.yaml" ignore missing %} 25 | - name: access 26 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-statefulset-healthcheck/.runwhen/generation-rules/k8s-statefulset-health.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: GenerationRules 3 | spec: 4 | generationRules: 5 | - resourceTypes: 6 | - statefulset 7 | matchRules: 8 | - type: pattern 9 | pattern: ".+" 10 | properties: [name] 11 | mode: substring 12 | slxs: 13 | - baseName: ss-health 14 | levelOfDetail: detailed 15 | qualifiers: ["resource", "namespace", "cluster"] 16 | baseTemplateName: k8s-statefulset-health 17 | outputItems: 18 | - type: slx 19 | - type: runbook 20 | templateName: k8s-statefulset-health-taskset.yaml 21 | -------------------------------------------------------------------------------- /codebundles/k8s-statefulset-healthcheck/.runwhen/templates/k8s-statefulset-health-slx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: runwhen.com/v1 2 | kind: ServiceLevelX 3 | metadata: 4 | name: {{slx_name}} 5 | labels: 6 | {% include "common-labels.yaml" %} 7 | annotations: 8 | {% include "common-annotations.yaml" %} 9 | spec: 10 | imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/sts.svg 11 | alias: {{match_resource.resource.metadata.name}} StatefulSet Health 12 | asMeasuredBy: The Running state of desired pods. 
13 | configProvided: 14 | - name: OBJECT_NAME 15 | value: {{match_resource.resource.metadata.name}} 16 | owners: 17 | - {{workspace.owner_email}} 18 | statement: Statefulset {{match_resource.resource.metadata.name}} should be in a healthy state. 19 | additionalContext: 20 | {% include "kubernetes-hierarchy.yaml" ignore missing %} 21 | qualified_name: "{{ match_resource.qualified_name }}" 22 | tags: 23 | {% include "kubernetes-tags.yaml" ignore missing %} 24 | - name: access 25 | value: read-only -------------------------------------------------------------------------------- /codebundles/k8s-tail-logs-dynamic/meta.yaml: -------------------------------------------------------------------------------- 1 | commands: [] 2 | -------------------------------------------------------------------------------- /libraries/.docs/Suggest.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.NextSteps.Suggest 4 | 5 | Utility library for suggesting next steps based on a static troubleshooting yaml database 6 | 7 | See https://github.com/seatgeek/thefuzz 8 | 9 | Scope: Global 10 | 11 | -------------------------------------------------------------------------------- /libraries/.docs/_test_parsers.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.K8sApplications.\_test\_parsers 4 | 5 | -------------------------------------------------------------------------------- /libraries/.docs/k8s_helper.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.K8sHelper.k8s\_helper 4 | 5 | 6 | 7 | #### get\_related\_resource\_recommendations 8 | 9 | ```python 10 | def get_related_resource_recommendations(k8s_object) 11 | ``` 12 | 13 | Parse a Kubernetes object JSON for specific annotations or labels and return recommendations. 14 | 15 | **Arguments**: 16 | 17 | - `k8s_object` _dict_ - The Kubernetes object JSON.
18 | 19 | 20 | **Returns**: 21 | 22 | - `str` - Recommendations based on the object's annotations or labels. 23 | 24 | 25 | 26 | #### sanitize\_messages 27 | 28 | ```python 29 | def sanitize_messages(input_string) 30 | ``` 31 | 32 | Sanitize the message string by replacing characters that can't be processed into JSON issue details. 33 | 34 | **Arguments**: 35 | 36 | - input_string: The string to be sanitized. 37 | 38 | 39 | **Returns**: 40 | 41 | - The sanitized string. 42 | 43 | -------------------------------------------------------------------------------- /libraries/.docs/local_process.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.CLI.local\_process 4 | 5 | TODO: should be incorporated into platform behaviour 6 | Acts as interoperable layer between ShellRequest/Response and local processes - hacky 7 | 8 | -------------------------------------------------------------------------------- /libraries/.docs/migrations_inspector.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.K8sApplications.migrations\_inspector 4 | 5 | -------------------------------------------------------------------------------- /libraries/.docs/parsers.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.K8sApplications.parsers 4 | 5 | 6 | 7 | ## StackTraceData Objects 8 | 9 | ```python 10 | @dataclass 11 | class StackTraceData() 12 | ``` 13 | 14 | 15 | 16 | #### line\_nums 17 | 18 | line numbers associated with exceptions per file 19 | 20 | 21 | 22 | ## BaseStackTraceParse Objects 23 | 24 | ```python 25 | class BaseStackTraceParse() 26 | ``` 27 | 28 | Base class for stacktrace parsing functions. 29 | Should be stateless so it can be used as a utility class. 30 | 31 | Note that the default behavior assumes python stack traces, and inheritors can override for other languages.
32 | 33 | -------------------------------------------------------------------------------- /libraries/.docs/postgres_helper.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.CLI.postgres\_helper 4 | 5 | -------------------------------------------------------------------------------- /libraries/.docs/repository.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # libraries.RW.K8sApplications.repository 4 | 5 | -------------------------------------------------------------------------------- /libraries/Jenkins/__init__.py: -------------------------------------------------------------------------------- 1 | from .jenkins import * 2 | -------------------------------------------------------------------------------- /libraries/RW/CLI/__init__.py: -------------------------------------------------------------------------------- 1 | from .CLI import * 2 | from .postgres_helper import k8s_postgres_query, get_password, get_user 3 | -------------------------------------------------------------------------------- /libraries/RW/K8sApplications/__init__.py: -------------------------------------------------------------------------------- 1 | from .k8s_applications import * 2 | -------------------------------------------------------------------------------- /libraries/RW/K8sApplications/migrations_inspector.py: -------------------------------------------------------------------------------- 1 | # determines migration library/tool and then fetches migration table info 2 | -------------------------------------------------------------------------------- /libraries/RW/K8sApplications/no_stacktraces_report.jinja2: -------------------------------------------------------------------------------- 1 | # Stack Trace Report 2 | 3 | Report Created At: {{ data.timestamp }} 4 | 5 | ## Summary 6 | **Total Stack Traces:** {{ data.stacktraces|length }} 7 | 8 | 9 | **No stacktraces were 
found!** -------------------------------------------------------------------------------- /libraries/RW/K8sApplications/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pytest --log-cli-level=DEBUG _test_parsers.py # run the _test_parsers.py suite, printing log records at DEBUG level and above to the console -------------------------------------------------------------------------------- /libraries/RW/K8sApplications/test_data/golang.log: -------------------------------------------------------------------------------- 1 | rpc error: code = Unavailable desc = connection error: desc = "transport: Error while dialing dial tcp 10.107.253.212:7070: connect: connection refused" 2 | could not retrieve cart 3 | main.(*frontendServer).homeHandler 4 | /src/handlers.go:69 5 | net/http.HandlerFunc.ServeHTTP 6 | /usr/local/go/src/net/http/server.go:2109 7 | github.com/gorilla/mux.(*Router).ServeHTTP 8 | /go/pkg/mod/github.com/gorilla/mux@v1.8.0/mux.go:210 9 | main.(*logHandler).ServeHTTP 10 | /src/middleware.go:82 11 | main.ensureSessionID.func1 12 | /src/middleware.go:109 13 | net/http.HandlerFunc.ServeHTTP 14 | /usr/local/go/src/net/http/server.go:2109 15 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp.(*Handler).ServeHTTP 16 | /go/pkg/mod/go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp@v0.37.0/handler.go:210 17 | net/http.serverHandler.ServeHTTP 18 | /usr/local/go/src/net/http/server.go:2947 19 | net/http.(*conn).serve 20 | /usr/local/go/src/net/http/server.go:1991 21 | runtime.goexit 22 | /usr/local/go/src/runtime/asm_amd64.s:1594 -------------------------------------------------------------------------------- /libraries/RW/K8sApplications/test_data/java.log: -------------------------------------------------------------------------------- 1 | Exception in thread "main" java.lang.IndexOutOfBoundsException: Index 5 out of bounds for length 3 2 | at java.util.ArrayList.rangeCheck(ArrayList.java:659) 3 | at java.util.ArrayList.get(ArrayList.java:435) 4 | at MainKt.main(Main.kt:6)
-------------------------------------------------------------------------------- /libraries/RW/K8sApplications/test_data/node.log: -------------------------------------------------------------------------------- 1 | Error: ENOENT: no such file or directory, open 'missing-file.txt' 2 | at Object.openSync (fs.js:462:3) 3 | at Object.readFileSync (fs.js:364:35) 4 | at handleRequest (/server.js:15:18) 5 | at Layer.handle [as handle_request] (/node_modules/express/lib/router/layer.js:95:5) 6 | at next (/node_modules/express/lib/router/route.js:144:13) 7 | at Route.dispatch (/node_modules/express/lib/router/route.js:114:3) 8 | at Layer.handle [as handle_request] (/node_modules/express/lib/router/layer.js:95:5) 9 | at /node_modules/express/lib/router/index.js:284:15 10 | at Function.process_params (/node_modules/express/lib/router/index.js:346:12) 11 | at next (/node_modules/express/lib/router/index.js:280:10) -------------------------------------------------------------------------------- /libraries/RW/K8sHelper/__init__.py: -------------------------------------------------------------------------------- 1 | from .k8s_helper import * 2 | -------------------------------------------------------------------------------- /libraries/RW/K8sLog/__init__.py: -------------------------------------------------------------------------------- 1 | from .k8s_log import K8sLog 2 | 3 | __version__ = "1.0.0" -------------------------------------------------------------------------------- /libraries/RW/NextSteps/__init__.py: -------------------------------------------------------------------------------- 1 | from .Suggest import * 2 | -------------------------------------------------------------------------------- /libraries/RW/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This line is required so that we can have RW.Core in one directory 3 | and the other RW libs in other directories 4 | See - 
https://packaging.python.org/en/latest/guides/packaging-namespace-packages/ 5 | """ 6 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) -------------------------------------------------------------------------------- /libraries/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This line is required so that we can have RW.Core in one directory 3 | and the other RW libs in other directories 4 | See - https://packaging.python.org/en/latest/guides/packaging-namespace-packages/ 5 | """ 6 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=61.2", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "rw-cli-keywords" 10 | description = "A set of RunWhen published CLI keywords and python libraries for interacting with APIs using CLIs" 11 | readme = "README.md" 12 | license = { text = "Apache License 2.0" } 13 | authors = [ 14 | { name = "RunWhen", email = "info@runwhen.com" } 15 | ] 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: Apache Software License" 19 | ] 20 | # We declare these fields as dynamic because they come from external files 21 | dynamic = ["version", "dependencies"] 22 | 23 | [tool.setuptools.packages.find] 24 | where = ["libraries"] 25 | 26 | # Dynamically read the version from the VERSION file 27 | # and the dependencies from requirements.txt. 
28 | [tool.setuptools.dynamic] 29 | version = { file = "VERSION" } 30 | dependencies = { file = "requirements.txt" } 31 | 32 | [project.urls] 33 | homepage = "https://github.com/runwhen-contrib/rw-cli-codecollection" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | robotframework>=4.1.2 2 | jmespath>=1.0.1 3 | python-dateutil>=2.9.0 4 | requests>=2.31.0 5 | thefuzz>=0.20.0 6 | pyyaml>=6.0.1 7 | jinja2>=3.1.4 8 | tabulate>=0.9.0 --------------------------------------------------------------------------------