├── .ci-operator.yaml ├── .codecov.yml ├── .editorconfig ├── .gitattributes ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ └── auto-tidy-interceptor.yml ├── .gitignore ├── .golangci.yml ├── LICENSE ├── Makefile ├── OWNERS ├── OWNERS_ALIASES ├── README.md ├── boilerplate ├── _data │ ├── backing-image-tag │ └── last-boilerplate-commit ├── _lib │ ├── boilerplate-commit │ ├── boilerplate.mk │ ├── common.sh │ ├── container-make │ ├── freeze-check │ ├── release.sh │ ├── subscriber │ ├── subscriber-propose │ ├── subscriber-propose-update │ ├── subscriber-report │ ├── subscriber-report-onboarding │ ├── subscriber-report-pr │ ├── subscriber-report-release │ └── subscriber.sh ├── generated-includes.mk ├── openshift │ └── osd-container-image │ │ ├── .ci-operator.yaml │ │ ├── OWNERS_ALIASES │ │ ├── README.md │ │ ├── app-sre-build-push.sh │ │ ├── dependabot.yml │ │ ├── prow-config │ │ ├── standard.mk │ │ └── update ├── update └── update.cfg ├── build └── Dockerfile ├── cadctl ├── .gitignore ├── LICENSE ├── cmd │ ├── investigate │ │ └── investigate.go │ └── root.go └── main.go ├── dashboards └── grafana-dashboard-configuration-anomaly-detection.configmap.yaml ├── go.mod ├── go.sum ├── hack ├── bootstrap-investigation.sh └── codecov.sh ├── images ├── CadCat.png ├── cad_chgm_investigation │ ├── README.md │ ├── chgm_investigation.excalidraw │ ├── chgm_investigation_dark.png │ └── chgm_investigation_light.png └── cad_overview │ ├── cad_architecture.excalidraw │ ├── cad_architecture_dark.png │ └── cad_architecture_light.png ├── interceptor ├── README.md ├── go.mod ├── go.sum ├── main.go ├── pkg │ └── interceptor │ │ ├── metrics.go │ │ └── pdinterceptor.go └── test │ └── e2e.sh ├── openshift ├── PipelinePruning.md ├── README.md ├── assets │ └── cad_pipeline_pruning.drawio.png ├── gateway-template.yaml └── template.yaml ├── pkg ├── ai │ └── k8sgpt │ │ └── k8sgpt.go ├── aws │ ├── aws.go │ ├── aws_test.go │ └── mock │ │ └── aws.go ├── investigations │ ├── 
aitest │ │ ├── README.md │ │ ├── metadata.yaml │ │ └── testing │ │ │ └── README.md │ ├── apierrorbudgetburn │ │ ├── README.md │ │ ├── apierrorbudgetburn.go │ │ ├── metadata.yaml │ │ └── testing │ │ │ └── README.md │ ├── cannotretrieveupdatessre │ │ ├── README.md │ │ ├── cannotretrieveupdatessre.go │ │ ├── cannotretrieveupdatessre_test.go │ │ ├── metadata.yaml │ │ └── testing │ │ │ └── README.md │ ├── ccam │ │ ├── ccam.go │ │ └── ccam_test.go │ ├── chgm │ │ ├── README.md │ │ ├── chgm.go │ │ ├── chgm_hibernation_check.go │ │ ├── chgm_hibernation_check_test.go │ │ ├── chgm_suite_test.go │ │ ├── chgm_test.go │ │ ├── util.go │ │ └── util_test.go │ ├── clustermonitoringerrorbudgetburn │ │ ├── clustermonitoringerrorbudgetburn.go │ │ ├── clustermonitoringerrorbudgetburn_test.go │ │ └── metadata.yaml │ ├── cpd │ │ └── cpd.go │ ├── insightsoperatordown │ │ ├── insightsoperatordown.go │ │ ├── insightsoperatordown_test.go │ │ ├── metadata.yaml │ │ └── testing │ │ │ ├── README.md │ │ │ └── block-api-openshift.sh │ ├── investigation │ │ └── investigation.go │ ├── machinehealthcheckunterminatedshortcircuitsre │ │ ├── machinehealthcheckunterminatedshortcircuitsre.go │ │ ├── machinehealthcheckunterminatedshortcircuitsre_test.go │ │ ├── metadata.yaml │ │ ├── recommendation.go │ │ └── testing │ │ │ ├── README.md │ │ │ ├── srep-worker-healthcheck_machinehealthcheck.yaml │ │ │ ├── unstoppable_pdb.yaml │ │ │ └── unstoppable_workload.yaml │ ├── pruningcronjoberror │ │ └── metadata.yaml │ ├── registry.go │ ├── upgradeconfigsyncfailureover4hr │ │ ├── README.md │ │ ├── metadata.yaml │ │ ├── upgradeconfigsyncfailureover4hr.go │ │ └── upgradeconfigsyncfailureover4hr_test.go │ └── utils │ │ ├── machine │ │ ├── machine.go │ │ └── machine_test.go │ │ └── node │ │ ├── node.go │ │ └── node_test.go ├── k8s │ ├── client.go │ ├── errors.go │ ├── errors_test.go │ └── scheme.go ├── logging │ └── logging.go ├── managedcloud │ └── managedcloud.go ├── metrics │ ├── README.md │ └── metrics.go ├── 
networkverifier │ ├── networkverifier.go │ ├── networkverifier_suite_test.go │ └── networkverifier_test.go ├── notewriter │ ├── notewriter.go │ └── notewriter_test.go ├── ocm │ ├── mock │ │ └── ocmmock.go │ ├── ocm.go │ └── ocm_config.go ├── pagerduty │ ├── errors.go │ ├── mock │ │ └── pagerdutymock.go │ ├── pagerduty.go │ ├── pagerduty_suite_test.go │ ├── pagerduty_test.go │ └── types.go └── utils │ ├── utils.go │ └── utils_suite_test.go └── test ├── e2e ├── Dockerfile ├── configuration_anomaly_detection_runner_test.go ├── configuration_anomaly_detection_test.go ├── project.mk ├── test-e2e-suite-template.yml └── utils │ ├── aws.go │ ├── generate_incident.go │ └── utils.go ├── generate_incident.sh ├── launch_local_env.sh ├── set_stage_env.sh └── testinfra ├── haproxy.cfg └── tinyproxy.conf /.ci-operator.yaml: -------------------------------------------------------------------------------- 1 | build_root_image: 2 | name: boilerplate 3 | namespace: openshift 4 | tag: image-v7.3.0 5 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: no 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "20...100" 9 | 10 | status: 11 | project: no 12 | patch: no 13 | changes: no 14 | 15 | parsers: 16 | gcov: 17 | branch_detection: 18 | conditional: yes 19 | loop: yes 20 | method: no 21 | macro: no 22 | 23 | comment: 24 | layout: "reach,diff,flags,tree" 25 | behavior: default 26 | require_changes: no -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | 9 | [*.go] 10 | indent_style = tab 11 | tab_width = 4 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ### BEGIN BOILERPLATE GENERATED -- DO NOT EDIT ### 2 | ### This block must be the last thing in your ### 3 | ### .gitattributes file; otherwise the 'validate' ### 4 | ### CI check will fail. ### 5 | # Used to ensure nobody mucked with boilerplate files. 6 | boilerplate/_lib/freeze-check linguist-generated=false 7 | # Show the boilerplate commit hash update. It's only one line anyway. 8 | boilerplate/_data/last-boilerplate-commit linguist-generated=false 9 | # Used by freeze-check. Good place for attackers to inject badness. 10 | boilerplate/update linguist-generated=false 11 | # Make sure attackers can't hide changes to this configuration 12 | .gitattributes linguist-generated=false 13 | ### END BOILERPLATE GENERATED ### 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directories: 5 | - "/" 6 | - "interceptor/" 7 | allow: 8 | - dependency-type: all 9 | schedule: 10 | interval: "daily" 11 | - package-ecosystem: "docker" 12 | directory: "/build" 13 | labels: 14 | - "area/dependency" 15 | - "ok-to-test" 16 | schedule: 17 | interval: "weekly" 18 | ignore: 19 | - dependency-name: "redhat-services-prod/openshift/boilerplate" 20 | # don't upgrade boilerplate via these means 21 | - dependency-name: "openshift4/ose-operator-registry" 22 | # don't upgrade ose-operator-registry via these means 23 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### What type of PR is this? 
2 | 3 | (feature/bug/documentation/other) 4 | 5 | ### What this PR does / Why we need it? 6 | 7 | ### Special notes for your reviewer 8 | 9 | ### Test Coverage 10 | #### Guidelines for CAD investigations 11 | - New investigations should be accompanied by unit tests and/or step-by-step manual tests in the investigation README. 12 | - E2E testing is desired for actioning investigations. See README for more info on investigation graduation process. 13 | 14 | #### Test coverage checks 15 | - [ ] Added tests 16 | - [ ] Created jira card to add unit test 17 | - [ ] This PR may not need unit tests 18 | 19 | ### Pre-checks (if applicable) 20 | - [ ] Ran unit tests locally 21 | - [ ] Validated the changes in a cluster 22 | - [ ] Included documentation changes with PR 23 | -------------------------------------------------------------------------------- /.github/workflows/auto-tidy-interceptor.yml: -------------------------------------------------------------------------------- 1 | name: Auto tidy interceptor after cadctl changes 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, synchronize] 6 | paths: 7 | - 'go.mod' 8 | - 'go.sum' 9 | - '**/*.go' 10 | - '!interceptor/**' 11 | 12 | permissions: 13 | contents: write 14 | pull-requests: write 15 | 16 | jobs: 17 | tidy: 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Checkout PR branch 22 | uses: actions/checkout@v4 23 | with: 24 | ref: ${{ github.event.pull_request.head.ref }} 25 | repository: ${{ github.event.pull_request.head.repo.full_name }} 26 | token: ${{ secrets.GITHUB_TOKEN }} 27 | 28 | - name: Set up Go 29 | uses: actions/setup-go@v5 30 | with: 31 | go-version: stable 32 | 33 | - name: Run go mod tidy in interceptor 34 | working-directory: interceptor 35 | run: go mod tidy 36 | 37 | - name: Check for changes when run go mod tidy in interceptor 38 | id: diffcheck 39 | run: | 40 | if [[ -n "$(git status --porcelain interceptor/go.mod interceptor/go.sum)" ]]; then 41 | echo "changes=true" >> $GITHUB_OUTPUT
42 | else 43 | echo "changes=false" >> $GITHUB_OUTPUT 44 | fi 45 | 46 | - name: Commit and push if there are changes 47 | if: steps.diffcheck.outputs.changes == 'true' 48 | uses: EndBug/add-and-commit@v9 49 | with: 50 | message: "On PR: tidy interceptor go.mod after cadctl go.mod update" 51 | add: "interceptor/go.mod interceptor/go.sum" 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | dist 3 | *.out 4 | .docker 5 | .envrc 6 | .idea 7 | .vscode 8 | cad_testing 9 | e2e-suite.test 10 | payload 11 | test/testinfra/*.log 12 | test/testinfra/*.pem 13 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | run: 3 | modules-download-mode: readonly 4 | output: 5 | path-prefix: "" 6 | linters: 7 | default: none 8 | enable: 9 | - asasalint 10 | - asciicheck 11 | - bidichk 12 | - bodyclose 13 | - decorder 14 | - dupword 15 | - durationcheck 16 | - errcheck 17 | - errchkjson 18 | - errname 19 | - errorlint 20 | - ginkgolinter 21 | - gocheckcompilerdirectives 22 | - goconst 23 | - gocritic 24 | - gocyclo 25 | - goheader 26 | - gomodguard 27 | - gosec 28 | - govet 29 | - grouper 30 | - importas 31 | - ineffassign 32 | - loggercheck 33 | - maintidx 34 | - makezero 35 | - misspell 36 | - nestif 37 | - nilerr 38 | - nilnil 39 | - noctx 40 | - nolintlint 41 | - nosprintfhostport 42 | - prealloc 43 | - predeclared 44 | - promlinter 45 | - reassign 46 | - revive 47 | - rowserrcheck 48 | - staticcheck 49 | - thelper 50 | - tparallel 51 | - unconvert 52 | - unused 53 | - usestdlibvars 54 | - wastedassign 55 | - whitespace 56 | settings: 57 | nestif: 58 | min-complexity: 10 59 | revive: 60 | rules: 61 | - name: dot-imports 62 | arguments: 63 | - allowedPackages: 64 | - github.com/onsi/ginkgo/v2 65 | - 
github.com/onsi/gomega 66 | severity: warning 67 | disabled: false 68 | exclude: 69 | - "" 70 | exclusions: 71 | generated: lax 72 | paths: 73 | - third_party$ 74 | - builtin$ 75 | - examples$ 76 | formatters: 77 | enable: 78 | - gofmt 79 | - gofumpt 80 | - goimports 81 | exclusions: 82 | generated: lax 83 | paths: 84 | - third_party$ 85 | - builtin$ 86 | - examples$ 87 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_REGISTRY?=quay.io 2 | IMAGE_REPOSITORY?=app-sre 3 | IMAGE_NAME?=configuration-anomaly-detection 4 | DOCKERFILE?=./build/Dockerfile 5 | define ADDITIONAL_IMAGE_SPECS 6 | ./build/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/$(IMAGE_NAME):$(CURRENT_COMMIT) 7 | endef 8 | 9 | include boilerplate/generated-includes.mk 10 | include test/e2e/project.mk 11 | 12 | GOLANGCI_LINT_VERSION=v2.0.2 13 | MOCKGEN_VERSION=v0.5.0 14 | 15 | .DEFAULT_GOAL := all 16 | 17 | help: # Display this help 18 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[0-9A-Za-z_-]+:.*?##/ { printf " \033[36m%-50s\033[0m %s\n", $$1, $$2 } /^\$$\([0-9A-Za-z_-]+\):.*?##/ { gsub("_","-", $$1); printf " \033[36m%-50s\033[0m %s\n", tolower(substr($$1, 3, length($$1)-7)), $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 19 | 20 | ##@ Global: 21 | .PHONY: all 22 | all: interceptor cadctl ## Generate, build, lint, test all subprojects 23 | 24 | .PHONY: build 25 | build: build-interceptor build-cadctl ## Build all subprojects in this repository 26 | 27 | .PHONY: lint 28 | lint: lint-cadctl lint-interceptor ## Lint all subprojects 29 | 30 | .PHONY: test 31 | test: test-cadctl test-interceptor 32 | 33 | ##@ cadctl: 34 | .PHONY: cadctl 35 | cadctl: generate-cadctl build-cadctl test-cadctl lint-cadctl ## Run all targets for cadctl (generate, build, test, lint, generation) 36 | 37 | .PHONY: generate-cadctl 38 | 
generate-cadctl: check-go121-install install-mockgen ## Generate mocks for cadctl 39 | go generate -mod=readonly ./... 40 | 41 | .PHONY: build-cadctl 42 | build-cadctl: check-go121-install ## Build the cadctl binary 43 | @echo 44 | @echo "Building cadctl..." 45 | cd cadctl && go build -ldflags="-s -w" -mod=readonly -trimpath -o ../bin/cadctl . 46 | 47 | .PHONY: lint-cadctl 48 | lint-cadctl: install-linter ## Lint cadctl subproject 49 | @echo 50 | @echo "Linting cadctl..." 51 | # Explicitly set GOROOT, see https://github.com/golangci/golangci-lint/issues/3107 52 | GOROOT=$$(go env GOROOT) GOLANGCI_LINT_CACHE=$$(mktemp -d) $(GOPATH)/bin/golangci-lint run -c .golangci.yml 53 | 54 | .PHONY: test-cadctl 55 | test-cadctl: check-go121-install ## Run automated tests for cadctl 56 | @echo 57 | @echo "Running unit tests for cadctl..." 58 | go test $(TESTOPTS) -race -mod=readonly ./cadctl/... ./pkg/... 59 | 60 | ##@ Interceptor: 61 | .PHONY: interceptor 62 | interceptor: build-interceptor test-interceptor test-interceptor-e2e lint-interceptor ## Run all targets for interceptor (build, test, lint) 63 | 64 | .PHONY: build-interceptor 65 | build-interceptor: check-go121-install ## Build the interceptor binary 66 | @echo 67 | @echo "Building interceptor..." 68 | cd interceptor && go build -ldflags="-s -w" -mod=readonly -trimpath -o ../bin/interceptor . 69 | 70 | .PHONY: lint-interceptor 71 | lint-interceptor: install-linter ## Lint interceptor subproject 72 | @echo 73 | @echo "Linting interceptor..." 74 | # Explicitly set GOROOT, see https://github.com/golangci/golangci-lint/issues/3107 75 | cd interceptor && GOROOT=$$(go env GOROOT) GOLANGCI_LINT_CACHE=$$(mktemp -d) $(GOPATH)/bin/golangci-lint run -c ../.golangci.yml 76 | 77 | .PHONY: test-interceptor 78 | test-interceptor: check-go121-install check-jq-install build-interceptor ## Run unit tests for interceptor 79 | @echo 80 | @echo "Running unit tests for interceptor..." 81 | cd interceptor && go test -race -mod=readonly ./... 
82 | 83 | .PHONY: test-interceptor-e2e 84 | test-interceptor-e2e: check-go121-install check-jq-install check-vault-install build-interceptor ## Run e2e tests for interceptor 85 | @echo 86 | @echo "Running e2e tests for interceptor..." 87 | cd interceptor && ./test/e2e.sh 88 | 89 | ##@ Boilerplate: 90 | .PHONY: boilerplate 91 | bootstrap-investigation: ## Bootstrap a new boilerplate investigation 92 | @cd hack && ./bootstrap-investigation.sh 93 | 94 | 95 | .PHONY: boilerplate-update 96 | boilerplate-update: ## Update boilerplate version 97 | @boilerplate/update 98 | 99 | ### CI Only 100 | .PHONY: coverage 101 | coverage: 102 | hack/codecov.sh 103 | 104 | .PHONY: validate 105 | validate: isclean 106 | 107 | ### Prerequisites 108 | ### It is assumed that 'make' is already installed 109 | ### Version of go is checked but the version the tools are not checked as this should not matter much. 110 | .PHONY: check-%-install 111 | check-%-install: 112 | @type $* 1> /dev/null || (>&2 echo && echo "'$*' IS NOT INSTALLED - install it manually" && echo && false) 113 | 114 | .PHONY: check-go121-install 115 | check-go121-install: 116 | @(type go 1> /dev/null && go version | grep -q 'go[1-9].[2-9][1-9]') || (>&2 echo && echo "'go' WITH VERSION >= 1.21 IS NOT INSTALLED - install it manually" && echo && false) 117 | 118 | .PHONY: install-linter 119 | install-linter: check-curl-install check-go121-install 120 | @ls $(GOPATH)/bin/golangci-lint 1>/dev/null || (echo && echo "Installing 'golangci-lint'..." && mkdir -p $(GOPATH)/bin && curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GOPATH)/bin $(GOLANGCI_LINT_VERSION)) 121 | 122 | .PHONY: install-mockgen 123 | install-mockgen: check-go121-install 124 | @type mockgen 1> /dev/null || (echo && echo "Installing 'mockgen'..." 
&& go install go.uber.org/mock/mockgen@$(MOCKGEN_VERSION)) 125 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | reviewers: 2 | - Makdaam 3 | - Nikokolas3270 4 | - rafael-azevedo 5 | - RaphaelBut 6 | - bng0y 7 | - typeid 8 | - tnierman 9 | - zmird-r 10 | - joshbranham 11 | - MateSaary 12 | - srep-functional-team-orange 13 | approvers: 14 | - Makdaam 15 | - Nikokolas3270 16 | - rafael-azevedo 17 | - RaphaelBut 18 | - bng0y 19 | - typeid 20 | - tnierman 21 | - zmird-r 22 | - joshbranham 23 | - MateSaary 24 | - srep-functional-team-orange 25 | - srep-team-leads 26 | maintainers: 27 | - rafael-azevedo 28 | -------------------------------------------------------------------------------- /OWNERS_ALIASES: -------------------------------------------------------------------------------- 1 | # ================================ DO NOT EDIT ================================ 2 | # This file is managed in https://github.com/openshift/boilerplate 3 | # See the OWNERS_ALIASES docs: https://git.k8s.io/community/contributors/guide/owners.md#OWNERS_ALIASES 4 | # ============================================================================= 5 | aliases: 6 | srep-functional-team-aurora: 7 | - abyrne55 8 | - dakotalongRH 9 | - joshbranham 10 | - luis-falcon 11 | - reedcort 12 | srep-functional-team-fedramp: 13 | - tonytheleg 14 | - theautoroboto 15 | - rhdedgar 16 | - katherinelc321 17 | - rojasreinold 18 | - fsferraz-rh 19 | srep-functional-team-hulk: 20 | - a7vicky 21 | - ravitri 22 | - shitaljante 23 | - devppratik 24 | - Tafhim 25 | - tkong-redhat 26 | - TheUndeadKing 27 | - vaidehi411 28 | - chamalabey 29 | srep-functional-team-orange: 30 | - bergmannf 31 | - Makdaam 32 | - Nikokolas3270 33 | - RaphaelBut 34 | - MateSaary 35 | - rolandmkunkel 36 | - petrkotas 37 | - zmird-r 38 | - evlin-rh 39 | - hectorakemp 40 | srep-functional-team-rocket: 41 | - aliceh 42 | 
- anispate 43 | - clcollins 44 | - Mhodesty 45 | - nephomaniac 46 | - tnierman 47 | srep-functional-team-security: 48 | - jaybeeunix 49 | - sam-nguyen7 50 | - wshearn 51 | - dem4gus 52 | - npecka 53 | - pshickeydev 54 | - casey-williams-rh 55 | - boranx 56 | srep-functional-team-thor: 57 | - bmeng 58 | - MitaliBhalla 59 | - feichashao 60 | - samanthajayasinghe 61 | - xiaoyu74 62 | - Dee-6777 63 | - Tessg22 64 | - smarthall 65 | srep-infra-cicd: 66 | - mmazur 67 | - mrsantamaria 68 | - ritmun 69 | - jbpratt 70 | - yiqinzhang 71 | srep-functional-leads: 72 | - abyrne55 73 | - clcollins 74 | - Nikokolas3270 75 | - theautoroboto 76 | - smarthall 77 | - sam-nguyen7 78 | - ravitri 79 | srep-team-leads: 80 | - rafael-azevedo 81 | - iamkirkbater 82 | - rogbas 83 | - fahlmant 84 | - dustman9000 85 | - wanghaoran1988 86 | - bng0y 87 | - bmeng 88 | - typeid 89 | sre-group-leads: 90 | - apahim 91 | - maorfr 92 | - rogbas 93 | srep-architects: 94 | - jharrington22 95 | - cblecker 96 | -------------------------------------------------------------------------------- /boilerplate/_data/backing-image-tag: -------------------------------------------------------------------------------- 1 | image-v7.3.0 2 | -------------------------------------------------------------------------------- /boilerplate/_data/last-boilerplate-commit: -------------------------------------------------------------------------------- 1 | 933276b05c4d7c6a049aad2a1b291de3281b1a7b 2 | -------------------------------------------------------------------------------- /boilerplate/_lib/boilerplate.mk: -------------------------------------------------------------------------------- 1 | .PHONY: boilerplate-commit 2 | boilerplate-commit: 3 | @boilerplate/_lib/boilerplate-commit 4 | 5 | .PHONY: boilerplate-freeze-check 6 | boilerplate-freeze-check: 7 | @boilerplate/_lib/freeze-check 8 | -------------------------------------------------------------------------------- /boilerplate/_lib/container-make: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ "$1" == "-h"* ]] || [[ "$1" == "--h"* ]]; then 4 | echo "Usage: $0 {arguments to the real 'make'}" 5 | echo "Runs 'make' in the boilerplate backing container." 6 | echo "If the command fails, starts a shell in the container so you can debug." 7 | exit -1 8 | fi 9 | 10 | source ${0%/*}/common.sh 11 | 12 | CONTAINER_ENGINE="${CONTAINER_ENGINE:-$(command -v podman || command -v docker)}" 13 | [[ -n "$CONTAINER_ENGINE" ]] || err "Couldn't find a container engine. Are you already in a container?" 14 | 15 | # Make sure the mount inside the container is named in such a way that 16 | # - openapi-gen (which relies on GOPATH) produces absolute paths; and 17 | # - other go-ish paths are writeable, e.g. for `go mod download`. 18 | CONTAINER_MOUNT=/go/src/$(repo_import $REPO_ROOT) 19 | 20 | # First set up a detached container with the repo mounted. 21 | banner "Starting the container" 22 | CE_OPTS="--platform=linux/amd64" 23 | if [[ "${CONTAINER_ENGINE##*/}" == "podman" ]]; then 24 | CE_OPTS="${CE_OPTS} --userns keep-id" 25 | fi 26 | if [[ "${CONTAINER_ENGINE##*/}" == "podman" ]] && [[ $OSTYPE == *"linux"* ]]; then 27 | CE_OPTS="${CE_OPTS} -v $REPO_ROOT:$CONTAINER_MOUNT:Z" 28 | else 29 | CE_OPTS="${CE_OPTS} -v $REPO_ROOT:$CONTAINER_MOUNT" 30 | fi 31 | container_id=$($CONTAINER_ENGINE run -d ${CE_OPTS} $IMAGE_PULL_PATH sleep infinity) 32 | 33 | if [[ $? -ne 0 ]] || [[ -z "$container_id" ]]; then 34 | err "Couldn't start detached container" 35 | fi 36 | 37 | # Now run our `make` command in it with the right UID and working directory 38 | args="exec -it -u $(id -u):0 -w $CONTAINER_MOUNT $container_id" 39 | banner "Running: make $@" 40 | $CONTAINER_ENGINE $args make "$@" 41 | rc=$? 42 | 43 | # If it failed, drop into the container in a shell 44 | if [[ $rc -ne 0 ]]; then 45 | banner "The 'make' command failed! Starting a shell in the container for debugging. 
Just 'exit' when done." 46 | $CONTAINER_ENGINE $args /bin/bash 47 | fi 48 | 49 | # Finally, remove the container 50 | banner "Cleaning up the container" 51 | $CONTAINER_ENGINE rm -f $container_id >/dev/null 52 | -------------------------------------------------------------------------------- /boilerplate/_lib/freeze-check: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # NOTE: For security reasons, everything imported or invoked (even 4 | # indirectly) by this script should be audited for vulnerabilities and 5 | # explicitly excluded from `linguist-generated` in the consuming 6 | # repository's .gitattributes. In other words, we want PRs to show 7 | # deltas to this script and all its dependencies by default so that 8 | # attempts to inject or circumvent code are visible. 9 | 10 | set -e 11 | 12 | REPO_ROOT=$(git rev-parse --show-toplevel) 13 | # Hardcoded rather than sourced to reduce attack surface. 14 | BOILERPLATE_GIT_REPO=https://github.com/openshift/boilerplate.git 15 | 16 | # Validate that no subscribed boilerplate artifacts have been changed. 17 | # PR checks may wish to gate on this. 18 | 19 | # This works by grabbing the commit hash of the boilerplate repository 20 | # at which the last update was applied, running the main `update` driver 21 | # against that, and failing if there's a resulting diff. 22 | 23 | # If we can't tell what that commit was, we must assume this is the 24 | # first update, and we'll (noisily) "succeed". 25 | 26 | # Note that this ought to work when you've just committed an update, 27 | # even if you've changed your update.cfg beforehand. We're basically 28 | # making sure you didn't muck with anything after updating. 29 | 30 | # For this to work, you have to be starting from a clean repository 31 | # state (any changes committed). 
32 | # TODO(efried): This is not ideal -- it would be nice if I could check 33 | # this before committing my changes -- but how would that work? Diff to 34 | # a file, create a temporary commit, run the rest, remove the commit, 35 | # and reapply the diff? Messy and error-prone -- and I would be 36 | # seriously ticked off if something went wrong and lost my in-flight 37 | # changes. 38 | if ! [ -z "$(git status --porcelain -- ':!build/Dockerfile*')" ]; then 39 | echo "Can't validate boilerplate in a dirty repository. Please commit your changes and try again." >&2 40 | exit 1 41 | fi 42 | 43 | # We glean the last boilerplate commit from the 44 | # last-boilerplate-commit file, which gets laid down by the main 45 | # `update` driver each time it runs. 46 | LBCF=${REPO_ROOT}/boilerplate/_data/last-boilerplate-commit 47 | if ! [[ -f "$LBCF" ]]; then 48 | echo "Couldn't discover last boilerplate commit! Assuming you're bootstrapping." 49 | exit 0 50 | fi 51 | LBC=$(cat $LBCF) 52 | 53 | # Download just that commit 54 | echo "Fetching $LBC from $BOILERPLATE_GIT_REPO" 55 | # boilerplate/update cleans up this temp dir 56 | TMPD=$(mktemp -d) 57 | cd $TMPD 58 | git init 59 | # TODO(efried): DRY this remote. Make it configurable? 60 | git remote add origin $BOILERPLATE_GIT_REPO 61 | git fetch origin $(cat $LBCF) --tags 62 | git reset --hard FETCH_HEAD 63 | 64 | # Now invoke the update script, overriding the source repository we've 65 | # just downloaded at the appropriate commit. 66 | # We invoke the script explicitly rather than via the make target to 67 | # close a security hole whereby the latter is overridden. 68 | echo "Running update" 69 | cd $REPO_ROOT 70 | BOILERPLATE_GIT_REPO="${TMPD}" boilerplate/update 71 | 72 | # Okay, if anything has changed, that's bad. 73 | if [[ $(git status --porcelain -- ':!build/Dockerfile*' | wc -l) -ne 0 ]]; then 74 | echo "Your boilerplate is dirty!" 
>&2 75 | git status --porcelain -- ':!build/Dockerfile*' 76 | exit 1 77 | fi 78 | 79 | echo "Your boilerplate is clean!" 80 | exit 0 81 | -------------------------------------------------------------------------------- /boilerplate/_lib/release.sh: -------------------------------------------------------------------------------- 1 | # Helpers and variables for dealing with openshift/release 2 | 3 | # NOTE: This library is sourced from user-run scripts. It should not be 4 | # sourced in CI, as it relies on git config that's not necessarily 5 | # present there. 6 | 7 | RELEASE_REPO=openshift/release 8 | 9 | ## Information about the boilerplate consumer 10 | # E.g. "openshift/my-wizbang-operator" 11 | CONSUMER=$(repo_name .) 12 | [[ -z "$CONSUMER" ]] && err " 13 | Failed to determine current repository name" 14 | # 15 | # E.g. "openshift" 16 | CONSUMER_ORG=${CONSUMER%/*} 17 | [[ -z "$CONSUMER_ORG" ]] && err " 18 | Failed to determine consumer org" 19 | # 20 | # E.g. "my-wizbang-operator" 21 | CONSUMER_NAME=${CONSUMER#*/} 22 | [[ -z "$CONSUMER_NAME" ]] && err " 23 | Failed to determine consumer name" 24 | # 25 | # E.g. "master" 26 | # This will produce something like refs/remotes/origin/master 27 | DEFAULT_BRANCH=$(git symbolic-ref refs/remotes/upstream/HEAD 2>/dev/null || git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null || echo defaulting/to/master) 28 | # Strip off refs/remotes/{upstream|origin}/ 29 | DEFAULT_BRANCH=${DEFAULT_BRANCH##*/} 30 | [[ -z "$DEFAULT_BRANCH" ]] && err " 31 | Failed to determine default branch name" 32 | 33 | ## release_process_args "$@" 34 | # 35 | # This is for use by commands expecting one optional argument which is 36 | # the file system path to a clone of the $RELEASE_REPO. 37 | # 38 | # Will invoke `usage` -- which must be defined by the caller -- if 39 | # the wrong number of arguments are received, or if the single argument 40 | # is `help` or a flag. 
41 | # 42 | # If exactly one argument is specified and it is valid, it is assigned 43 | # to the global RELEASE_CLONE variable. 44 | release_process_args() { 45 | if [[ $# -eq 1 ]]; then 46 | # Special cases for usage queries 47 | if [[ "$1" == '-'* ]] || [[ "$1" == help ]]; then 48 | usage 49 | fi 50 | 51 | [[ -d $1 ]] || err " 52 | $1: Not a directory." 53 | 54 | [[ $(repo_name $1) == "$RELEASE_REPO" ]] || err " 55 | $1 is not a clone of $RELEASE_REPO; or its 'origin' remote is not set properly." 56 | 57 | # Got a usable clone of openshift/release 58 | RELEASE_CLONE="$1" 59 | 60 | elif [[ $# -ne 0 ]]; then 61 | usage 62 | fi 63 | } 64 | 65 | ## release_validate_invocation 66 | # 67 | # Make sure we were called from a reasonable place, that being: 68 | # - A boilerplate consumer 69 | # - ...that's actually subscribed to a convention 70 | # - ...containing the script being invoked 71 | release_validate_invocation() { 72 | # Make sure we were invoked from a boilerplate consumer. 73 | [[ -z "$CONVENTION_NAME" ]] && err " 74 | $cmd must be invoked from a consumer of an appropriate convention. Where did you get this script from?" 75 | # Or at least not from boilerplate itself 76 | [[ "$CONSUMER" == "openshift/boilerplate" ]] && err " 77 | $cmd must be invoked from a boilerplate consumer, not from boilerplate itself." 78 | 79 | [[ -s $CONVENTION_ROOT/_data/last-boilerplate-commit ]] || err " 80 | $cmd must be invoked from a boilerplate consumer!" 81 | 82 | grep -E -q "^$CONVENTION_NAME(\s.*)?$" $CONVENTION_ROOT/update.cfg || err " 83 | $CONSUMER is not subscribed to $CONVENTION_NAME!" 84 | } 85 | 86 | ## release_prep_clone 87 | # 88 | # If $RELEASE_CLONE is already set: 89 | # - It should represent a directory containing a clean checkout of the 90 | # release repository; otherwise we error. 91 | # - We checkout and pull master. 92 | # Otherwise: 93 | # - We clone the release repo to a temporary directory. 
94 | # - We set the $RELEASE_CLONE global variable to point to that 95 | # directory. 96 | release_prep_clone() { 97 | # If a release repo clone wasn't specified, create one 98 | if [[ -z "$RELEASE_CLONE" ]]; then 99 | RELEASE_CLONE=$(mktemp -dt openshift_release_XXXXXXX) 100 | git clone --depth=1 git@github.com:${RELEASE_REPO}.git $RELEASE_CLONE 101 | else 102 | [[ -z "$(git -C $RELEASE_CLONE status --porcelain)" ]] || err " 103 | Your release clone must start clean." 104 | # These will blow up if it's misconfigured 105 | git -C $RELEASE_CLONE checkout master 106 | git -C $RELEASE_CLONE pull 107 | fi 108 | } 109 | 110 | ## release_done_msg BRANCH 111 | # 112 | # Print exit instructions for submitting the release PR. 113 | # BRANCH is a suggested branch name. 114 | release_done_msg() { 115 | echo 116 | git status 117 | 118 | cat < $TMPD/$f 46 | echo $TMPD/$f 47 | return 48 | fi 49 | done 50 | } 51 | 52 | ## expected_prow_config ORG PROJ BRANCH 53 | # 54 | # Prints to stdout the expected prow configuration for the specified 55 | # ORG/PROJ. 56 | expected_prow_config() { 57 | local org=$1 58 | local consumer_name=$2 59 | local branch=$3 60 | # TODO: DRY this with what's in prow-config. 61 | # Do it by making it a template in the convention dir. 62 | cat < Note: The repository's main `Makefile` needs to be edited to have the following line: 6 | 7 | ```make 8 | include boilerplate/generated-includes.mk 9 | ``` 10 | 11 | ## `make` targets and functions 12 | 13 | The provided `Makefile` will build and push a container image defined by a Dockerfile at `build/Dockerfile`. 
If multiple containers are contained in the repo, they can also be managed by defining an `ADDITIONAL_IMAGE_SPECS` variable like so: 14 | 15 | ```make 16 | define ADDITIONAL_IMAGE_SPECS 17 | ./path/to/a/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/a-image:v1.2.3 18 | ./path/to/b/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/b-image:v4.5.6 19 | endef 20 | ``` 21 | 22 | | Makefile target | Description | 23 | |---|---| 24 | | `make osd-container-image-build` | Build the default container at `build/Dockerfile` and tag it based on the commit. Specify `DOCKERFILE` and `IMAGE_URI` to build other containers. | 25 | | `make osd-container-image-push` | Push the default container. | 26 | | `make osd-container-image-build-push` | Build and push the default container and `ADDITIONAL_IMAGE_SPECS`. Meant to be run by app-interface. | 27 | | `make isclean` | Ensure the local git checkout is clean. | 28 | | `make prow-config` | Updates the corresponding Prow config file in [openshift/release](https://github.com/openshift/release) to run `make test` on merge requests. This `test` make target should be defined by the consumer. If this is a new repository it should be onboarded to openshift/release first before this is run. | 29 | 30 | ## Linting/Testing 31 | 32 | This boilerplate convention does not contain any linting or testing guidelines to support a variety of containers. Those `Makefile` targets should be defined by the consumer themselves. 
33 | -------------------------------------------------------------------------------- /boilerplate/openshift/osd-container-image/app-sre-build-push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ev 4 | 5 | usage() { 6 | cat < $config_dir/$config 54 | build_root: 55 | from_repository: true 56 | images: 57 | - dockerfile_path: build/Dockerfile 58 | to: unused 59 | resources: 60 | '*': 61 | limits: 62 | memory: 4Gi 63 | requests: 64 | cpu: 100m 65 | memory: 200Mi 66 | tests: 67 | - as: test 68 | commands: make test 69 | container: 70 | from: src 71 | zz_generated_metadata: 72 | branch: ${DEFAULT_BRANCH} 73 | org: ${CONSUMER_ORG} 74 | repo: ${CONSUMER_NAME} 75 | EOF 76 | 77 | make jobs 78 | 79 | release_done_msg $release_branch 80 | -------------------------------------------------------------------------------- /boilerplate/openshift/osd-container-image/standard.mk: -------------------------------------------------------------------------------- 1 | # Validate variables in project.mk exist 2 | IMAGE_REGISTRY?=quay.io 3 | IMAGE_REPOSITORY?=app-sre 4 | REGISTRY_USER?=$(QUAY_USER) 5 | REGISTRY_TOKEN?=$(QUAY_TOKEN) 6 | 7 | VERSION_MAJOR?=0 8 | VERSION_MINOR?=1 9 | 10 | ifndef IMAGE_NAME 11 | $(error IMAGE_NAME is not set) 12 | endif 13 | 14 | ### Accommodate docker or podman 15 | # 16 | # The docker/podman creds cache needs to be in a location unique to this 17 | # invocation; otherwise it could collide across jenkins jobs. We'll use 18 | # a .docker folder relative to pwd (the repo root). 19 | CONTAINER_ENGINE_CONFIG_DIR = .docker 20 | # But docker and podman use different options to configure it :eyeroll: 21 | # ==> Podman uses --authfile=PATH *after* the `login` subcommand; but 22 | # also accepts REGISTRY_AUTH_FILE from the env. 
See 23 | # https://www.mankier.com/1/podman-login#Options---authfile=path 24 | export REGISTRY_AUTH_FILE = ${CONTAINER_ENGINE_CONFIG_DIR}/config.json 25 | # ==> Docker uses --config=PATH *before* (any) subcommand; so we'll glue 26 | # that to the CONTAINER_ENGINE variable itself. (NOTE: I tried half a 27 | # dozen other ways to do this. This was the least ugly one that actually 28 | # works.) 29 | ifndef CONTAINER_ENGINE 30 | CONTAINER_ENGINE=$(shell command -v podman 2>/dev/null || echo docker --config=$(CONTAINER_ENGINE_CONFIG_DIR)) 31 | endif 32 | 33 | # Generate version and tag information from inputs 34 | COMMIT_NUMBER=$(shell git rev-list `git rev-list --parents HEAD | grep -E "^[a-f0-9]{40}$$"`..HEAD --count) 35 | CURRENT_COMMIT=$(shell git rev-parse --short=7 HEAD) 36 | IMAGE_VERSION := $(VERSION_MAJOR).$(VERSION_MINOR).$(COMMIT_NUMBER)-$(CURRENT_COMMIT) 37 | 38 | IMAGE=$(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/$(IMAGE_NAME) 39 | IMAGE_TAG=v$(IMAGE_VERSION) 40 | IMAGE_URI?=$(IMAGE):$(IMAGE_TAG) 41 | DOCKERFILE ?=./build/Dockerfile 42 | 43 | 44 | # Consumer can optionally define ADDITIONAL_IMAGE_SPECS like: 45 | # define ADDITIONAL_IMAGE_SPECS 46 | # ./path/to/a/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/a-image:v1.2.3 47 | # ./path/to/b/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/b-image:v4.5.6 48 | # endef 49 | # Each will be conditionally built and pushed along with the default image. 50 | define IMAGES_TO_BUILD 51 | $(DOCKERFILE) $(IMAGE_URI) 52 | $(ADDITIONAL_IMAGE_SPECS) 53 | endef 54 | export IMAGES_TO_BUILD 55 | 56 | REGISTRY_USER ?= 57 | REGISTRY_TOKEN ?= 58 | 59 | ALLOW_DIRTY_CHECKOUT?=false 60 | 61 | # TODO: Figure out how to discover this dynamically 62 | CONVENTION_DIR := boilerplate/openshift/osd-container-image 63 | 64 | # Set the default goal in a way that works for older & newer versions of `make`: 65 | # Older versions (<=3.8.0) will pay attention to the `default` target. 
66 | # Newer versions pay attention to .DEFAULT_GOAL, where unsetting it makes the next defined target the default: 67 | # https://www.gnu.org/software/make/manual/make.html#index-_002eDEFAULT_005fGOAL-_0028define-default-goal_0029 68 | .DEFAULT_GOAL := 69 | .PHONY: default 70 | default: osd-container-image-build 71 | 72 | .PHONY: isclean 73 | isclean: 74 | @(test "$(ALLOW_DIRTY_CHECKOUT)" != "false" || test 0 -eq $$(git status --porcelain | wc -l)) || (echo "Local git checkout is not clean, commit changes and try again." >&2 && git --no-pager diff && exit 1) 75 | 76 | .PHONY: osd-container-image-build 77 | osd-container-image-build: isclean 78 | ${CONTAINER_ENGINE} build --pull -f $(DOCKERFILE) -t $(IMAGE_URI) . 79 | 80 | .PHONY: osd-container-image-push 81 | osd-container-image-push: osd-container-image-login osd-container-image-build 82 | ${CONTAINER_ENGINE} push ${IMAGE_URI} 83 | 84 | .PHONY: prow-config 85 | prow-config: 86 | ${CONVENTION_DIR}/prow-config ${RELEASE_CLONE} 87 | 88 | 89 | ######################### 90 | # Targets used by app-sre 91 | ######################### 92 | 93 | .PHONY: osd-container-image-login 94 | osd-container-image-login: 95 | @test "${REGISTRY_USER}" != "" && test "${REGISTRY_TOKEN}" != "" || (echo "REGISTRY_USER and REGISTRY_TOKEN must be defined" && exit 1) 96 | mkdir -p ${CONTAINER_ENGINE_CONFIG_DIR} 97 | @${CONTAINER_ENGINE} login -u="${REGISTRY_USER}" -p="${REGISTRY_TOKEN}" quay.io 98 | 99 | # TODO: figure out how to osd-container-image-login only once across multiple `make` calls 100 | .PHONY: osd-container-image-build-push-one 101 | osd-container-image-build-push-one: isclean osd-container-image-login 102 | @(if [[ -z "${IMAGE_URI}" ]]; then echo "Must specify IMAGE_URI"; exit 1; fi) 103 | @(if [[ -z "${DOCKERFILE_PATH}" ]]; then echo "Must specify DOCKERFILE_PATH"; exit 1; fi) 104 | ${CONTAINER_ENGINE} build --pull -f $(DOCKERFILE_PATH) -t $(IMAGE_URI) .
105 | ${CONTAINER_ENGINE} push ${IMAGE_URI} 106 | 107 | # build-push: Construct, tag, and push all container images. 108 | # TODO: Boilerplate this script. 109 | .PHONY: osd-container-image-build-push 110 | osd-container-image-build-push: 111 | ${CONVENTION_DIR}/app-sre-build-push.sh "$$IMAGES_TO_BUILD" 112 | -------------------------------------------------------------------------------- /boilerplate/openshift/osd-container-image/update: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | source $CONVENTION_ROOT/_lib/common.sh 6 | 7 | # No PRE 8 | [[ "$1" == "PRE" ]] && exit 0 9 | 10 | # Expect POST 11 | [[ "$1" == "POST" ]] || err "Got a parameter I don't understand: '$1'. Did the infrastructure change?" 12 | 13 | # Add OWNERS_ALIASES to $REPO_ROOT 14 | echo "Copying OWNERS_ALIASES to your repository root." 15 | cp ${HERE}/OWNERS_ALIASES $REPO_ROOT 16 | 17 | # Add dependabot configuration 18 | mkdir -p $REPO_ROOT/.github 19 | echo "Copying dependabot.yml to .github/dependabot.yml" 20 | cp ${HERE}/dependabot.yml ${REPO_ROOT}/.github/dependabot.yml 21 | 22 | echo "Writing .ci-operator.yaml in your repository root with:" 23 | echo " namespace: $IMAGE_NAMESPACE" 24 | echo " name: $IMAGE_NAME" 25 | echo " tag: $LATEST_IMAGE_TAG" 26 | ${SED?} "s/__NAMESPACE__/$IMAGE_NAMESPACE/; s/__NAME__/$IMAGE_NAME/; s/__TAG__/$LATEST_IMAGE_TAG/" ${HERE}/.ci-operator.yaml > $REPO_ROOT/.ci-operator.yaml 27 | 28 | cat <<'EOF' 29 | 30 | ===================== 31 | THINGS YOU NEED TO DO 32 | ===================== 33 | - Make sure the following line is in your base Makefile: 34 | 35 | include boilerplate/generated-includes.mk 36 | 37 | - Remove any other 'include' lines, unless they're for things truly 38 | unique to your repository. (Otherwise, consider proposing them to 39 | boilerplate.) 40 | 41 | - Delete any obsolete files you're no longer including. 
42 | 43 | - Define a `make test` target for Prow 44 | 45 | - Have a Dockerfile in ./build/Dockerfile and define IMAGE_NAME for it. 46 | Others container images can be specified with ADDITIONAL_IMAGE_SPECS 47 | 48 | ===================== 49 | 50 | EOF 51 | -------------------------------------------------------------------------------- /boilerplate/update.cfg: -------------------------------------------------------------------------------- 1 | openshift/osd-container-image 2 | -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/redhat-services-prod/openshift/boilerplate:image-v7.0.0 as builder 2 | 3 | ADD . /opt 4 | WORKDIR /opt 5 | 6 | RUN make CGO_ENABLED=0 build-cadctl 7 | RUN make CGO_ENABLED=0 build-interceptor 8 | 9 | 10 | FROM quay.io/app-sre/ubi8-ubi-minimal:8.10 as runner 11 | 12 | COPY --from=builder /opt/bin/cadctl /bin/cadctl 13 | COPY --from=builder /opt/bin/interceptor /bin/interceptor 14 | 15 | ARG BUILD_DATE 16 | ARG VERSION 17 | ARG VCS_REF 18 | ARG DOCKERFILE_PATH 19 | 20 | LABEL vendor="RedHat" \ 21 | name="openshift/configuration-anomaly-detection" \ 22 | description="a CLI tool to detect and mitigate configuration mishaps" \ 23 | io.k8s.display-name="openshift/configuration-anomaly-detection" \ 24 | io.k8s.description="a CLI tool to detect and mitigate configuration mishaps" \ 25 | maintainer="RedHat <>" \ 26 | version="$VERSION" \ 27 | org.label-schema.build-date=$BUILD_DATE \ 28 | org.label-schema.description="a CLI tool to detect and mitigate configuration mishaps" \ 29 | org.label-schema.docker.cmd="docker run --rm openshift/configuration-anomaly-detection" \ 30 | org.label-schema.docker.dockerfile=$DOCKERFILE_PATH \ 31 | org.label-schema.name="openshift/configuration-anomaly-detection" \ 32 | org.label-schema.schema-version="0.1.0" \ 33 | org.label-schema.vcs-branch=$VCS_BRANCH \ 34 | 
org.label-schema.vcs-ref=$VCS_REF \ 35 | org.label-schema.vcs-url="https://github.com/openshift/configuration-anomaly-detection" \ 36 | org.label-schema.vendor="openshift/configuration-anomaly-detection" \ 37 | org.label-schema.version=$VERSION 38 | 39 | RUN microdnf install jq 40 | 41 | ENTRYPOINT ["/bin/cadctl"] 42 | -------------------------------------------------------------------------------- /cadctl/.gitignore: -------------------------------------------------------------------------------- 1 | /cadctl 2 | -------------------------------------------------------------------------------- /cadctl/cmd/root.go: -------------------------------------------------------------------------------- 1 | // Package cmd holds the cadctl cobra data 2 | /* 3 | Copyright © 2022 Red Hat, Inc. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | package cmd 18 | 19 | import ( 20 | investigate "github.com/openshift/configuration-anomaly-detection/cadctl/cmd/investigate" 21 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 22 | "github.com/openshift/configuration-anomaly-detection/pkg/metrics" 23 | "github.com/spf13/cobra" 24 | ) 25 | 26 | // rootCmd represents the base command when called without any subcommands 27 | var rootCmd = &cobra.Command{ 28 | Use: "cadctl", 29 | Short: "A util of configuration-anomaly-detection (CAD) checks", 30 | } 31 | 32 | // Execute adds all child commands to the root command and sets flags appropriately. 
33 | // This is called by main.main(). It only needs to happen once to the rootCmd. 34 | func Execute() { 35 | err := rootCmd.Execute() 36 | metrics.Push() 37 | if err != nil { 38 | logging.Fatal(err) 39 | } 40 | } 41 | 42 | func init() { 43 | rootCmd.AddCommand(investigate.InvestigateCmd) 44 | } 45 | -------------------------------------------------------------------------------- /cadctl/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2022 Red Hat, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package main is the main package 18 | package main 19 | 20 | import "github.com/openshift/configuration-anomaly-detection/cadctl/cmd" 21 | 22 | func main() { 23 | cmd.Execute() 24 | } 25 | -------------------------------------------------------------------------------- /hack/bootstrap-investigation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | read -p "Enter the new investigation (package) name: " INVESTIGATION_NAME 6 | if [[ "${INVESTIGATION_NAME}" == "" ]] ; then 7 | echo "Investigation name cannot be empty." 8 | exit 1 9 | elif [[ "${INVESTIGATION_NAME}" =~ [^a-zA-Z0-9_] ]] ; then 10 | echo "Investigation name must be alphanumeric." 
11 | exit 1 12 | fi 13 | 14 | read -p "Enter new investigation description: " INVESTIGATION_DESCRIPTION 15 | if [[ "${INVESTIGATION_DESCRIPTION}" == "" ]] ; then 16 | INVESTIGATION_DESCRIPTION="TODO" 17 | fi 18 | 19 | read -p "Should Investigate Alert (y/n): " INVESTIGATE_ALERT_BOOL 20 | if [[ "${INVESTIGATE_ALERT_BOOL}" == "y" ]] ; then 21 | read -p "Investigation alert string: " INVESTIGATION_ALERT_STRING 22 | INVESTIGATION_ALERT="strings.Contains(alert, \"${INVESTIGATION_ALERT_STRING}\")" 23 | elif [[ "${INVESTIGATE_ALERT_BOOL}" == "n" ]] ; then 24 | INVESTIGATION_ALERT="false" 25 | else 26 | echo "Invalid input. Please enter 'y' or 'n'." 27 | exit 1 28 | fi 29 | 30 | INVESTIGATION_NAME=$(echo "${INVESTIGATION_NAME}" | tr '[:upper:]' '[:lower:]') 31 | 32 | INVESTIGATION_DIR="../pkg/investigations/${INVESTIGATION_NAME}" 33 | 34 | if [ -d "${INVESTIGATION_DIR}" ]; then 35 | echo "Investigation of name ${INVESTIGATION_NAME} already exists." 36 | exit 1 37 | fi 38 | 39 | mkdir -p "${INVESTIGATION_DIR}" 40 | ls "${INVESTIGATION_DIR}" 41 | 42 | touch "${INVESTIGATION_DIR}/${INVESTIGATION_NAME}.go" 43 | touch "${INVESTIGATION_DIR}/metadata.yaml" 44 | touch "${INVESTIGATION_DIR}/README.md" 45 | mkdir "${INVESTIGATION_DIR}/testing/" 46 | 47 | # Create README.md file 48 | cat < "${INVESTIGATION_DIR}/README.md" 49 | # ${INVESTIGATION_NAME} Investigation 50 | 51 | ${INVESTIGATION_DESCRIPTION} 52 | 53 | ## Testing 54 | 55 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 56 | 57 | EOF 58 | 59 | # Create testing/README.md file 60 | cat < "${INVESTIGATION_DIR}/testing/README.md" 61 | # Testing ${INVESTIGATION_NAME} Investigation 62 | 63 | TODO: 64 | - Add a test script or test objects to this `testing/` directory for future maintainers to use 65 | - Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. 
Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc) 66 | EOF 67 | 68 | 69 | # Create metadata.yaml file 70 | cat < "${INVESTIGATION_DIR}/metadata.yaml" 71 | name: ${INVESTIGATION_NAME} 72 | rbac: 73 | roles: [] 74 | clusterRoleRules: [] 75 | customerDataAccess: false 76 | 77 | EOF 78 | 79 | # Create boilerplate investigation file 80 | cat < "${INVESTIGATION_DIR}/${INVESTIGATION_NAME}.go" 81 | // Package ${INVESTIGATION_NAME} contains...TODO 82 | package ${INVESTIGATION_NAME} 83 | 84 | import ( 85 | "strings" 86 | 87 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 88 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 89 | "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" 90 | ) 91 | 92 | type Investigation struct{} 93 | 94 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 95 | result := investigation.InvestigationResult{} 96 | 97 | // Initialize PagerDuty note writer 98 | notes := notewriter.New(r.Name, logging.RawLogger) 99 | defer func() { r.Notes = notes }() 100 | 101 | // TODO: Implement investigation logic here 102 | 103 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 104 | } 105 | 106 | func (c *Investigation) Name() string { 107 | return "${INVESTIGATION_NAME}" 108 | } 109 | 110 | func (c *Investigation) Description() string { 111 | return "${INVESTIGATION_DESCRIPTION}" 112 | } 113 | 114 | func (c *Investigation) ShouldInvestigateAlert(alert string) bool { 115 | return ${INVESTIGATION_ALERT} 116 | } 117 | 118 | func (c *Investigation) IsExperimental() bool { 119 | // TODO: Update to false when graduating to production. 
120 | return true 121 | } 122 | 123 | EOF 124 | 125 | echo "${INVESTIGATION_NAME} created in ${INVESTIGATION_DIR}" 126 | echo "metadata.yaml file created in ${INVESTIGATION_DIR}" 127 | 128 | # Update registry.go to contain new investigation 129 | if ! grep -q "${INVESTIGATION_NAME}" ../pkg/investigations/registry.go && ! grep -q "${INVESTIGATION_NAME}" ../pkg/investigations/registry.go; then 130 | sed -i "/import (/a \\\t\"github.com/openshift/configuration-anomaly-detection/pkg/investigations/${INVESTIGATION_NAME}\"" ../pkg/investigations/registry.go 131 | sed -i "/var availableInvestigations = \[/a \\\t&${INVESTIGATION_NAME}.Investigation{}," ../pkg/investigations/registry.go 132 | echo "${INVESTIGATION_NAME} added to registry.go" 133 | else 134 | echo "${INVESTIGATION_NAME} already exists in registry.go" 135 | fi 136 | -------------------------------------------------------------------------------- /hack/codecov.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | REPO_ROOT=$(git rev-parse --show-toplevel) 8 | CI_SERVER_URL=https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test 9 | COVER_PROFILE=${COVER_PROFILE:-coverage.out} 10 | JOB_TYPE=${JOB_TYPE:-"local"} 11 | 12 | # Default concurrency to four threads. By default it's the number of procs, 13 | # which seems to be 16 in the CI env. Some consumers' coverage jobs were 14 | # regularly getting OOM-killed; so do this rather than boost the pod resources 15 | # unreasonably. 16 | COV_THREAD_COUNT=${COV_THREAD_COUNT:-4} 17 | make -C "${REPO_ROOT}" test-cadctl TESTOPTS="-coverprofile=${COVER_PROFILE}.tmp -covermode=atomic -coverpkg=./... 
-p ${COV_THREAD_COUNT}" 18 | 19 | # Remove generated files from coverage profile 20 | grep -v "zz_generated" "${COVER_PROFILE}.tmp" > "${COVER_PROFILE}" 21 | rm -f "${COVER_PROFILE}.tmp" 22 | 23 | # Configure the git refs and job link based on how the job was triggered via prow 24 | if [[ "${JOB_TYPE}" == "presubmit" ]]; then 25 | echo "detected PR code coverage job for #${PULL_NUMBER}" 26 | REF_FLAGS="-P ${PULL_NUMBER} -C ${PULL_PULL_SHA}" 27 | JOB_LINK="${CI_SERVER_URL}/pr-logs/pull/${REPO_OWNER}_${REPO_NAME}/${PULL_NUMBER}/${JOB_NAME}/${BUILD_ID}" 28 | elif [[ "${JOB_TYPE}" == "postsubmit" ]]; then 29 | echo "detected branch code coverage job for ${PULL_BASE_REF}" 30 | REF_FLAGS="-B ${PULL_BASE_REF} -C ${PULL_BASE_SHA}" 31 | JOB_LINK="${CI_SERVER_URL}/logs/${JOB_NAME}/${BUILD_ID}" 32 | elif [[ "${JOB_TYPE}" == "local" ]]; then 33 | echo "coverage report available at ${COVER_PROFILE}" 34 | exit 0 35 | else 36 | echo "${JOB_TYPE} jobs not supported" >&2 37 | exit 1 38 | fi 39 | 40 | # Configure certain internal codecov variables with values from prow. 41 | export CI_BUILD_URL="${JOB_LINK}" 42 | export CI_BUILD_ID="${JOB_NAME}" 43 | export CI_JOB_ID="${BUILD_ID}" 44 | 45 | if [[ "${JOB_TYPE}" != "local" ]]; then 46 | if [[ -z "${ARTIFACT_DIR:-}" ]] || [[ ! -d "${ARTIFACT_DIR}" ]] || [[ ! 
-w "${ARTIFACT_DIR}" ]]; then 47 | echo '${ARTIFACT_DIR} must be set for non-local jobs, and must point to a writable directory' >&2 48 | exit 1 49 | fi 50 | curl -sS https://codecov.io/bash -o "${ARTIFACT_DIR}/codecov.sh" 51 | bash <(cat "${ARTIFACT_DIR}/codecov.sh") -Z -K -f "${COVER_PROFILE}" -r "${REPO_OWNER}/${REPO_NAME}" ${REF_FLAGS} 52 | else 53 | bash <(curl -s https://codecov.io/bash) -Z -K -f "${COVER_PROFILE}" -r "${REPO_OWNER}/${REPO_NAME}" ${REF_FLAGS} 54 | fi -------------------------------------------------------------------------------- /images/CadCat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/CadCat.png -------------------------------------------------------------------------------- /images/cad_chgm_investigation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_chgm_investigation/README.md -------------------------------------------------------------------------------- /images/cad_chgm_investigation/chgm_investigation_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_chgm_investigation/chgm_investigation_dark.png -------------------------------------------------------------------------------- /images/cad_chgm_investigation/chgm_investigation_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_chgm_investigation/chgm_investigation_light.png 
-------------------------------------------------------------------------------- /images/cad_overview/cad_architecture_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_overview/cad_architecture_dark.png -------------------------------------------------------------------------------- /images/cad_overview/cad_architecture_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_overview/cad_architecture_light.png -------------------------------------------------------------------------------- /interceptor/README.md: -------------------------------------------------------------------------------- 1 | # CAD Tekton Interceptor 2 | 3 | The Tekton interceptor is a component plugged between the event listener and the task runs. The interceptor makes sure we don't start a pipeline for every alert we receive. Instead, alerts are filtered based on whether or not they are handled by CAD. Unhandled alerts are directly escalated and no pipeline is started. 4 | 5 | ## Testing 6 | 7 | ### E2E 8 | 9 | The interceptor has E2E tests starting the HTTP service and checking the HTTP responses. The tests are based on pre-existing PagerDuty alerts. 10 | 11 | ``` bash 12 | 13 | make e2e-interceptor 14 | 15 | # To also print the output of the interceptor service: 16 | CAD_E2E_VERBOSE=true make e2e-interceptor 17 | ``` 18 | 19 | ## Development 20 | 21 | It is possible to run the interceptor locally in a "minimal" state, where E2E is not used, and only the 22 | crucial-to-run env variables (seen below) are set as placeholders. This is useful for *local* development/debugging.
23 | 24 | ``` bash 25 | $ make build-interceptor 26 | 27 | $ CAD_SILENT_POLICY=test 28 | $ CAD_PD_TOKEN=test 29 | $ PD_SIGNATURE=test 30 | 31 | $ ./bin/interceptor 32 | ``` 33 | -------------------------------------------------------------------------------- /interceptor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "net/http" 8 | "os" 9 | "os/signal" 10 | "syscall" 11 | "time" 12 | 13 | "github.com/openshift/configuration-anomaly-detection/interceptor/pkg/interceptor" 14 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 15 | "github.com/prometheus/client_golang/prometheus/promhttp" 16 | "knative.dev/pkg/signals" 17 | "sigs.k8s.io/controller-runtime/pkg/metrics" 18 | ) 19 | 20 | const ( 21 | HTTPPort = 8080 22 | readTimeout = 5 * time.Second 23 | writeTimeout = 20 * time.Second 24 | idleTimeout = 60 * time.Second 25 | ) 26 | 27 | var logger = logging.InitLogger(logging.LogLevelString, "") 28 | 29 | func main() { 30 | // set up signals so we handle the first shutdown signal gracefully 31 | ctx := signals.NewContext() 32 | 33 | stats := interceptor.CreateInterceptorStats() 34 | mux := http.NewServeMux() 35 | mux.Handle("/", interceptor.CreateInterceptorHandler(stats)) 36 | mux.HandleFunc("/ready", readinessHandler) 37 | interceptor.CreateAndRegisterMetricsCollector(stats) 38 | mux.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{Registry: metrics.Registry})) 39 | 40 | srv := &http.Server{ 41 | Addr: fmt.Sprintf(":%d", HTTPPort), 42 | BaseContext: func(listener net.Listener) context.Context { 43 | return ctx 44 | }, 45 | ReadTimeout: readTimeout, 46 | WriteTimeout: writeTimeout, 47 | IdleTimeout: idleTimeout, 48 | Handler: mux, 49 | } 50 | 51 | // Channel to listen for OS signals 52 | stop := make(chan os.Signal, 1) 53 | signal.Notify(stop, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT) 54 | 55 | // Run 
server in a goroutine 56 | go func() { 57 | logger.Infof("Listen and serve on port %d", HTTPPort) 58 | if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { 59 | logger.Fatalf("failed to start interceptors service: %v", err) 60 | } 61 | }() 62 | 63 | // Block until we receive a stop signal 64 | <-stop 65 | 66 | // Create a deadline to wait for. 67 | ctxShutDown, cancel := context.WithTimeout(context.Background(), 5*time.Second) 68 | defer cancel() 69 | 70 | // Attempt to gracefully shutdown the server 71 | if err := srv.Shutdown(ctxShutDown); err != nil { 72 | logger.Fatalf("server forced to shutdown: %v", err) 73 | } 74 | 75 | logger.Infof("Server exiting") 76 | } 77 | 78 | func readinessHandler(w http.ResponseWriter, r *http.Request) { 79 | w.WriteHeader(http.StatusOK) 80 | } 81 | -------------------------------------------------------------------------------- /interceptor/pkg/interceptor/metrics.go: -------------------------------------------------------------------------------- 1 | package interceptor 2 | 3 | import ( 4 | "strconv" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/metrics" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | ) 10 | 11 | const ( 12 | requestsCountMetricName = "cad_interceptor_requests_total" 13 | requestsCountMetricHelp = "Number of times CAD interceptor has been called (through a PagerDuty webhook, normally)" 14 | 15 | errorsCountMetricName = "cad_interceptor_errors_total" 16 | errorsCountMetricHelp = "Number of times CAD interceptor has been failed to process a request" 17 | ) 18 | 19 | var ( 20 | requestsCountMetricDesc = prometheus.NewDesc( 21 | requestsCountMetricName, 22 | requestsCountMetricHelp, 23 | nil, nil) 24 | 25 | errorsCountMetricDesc = prometheus.NewDesc( 26 | errorsCountMetricName, 27 | errorsCountMetricHelp, 28 | []string{"error_code", "reason"}, nil) 29 | ) 30 | 31 | type interceptorMetricsCollector struct { 32 | stats *InterceptorStats 33 | } 34 | 35 | func 
CreateAndRegisterMetricsCollector(stats *InterceptorStats) { 36 | metrics.Registry.MustRegister(&interceptorMetricsCollector{stats}) 37 | } 38 | 39 | func (c *interceptorMetricsCollector) Describe(ch chan<- *prometheus.Desc) { 40 | prometheus.DescribeByCollect(c, ch) 41 | } 42 | 43 | func (c *interceptorMetricsCollector) Collect(ch chan<- prometheus.Metric) { 44 | ch <- prometheus.MustNewConstMetric( 45 | requestsCountMetricDesc, 46 | prometheus.CounterValue, 47 | float64(c.stats.RequestsCount), 48 | ) 49 | 50 | for codeWithReason, errorsCount := range c.stats.CodeWithReasonToErrorsCount { 51 | ch <- prometheus.MustNewConstMetric( 52 | errorsCountMetricDesc, 53 | prometheus.CounterValue, 54 | float64(errorsCount), 55 | strconv.Itoa(codeWithReason.ErrorCode), 56 | codeWithReason.Reason, 57 | ) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /interceptor/test/e2e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Text colors for output 5 | GREEN='\033[0;32m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | # Load pd token from vault - needed by interceptor 10 | export VAULT_ADDR="https://vault.devshift.net" 11 | export VAULT_TOKEN="$(vault login -method=oidc -token-only)" 12 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/cad-testing | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 13 | unset VAULT_ADDR VAULT_TOKEN 14 | echo 15 | 16 | temp_log_file=$(mktemp) 17 | 18 | # Function to send an interceptor request and check the response 19 | function test_interceptor { 20 | 21 | local incident_id=$1 22 | local expected_response=$2 23 | local expected_metrics=$3 24 | local override_signature=$4 25 | 26 | # Run the interceptor and print logs to temporary log file 27 | export PD_SIGNATURE="test" 28 | CAD_PD_TOKEN=$(echo $pd_test_token) CAD_SILENT_POLICY=$(echo 
$pd_test_silence_policy) ./../bin/interceptor > $temp_log_file 2>&1 & 29 | PAYLOAD_BODY="{\\\"__pd_metadata\\\":{\\\"incident\\\":{\\\"id\\\":\\\"$incident_id\\\"}}}" 30 | PAYLOAD_BODY_FORMATTED='{"__pd_metadata":{"incident":{"id":"'$incident_id'"}}}' 31 | 32 | # Allow for test 3; override the signature after correct one has already been added to env 33 | if [[ "$override_signature" != "" ]]; then 34 | export PD_SIGNATURE=$override_signature 35 | fi 36 | 37 | SIGN=$(echo -n "$PAYLOAD_BODY_FORMATTED" | openssl dgst -sha256 -hmac $PD_SIGNATURE | sed 's/^.* //') 38 | 39 | # Store the PID of the interceptor process 40 | INTERCEPTOR_PID=$! 41 | 42 | # Wrap the webhook originating payload (this is the expected format of the payload sent to the interceptor) 43 | WRAPPED_PAYLOAD="{\"header\":{\"Content-Type\":[\"application/json\"],\"X-PagerDuty-Signature\":[\"v1=$SIGN\"]},\"body\":\"$PAYLOAD_BODY\"}" 44 | 45 | # Wait for 1 second to allow the interceptor to start up 46 | sleep 5 47 | 48 | 49 | # Send an interceptor request to localhost:8080 50 | # See https://pkg.go.dev/github.com/tektoncd/triggers/pkg/apis/triggers/v1alpha1#InterceptorRequest 51 | CURL_EXITCODE=0 52 | CURL_OUTPUT=$(curl -s -X POST -H "X-PagerDuty-Signature:v1=${SIGN}" -H "Content-Type: application/json" \ 53 | -d "$WRAPPED_PAYLOAD" \ 54 | http://localhost:8080) || CURL_EXITCODE=$? 
55 | 56 | local return_code=0 57 | 58 | # Check if the curl output differs from the expected response 59 | if [[ "$CURL_OUTPUT" != "$expected_response" ]] || [[ "$CURL_EXITCODE" != "0" ]]; then 60 | echo -e "${RED}Test failed for incident ID $incident_id: Unexpected response.${NC}" 61 | echo -e "${RED}Expected: $expected_response${NC}" 62 | echo -e "${RED}Got: $CURL_OUTPUT${NC}" 63 | echo -e "${RED}Exit code: $CURL_EXITCODE${NC}" 64 | echo -e "" 65 | echo -e "Interceptor logs" 66 | cat $temp_log_file 67 | return_code=1 68 | else 69 | curl_metrics_exitcode=0 70 | curl_metrics_output=$(curl -s http://localhost:8080/metrics | grep '^cad_interceptor_') || curl_metrics_exitcode=$? 71 | 72 | if [[ "$curl_metrics_output" != "$expected_metrics" ]] || [[ "$curl_metrics_exitcode" != "0" ]]; then 73 | echo -e "${RED}Test failed for incident ID $incident_id: Unexpected metrics.${NC}" 74 | echo -e "${RED}Expected: $expected_metrics${NC}" 75 | echo -e "${RED}Got: $curl_metrics_output${NC}" 76 | echo -e "${RED}Exit code: $curl_metrics_exitcode${NC}" 77 | echo -e "" 78 | echo -e "Interceptor logs" 79 | cat $temp_log_file 80 | return_code=1 81 | else 82 | echo -e "${GREEN}Test passed for incident ID $incident_id: Response and metrics are as expected.${NC}" 83 | fi 84 | fi 85 | 86 | # Shut down the interceptor 87 | kill $INTERCEPTOR_PID 88 | 89 | return $return_code 90 | } 91 | 92 | # Expected outputs 93 | # See https://github.com/tektoncd/triggers/blob/v0.27.0/pkg/apis/triggers/v1alpha1/interceptor_types.go#L134 94 | EXPECTED_RESPONSE_CONTINUE='{"continue":true,"status":{}}' 95 | EXPECTED_RESPONSE_STOP='{"continue":false,"status":{}}' 96 | EXPECTED_RESPONSE_SIGNATURE_ERROR='failed to verify signature: invalid webhook signature' 97 | 98 | echo "========= TESTS =============" 99 | # Test for a pre-existing alert we handle (ClusterProvisioningDelay) 100 | echo "Test 1: alert with existing handling returns a 'continue: true' response" 101 | test_interceptor "Q12WO44XJLR3H3" 
"$EXPECTED_RESPONSE_CONTINUE" "cad_interceptor_requests_total 1" 102 | 103 | # Test for an alert we don't handle (alert called unhandled) 104 | echo "Test 2: unhandled alerts returns a 'continue: false' response" 105 | test_interceptor "Q3722KGCG12ZWD" "$EXPECTED_RESPONSE_STOP" "cad_interceptor_requests_total 1" 106 | 107 | # Test for an alert with invalid signature 108 | echo "Test 3: expected failure due to invalid signature" 109 | PD_SIGNATURE="invalid-signature" 110 | test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_SIGNATURE_ERROR" 'cad_interceptor_errors_total{error_code="400",reason="failed to verify signature"} 1'$'\n''cad_interceptor_requests_total 1' "invalid-signature" 111 | -------------------------------------------------------------------------------- /openshift/PipelinePruning.md: -------------------------------------------------------------------------------- 1 | # PipelinePruning 2 | 3 | ![Pipeline Pruning](assets/cad_pipeline_pruning.drawio.png) 4 | 5 | ## Overview 6 | 7 | Prior, we have exploited the AppSRE pipeline pruning via importing their pipeline defaults in service/app-interface. 8 | This has been changed, because it had a few disadvantages. For instance, we have also imported resource constraints 9 | and other defaults that we do not want for our pipeline. 10 | 11 | Instead, we are now using our own PipelinePruner in form of a Cronjob. This Cronjob is defined in [template.yaml](template.yaml). 12 | The Cronjob creates a pod with the following command: `tkn pipelinerun delete --keep=20 -f`. 13 | This command will delete all PipelineRuns except for the last 20. 14 | 15 | For doing this, the cronjob needs permissions, these permissions are set in a role, defined in [template.yaml](template.yaml) as well. 16 | 17 | Note, that we have also defined a ResourceQuota that limits the PipelineRuns to a maximum number of 1000. This does not mean concurrent runs but all the runs that exist for that pipeline. 
18 | -------------------------------------------------------------------------------- /openshift/README.md: -------------------------------------------------------------------------------- 1 | # OpenShift Template 2 | 3 | This folder holds the template used by app-interface to deploy CAD resources on a target cluster. 4 | -------------------------------------------------------------------------------- /openshift/assets/cad_pipeline_pruning.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/openshift/assets/cad_pipeline_pruning.drawio.png -------------------------------------------------------------------------------- /openshift/gateway-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: configuration-anomaly-detection-gateway-template 5 | parameters: 6 | 7 | - name: IMAGE_TAG 8 | value: v0.7.0 9 | 10 | - name: REGISTRY_IMG 11 | value: quay.io/app-sre/aggregation-gateway 12 | 13 | - name: MEMORY_REQUEST 14 | description: Memory request for the API pods. 15 | value: "512Mi" 16 | 17 | - name: MEMORY_LIMIT 18 | description: Memory limit for the API pods. 19 | value: "1Gi" 20 | 21 | - name: CPU_REQUEST 22 | description: CPU request for the API pods. 23 | value: "200m" 24 | 25 | - name: CPU_LIMIT 26 | description: CPU limit for the API pods. 
27 | value: "1" 28 | 29 | objects: 30 | - kind: Service 31 | apiVersion: v1 32 | metadata: 33 | name: aggregation-pushgateway 34 | labels: 35 | app: configuration-anomaly-detection 36 | port: metrics 37 | annotations: 38 | description: Exposes and load balances the aggregation-pushgateway pods 39 | spec: 40 | selector: 41 | app: aggregation-pushgateway 42 | ports: 43 | - name: metrics 44 | port: 9091 45 | targetPort: 9091 46 | protocol: TCP 47 | - kind: Deployment 48 | apiVersion: apps/v1 49 | metadata: 50 | name: aggregation-pushgateway 51 | labels: 52 | app: aggregation-pushgateway 53 | spec: 54 | selector: 55 | matchLabels: 56 | app: aggregation-pushgateway 57 | replicas: 2 58 | strategy: 59 | rollingParams: 60 | intervalSeconds: 1 61 | maxSurge: 25% 62 | maxUnavailable: 25% 63 | timeoutSeconds: 600 64 | updatePeriodSeconds: 1 65 | type: Rolling 66 | template: 67 | metadata: 68 | labels: 69 | app: aggregation-pushgateway 70 | spec: 71 | serviceAccountName: pushgateway 72 | containers: 73 | - name: aggregation-pushgateway 74 | image: ${REGISTRY_IMG}:${IMAGE_TAG} 75 | securityContext: 76 | allowPrivilegeEscalation: false 77 | runAsNonRoot: true 78 | capabilities: 79 | drop: ["ALL"] 80 | seccompProfile: 81 | type: RuntimeDefault 82 | imagePullPolicy: IfNotPresent 83 | env: 84 | - name: PAG_APILISTEN 85 | value: :9091 86 | - name: PAG_LIFECYCLELISTEN 87 | value: :9092 88 | resources: 89 | requests: 90 | cpu: ${CPU_REQUEST} 91 | memory: ${MEMORY_REQUEST} 92 | limits: 93 | cpu: ${CPU_LIMIT} 94 | memory: ${MEMORY_LIMIT} 95 | ports: 96 | - name: metrics 97 | protocol: TCP 98 | containerPort: 9091 99 | - name: lifecycle 100 | protocol: TCP 101 | containerPort: 9092 102 | livenessProbe: 103 | httpGet: 104 | path: /healthy 105 | port: 9092 106 | scheme: HTTP 107 | initialDelaySeconds: 15 108 | periodSeconds: 5 109 | readinessProbe: 110 | httpGet: 111 | path: /ready 112 | port: 9092 113 | scheme: HTTP 114 | initialDelaySeconds: 20 115 | periodSeconds: 10 
-------------------------------------------------------------------------------- /pkg/ai/k8sgpt/k8sgpt.go: -------------------------------------------------------------------------------- 1 | package k8sgpt 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "strings" 9 | 10 | k8sgpt_ai "github.com/k8sgpt-ai/k8sgpt/pkg/ai" 11 | "github.com/k8sgpt-ai/k8sgpt/pkg/analysis" 12 | "github.com/k8sgpt-ai/k8sgpt/pkg/cache" 13 | gptK8sClient "github.com/k8sgpt-ai/k8sgpt/pkg/kubernetes" 14 | "k8s.io/apimachinery/pkg/runtime" 15 | "k8s.io/client-go/kubernetes" 16 | "k8s.io/client-go/rest" 17 | "sigs.k8s.io/controller-runtime/pkg/client" 18 | ) 19 | 20 | var model = "mistral-small-maas" 21 | 22 | func K8sGptAnalysis(k8sRestConfig *rest.Config) (string, error) { 23 | ctrlClient, err := client.New(k8sRestConfig, client.Options{Scheme: runtime.NewScheme()}) 24 | if err != nil { 25 | return "", errors.New("unable to init ctrlClient") 26 | } 27 | clientset := kubernetes.NewForConfigOrDie(k8sRestConfig) 28 | 29 | client := &gptK8sClient.Client{CtrlClient: ctrlClient, Config: k8sRestConfig, Client: clientset} 30 | 31 | aiToken := os.Getenv("CAD_HCM_AI_TOKEN") 32 | if aiToken == "" { 33 | return "", errors.New("could not find CAD_HCM_AI_TOKEN env") 34 | } 35 | 36 | aiClient := k8sgpt_ai.NewClient("openai") 37 | aiProvider := &k8sgpt_ai.AIProvider{ 38 | Name: "openai", 39 | Model: model, 40 | BaseURL: "https://mistral-small-maas-maas.apps.rosa.hcmaii01ue1.a9ro.p3.openshiftapps.com/v1", // TODO: Let's not hardcode this. 
41 | Password: aiToken, 42 | } 43 | 44 | if err = aiClient.Configure(aiProvider); err != nil { 45 | return "", fmt.Errorf("unable to configure ai provider: %w", err) 46 | } 47 | 48 | cache, err := cache.GetCacheConfiguration() 49 | if err != nil { 50 | return "", fmt.Errorf("unable to get k8sgpt cache configuration: %w", err) 51 | } 52 | cache.DisableCache() 53 | 54 | a := &analysis.Analysis{ 55 | Context: context.Background(), 56 | Filters: []string{"Pod", "Deployment", "ReplicaSet", "PersistentVolumeClaim", "Service", "Ingress", "StatefulSet", "CronJob", "Node", "ValidatingWebhookConfiguration", "MutatingWebhookConfiguration"}, 57 | Client: client, 58 | Language: "english", 59 | Namespace: "", 60 | LabelSelector: "", 61 | Cache: cache, 62 | Explain: true, 63 | MaxConcurrency: 10, 64 | WithDoc: false, 65 | WithStats: false, 66 | AIClient: aiClient, 67 | } 68 | 69 | a.RunAnalysis() 70 | 71 | var output string 72 | anonymize := false 73 | if err := a.GetAIResults(output, anonymize); err != nil { 74 | return "", fmt.Errorf("unable to get ai results: %w", err) 75 | } 76 | 77 | return formatOutput(a) 78 | } 79 | 80 | func formatOutput(a *analysis.Analysis) (string, error) { 81 | var output strings.Builder 82 | 83 | output.WriteString("🤖🔧 AI Analysis Results 🔧🤖\n") 84 | output.WriteString(fmt.Sprintf("Model: %s\n", model)) 85 | if len(a.Errors) != 0 { 86 | output.WriteString("⚠️ Analysis failures: \n") 87 | for _, aerror := range a.Errors { 88 | output.WriteString(fmt.Sprintf("- %s\n", aerror)) 89 | } 90 | } 91 | if len(a.Results) == 0 { 92 | output.WriteString("✅ No cluster problems detected\n") 93 | return output.String(), nil 94 | } 95 | output.WriteString(fmt.Sprintf("🔍 %d cluster issues detected\n", len(a.Results))) 96 | output.WriteString("================\n\n") 97 | 98 | for _, result := range a.Results { 99 | if result.Kind != "" { 100 | output.WriteString(fmt.Sprintf("Kind: %s\n", result.Kind)) 101 | } 102 | 103 | if result.Name != "" { 104 | 
output.WriteString(fmt.Sprintf("Name: %s\n", result.Name)) 105 | } 106 | 107 | if result.ParentObject != "" { 108 | output.WriteString(fmt.Sprintf("ParentObject: %s\n", result.ParentObject)) 109 | } 110 | 111 | if len(result.Error) > 0 { 112 | output.WriteString("Issues:\n") 113 | for _, err := range result.Error { 114 | output.WriteString(fmt.Sprintf("- %s\n", err.Text)) 115 | if err.KubernetesDoc != "" { 116 | output.WriteString(fmt.Sprintf(" Kubernetes Doc: %s\n", err.KubernetesDoc)) 117 | } 118 | } 119 | } 120 | 121 | if result.Details != "" { 122 | output.WriteString(fmt.Sprintf("Details: %s\n", result.Details)) 123 | } 124 | 125 | output.WriteString("\n------------------------------------------------------------\n\n") 126 | } 127 | 128 | return output.String(), nil 129 | } 130 | -------------------------------------------------------------------------------- /pkg/aws/aws_test.go: -------------------------------------------------------------------------------- 1 | // Package aws contains functions related to aws sdk 2 | package aws 3 | 4 | import ( 5 | "testing" 6 | 7 | awsv2 "github.com/aws/aws-sdk-go-v2/aws" 8 | ec2v2 "github.com/aws/aws-sdk-go-v2/service/ec2" 9 | ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" 10 | "go.uber.org/mock/gomock" 11 | 12 | awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock" 13 | ) 14 | 15 | func setupSubnetMock(t *testing.T, gatewayId *string, mapPublicIps bool) EC2API { 16 | t.Helper() 17 | ctrl := gomock.NewController(t) 18 | rtb := []ec2v2types.Route{ 19 | { 20 | DestinationCidrBlock: awsv2.String("0.0.0.0/0"), 21 | GatewayId: gatewayId, 22 | }, 23 | } 24 | ec2api := awsmock.NewMockEC2API(ctrl) 25 | ec2api.EXPECT().DescribeSubnets(gomock.Any(), gomock.Any()).Return(&ec2v2.DescribeSubnetsOutput{ 26 | Subnets: []ec2v2types.Subnet{ 27 | { 28 | MapPublicIpOnLaunch: awsv2.Bool(mapPublicIps), 29 | SubnetId: awsv2.String("subnet-1"), 30 | }, 31 | }, 32 | }, nil) 33 | 
ec2api.EXPECT().DescribeRouteTables(gomock.Any(), gomock.Any()).Return(&ec2v2.DescribeRouteTablesOutput{ 34 | RouteTables: []ec2v2types.RouteTable{ 35 | { 36 | Routes: rtb, 37 | }, 38 | }, 39 | }, nil) 40 | return ec2api 41 | } 42 | 43 | func TestSdkClient_IsSubnetPrivate(t *testing.T) { 44 | type fields struct { 45 | Region string 46 | StsClient StsAPI 47 | Ec2Client EC2API 48 | CloudTrailClient CloudTrailAPI 49 | BaseConfig awsv2.Config 50 | } 51 | type args struct { 52 | subnet string 53 | } 54 | tests := []struct { 55 | name string 56 | fields fields 57 | args args 58 | want bool 59 | wantErr bool 60 | }{ 61 | { 62 | name: "A subnet without a GatewayID is considered private", 63 | fields: fields{ 64 | Region: "us-east-1", 65 | StsClient: nil, 66 | Ec2Client: setupSubnetMock(t, nil, false), 67 | CloudTrailClient: nil, 68 | BaseConfig: awsv2.Config{}, 69 | }, 70 | args: args{ 71 | subnet: "subnet-1", 72 | }, 73 | want: true, 74 | wantErr: false, 75 | }, 76 | { 77 | name: "A subnet with an internet gateway ID is considered public", 78 | fields: fields{ 79 | Region: "us-east-1", 80 | StsClient: nil, 81 | Ec2Client: setupSubnetMock(t, awsv2.String("igw-1"), true), 82 | CloudTrailClient: nil, 83 | BaseConfig: awsv2.Config{}, 84 | }, 85 | args: args{ 86 | subnet: "subnet-1", 87 | }, 88 | want: false, 89 | wantErr: false, 90 | }, 91 | { 92 | name: "A subnet with an virtual private gateway ID is considered private", 93 | fields: fields{ 94 | Region: "us-east-1", 95 | StsClient: nil, 96 | Ec2Client: setupSubnetMock(t, awsv2.String("vgw-1"), false), 97 | CloudTrailClient: nil, 98 | BaseConfig: awsv2.Config{}, 99 | }, 100 | args: args{ 101 | subnet: "subnet-1", 102 | }, 103 | want: true, 104 | wantErr: false, 105 | }, 106 | } 107 | for _, tt := range tests { 108 | t.Run(tt.name, func(t *testing.T) { 109 | c := &SdkClient{ 110 | Region: tt.fields.Region, 111 | StsClient: tt.fields.StsClient, 112 | Ec2Client: tt.fields.Ec2Client, 113 | CloudtrailClient: 
tt.fields.CloudTrailClient, 114 | BaseConfig: &tt.fields.BaseConfig, 115 | } 116 | got, err := c.IsSubnetPrivate(tt.args.subnet) 117 | if (err != nil) != tt.wantErr { 118 | t.Errorf("SdkClient.IsSubnetPrivate() error = %v, wantErr %v", err, tt.wantErr) 119 | return 120 | } 121 | if got != tt.want { 122 | t.Errorf("SdkClient.IsSubnetPrivate() = %v, want %v", got, tt.want) 123 | } 124 | }) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /pkg/investigations/aitest/README.md: -------------------------------------------------------------------------------- 1 | # aitest Investigation 2 | 3 | Test investigation to run k8sgpt 4 | 5 | ## Testing 6 | 7 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 8 | 9 | -------------------------------------------------------------------------------- /pkg/investigations/aitest/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: aitest 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: ["get", "watch", "list"] 6 | apiGroups: ["*"] 7 | resources: ["*"] 8 | customerDataAccess: true 9 | -------------------------------------------------------------------------------- /pkg/investigations/aitest/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing aitest Investigation 2 | 3 | TODO: 4 | - Add a test script or test objects to this directory for future maintainers to use 5 | - Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. 
Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc) 6 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/README.md: -------------------------------------------------------------------------------- 1 | # apierrorbudgetburn Investigation 2 | 3 | POC Api-ErrorBudgetBurn investigation using k8sgpt. 4 | 5 | ## Testing 6 | 7 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 8 | 9 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/apierrorbudgetburn.go: -------------------------------------------------------------------------------- 1 | // Package apierrorbudgetburn contains the investigation for api-ErrorBudgetBurn alerts 2 | package apierrorbudgetburn 3 | 4 | import ( 5 | "errors" 6 | "fmt" 7 | "strings" 8 | 9 | "github.com/openshift/configuration-anomaly-detection/pkg/ai/k8sgpt" 10 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 11 | k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" 12 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 13 | ) 14 | 15 | type Investigation struct{} 16 | 17 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 18 | result := investigation.InvestigationResult{} 19 | 20 | k8sConfig, err := k8sclient.NewCfg(r.Cluster.ID(), r.OcmClient, r.Name) 21 | if err != nil { 22 | if errors.Is(err, k8sclient.ErrAPIServerUnavailable) { 23 | return result, r.PdClient.EscalateIncidentWithNote("CAD was unable to access cluster's kube-api. 
Please investigate manually.") 24 | } 25 | 26 | return result, fmt.Errorf("unable to initialize k8s cli config: %w", err) 27 | } 28 | defer func() { 29 | deferErr := k8sConfig.Clean() 30 | if deferErr != nil { 31 | logging.Error(deferErr) 32 | err = errors.Join(err, deferErr) 33 | } 34 | }() 35 | 36 | analysis, err := k8sgpt.K8sGptAnalysis(&k8sConfig.Config) 37 | if err != nil { 38 | return result, fmt.Errorf("failed to run K8sGptAnalysis: %w", err) 39 | } 40 | 41 | return result, r.PdClient.EscalateIncidentWithNote(analysis) 42 | } 43 | 44 | func (c *Investigation) Name() string { 45 | return "apierrorbudgetburn" 46 | } 47 | 48 | func (c *Investigation) Description() string { 49 | return "POC Api-ErrorBudgetBurn investigation using k8sgpt." 50 | } 51 | 52 | func (c *Investigation) ShouldInvestigateAlert(alert string) bool { 53 | return strings.Contains(alert, "api-ErrorBudgetBurn") 54 | } 55 | 56 | func (c *Investigation) IsExperimental() bool { 57 | // This is an experimental investigation leveraging k8sgpt. 58 | return true 59 | } 60 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: apierrorbudgetburn 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: ["get", "watch", "list"] 6 | apiGroups: ["*"] 7 | resources: ["*"] 8 | customerDataAccess: true 9 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing apierrorbudgetburn Investigation 2 | 3 | TODO: 4 | - Add a test script or test objects to this directory for future maintainers to use 5 | - Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. 
Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc) 6 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/README.md: -------------------------------------------------------------------------------- 1 | # cannotretrieveupdatessre Investigation 2 | 3 | Investigates the CannotRetrieveUpdatesSRE alert by running network verifier and updating the notes with investigation details to the PagerDuty alert about the cluster version status. 4 | 5 | ## Investigation Logic 6 | 7 | The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks: 8 | 1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints. 9 | 2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`. 
10 | 11 | ## Testing 12 | 13 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 14 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/cannotretrieveupdatessre.go: -------------------------------------------------------------------------------- 1 | package cannotretrieveupdatessre 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "strings" 8 | 9 | configv1 "github.com/openshift/api/config/v1" 10 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 11 | k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" 12 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 13 | "github.com/openshift/configuration-anomaly-detection/pkg/networkverifier" 14 | "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" 15 | "sigs.k8s.io/controller-runtime/pkg/client" 16 | ) 17 | 18 | const ( 19 | alertname = "CannotRetrieveUpdatesSRE" 20 | remediationName = "CannotRetrieveUpdatesSRE" 21 | ) 22 | 23 | type Investigation struct{} 24 | 25 | // Run executes the investigation for the CannotRetrieveUpdatesSRE alert 26 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 27 | result := investigation.InvestigationResult{} 28 | notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger) 29 | k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) 30 | if err != nil { 31 | return result, fmt.Errorf("unable to initialize k8s cli: %w", err) 32 | } 33 | defer func() { 34 | deferErr := k8scli.Clean() 35 | if deferErr != nil { 36 | logging.Error(deferErr) 37 | err = errors.Join(err, deferErr) 38 | } 39 | }() 40 | 41 | // Run network verifier 42 | verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient) 43 | if err != nil { 44 | 
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error()) 45 | } else { 46 | switch verifierResult { 47 | case networkverifier.Failure: 48 | result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil} 49 | notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason) 50 | case networkverifier.Success: 51 | notes.AppendSuccess("Network verifier passed") 52 | } 53 | } 54 | 55 | // Check ClusterVersion 56 | clusterVersion, err := getClusterVersion(k8scli) 57 | if err != nil { 58 | notes.AppendWarning("Failed to get ClusterVersion: %s", err.Error()) 59 | } else { 60 | notes.AppendSuccess("ClusterVersion found: %s", clusterVersion.Status.Desired.Version) 61 | 62 | failureReason := getUpdateRetrievalFailures(clusterVersion) 63 | if failureReason != "" { 64 | logging.Warnf("Detected ClusterVersion issue: %s", failureReason) 65 | notes.AppendWarning("ClusterVersion related issue detected: %s. 
Current version %s not found in channel %s", 66 | failureReason, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel) 67 | } 68 | } 69 | notes.AppendWarning("Alert escalated to on-call primary for review and please check the ClusterVersion.") 70 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 71 | } 72 | 73 | func getClusterVersion(k8scli client.Client) (*configv1.ClusterVersion, error) { 74 | clusterVersion := &configv1.ClusterVersion{} 75 | err := k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion) 76 | if err != nil { 77 | return nil, fmt.Errorf("failed to get ClusterVersion: %w", err) 78 | } 79 | return clusterVersion, nil 80 | } 81 | 82 | // getUpdateRetrievalFailures checks for update retrieval failures in the ClusterVersion 83 | func getUpdateRetrievalFailures(clusterVersion *configv1.ClusterVersion) string { 84 | for _, condition := range clusterVersion.Status.Conditions { 85 | msg, found := checkCondition(condition) 86 | if found { 87 | return msg 88 | } 89 | } 90 | return "" 91 | } 92 | 93 | func checkCondition(condition configv1.ClusterOperatorStatusCondition) (string, bool) { 94 | if condition.Type != "RetrievedUpdates" { 95 | return "", false 96 | } 97 | if condition.Status == configv1.ConditionFalse { 98 | return fmt.Sprintf("(Reason: %s). 
%s", condition.Reason, condition.Message), true 99 | } 100 | return "", false 101 | } 102 | 103 | func (i *Investigation) Name() string { 104 | return alertname 105 | } 106 | 107 | func (i *Investigation) Description() string { 108 | return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname) 109 | } 110 | 111 | func (i *Investigation) ShouldInvestigateAlert(alert string) bool { 112 | return strings.Contains(alert, alertname) 113 | } 114 | 115 | func (i *Investigation) IsExperimental() bool { 116 | return true 117 | } 118 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: cannotretrieveupdatessre 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusterversions 12 | customerDataAccess: false 13 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing CannotRetrieveUpdatesSRE Investigation 2 | 3 | ### Update the ClusterVersion Channel 4 | - Below script helps to set the test channel to check the clusterversion change. 
5 | ```sh 6 | #!/bin/bash 7 | 8 | # Use test channel for the ClusterVersion 9 | oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18-test"}}' --as backplane-cluster-admin 10 | sleep 30 11 | 12 | # Verify 13 | oc get clusterversion version -o jsonpath='{.spec.channel}' | grep "stable-4.18-test" || { echo "Failed to set the channel"; exit 1; } 14 | 15 | # Optional: Revert back to the original change 16 | #oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18"}}' --as backplane-cluster-admin 17 | ``` 18 | -------------------------------------------------------------------------------- /pkg/investigations/ccam/ccam_test.go: -------------------------------------------------------------------------------- 1 | package ccam 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | 7 | investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 8 | ) 9 | 10 | func TestEvaluateRandomError(t *testing.T) { 11 | timeoutError := errors.New("credentials are there, error is different: timeout") 12 | input := investigation.Resources{ 13 | Cluster: nil, 14 | ClusterDeployment: nil, 15 | AwsClient: nil, 16 | OcmClient: nil, 17 | PdClient: nil, 18 | AdditionalResources: map[string]interface{}{ 19 | "error": errors.New("timeout"), 20 | }, 21 | } 22 | 23 | inv := Investigation{} 24 | 25 | _, err := inv.Run(&input) 26 | if err.Error() != timeoutError.Error() { 27 | t.Fatalf("Expected error %v, but got %v", timeoutError, err) 28 | } 29 | } 30 | 31 | func TestCustomerRemovedPermissions(t *testing.T) { 32 | tests := []struct { 33 | name string 34 | errorMessage string 35 | expectedMatch bool 36 | }{ 37 | { 38 | name: "Matching error 1", 39 | errorMessage: "unable to query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster 28testqvq0jpo1hsrch6gvbc0123test: failed to get STS Support Jump Role for cluster 
28testqvq0jpo1hsrch6gvbc0qgqtest, status is 404, identifier is '404', code is 'CLUSTERS-MGMT-404' and operation identifier is 'teste1d1-3844-46f7-82d4-643c5aeeca53': Failed to find trusted relationship to support role 'RH-Technical-Support-Access'", 40 | expectedMatch: true, 41 | }, 42 | { 43 | name: "Matching error 2", 44 | errorMessage: "unable to query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster test9tm92uu49s29plim5dn1sbc1test: failed to get STS Support Jump Role for cluster test9tm92uu49s29plim5dn1sbc1test, status is 404, identifier is '404', code is 'CLUSTERS-MGMT-404' and operation identifier is 'testf5f3-6591-452f-98cb-3943edf4test': Support role, used with cluster 'test9tm92uu49s29plim5dn1sbc1test', does not exist in the customer's AWS account", 45 | expectedMatch: true, 46 | }, 47 | { 48 | name: "Matching error 3", 49 | errorMessage: "something could not assume support role in customer's account: AccessDenied: something", 50 | expectedMatch: true, 51 | }, 52 | { 53 | name: "Matching error 4", 54 | errorMessage: "unable to query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster : failed to get STS Support Jump Role for cluster , status is 400, identifier is '400', code is 'CLUSTERS-MGMT-400' and operation identifier is '': Please make sure IAM role 'arn:aws:iam:::role/ManagedOpenShift-Installer-Role' exists, and add 'arn:aws:iam:::role/RH-Managed-OpenShift-Installer' to the trust policy on IAM role 'arn:aws:iam:::role/ManagedOpenShift-Installer-Role': Failed to assume role: User: arn:aws:sts:::assumed-role/RH-Managed-OpenShift-Installer/OCM is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam:::role/ManagedOpenShift-Installer-Role", 55 | expectedMatch: true, 56 | }, 57 | { 58 | name: "Matching error 5", 59 | errorMessage: "unable to 
query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster : failed to get STS Support Jump Role for cluster , status is 400, identifier is '400', code is 'CLUSTERS-MGMT-400' and operation identifier is '': Failed to get role: User: arn:aws:sts:::assumed-role/ManagedOpenShift-Installer-Role/OCM is not authorized to perform: iam:GetRole on resource: role ManagedOpenShift-Support-Role because no identity-based policy allows the iam:GetRole action", 60 | expectedMatch: true, 61 | }, 62 | { 63 | name: "Non-matching error", 64 | errorMessage: "Some timeout error", 65 | expectedMatch: false, 66 | }, 67 | } 68 | 69 | for _, tt := range tests { 70 | t.Run(tt.name, func(t *testing.T) { 71 | match := customerRemovedPermissions(tt.errorMessage) 72 | if match != tt.expectedMatch { 73 | t.Errorf("customerRemovedPermissions() = %v, expectedMatch %v", match, tt.expectedMatch) 74 | } 75 | }) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/README.md: -------------------------------------------------------------------------------- 1 | # ClusterHasGoneMissing Investigation 2 | 3 | ## Alert firing investigation 4 | 5 | 1. PagerDuty webhook receives CHGM alert from Dead Man's Snitch. 6 | 2. CAD Tekton pipeline is triggered via PagerDuty sending a webhook to Tekton EventListener. 7 | 3. Logs into AWS account of cluster and checks for stopped/terminated instances. 8 | - If unable to access AWS account, posts "cluster credentials are missing" limited support reason. 9 | 4. If stopped/terminated instances are found, pulls AWS CloudTrail events for those instances. 10 | - If no stopped/terminated instances are found, escalates to SRE for further investigation. 11 | 5. If the user of the event is: 12 | - Authorized (SRE or OSD managed), runs the network verifier and escalates the alert to SRE for futher investigation. 
13 | - **Note:** Authorized users have prefix RH-SRE, osdManagedAdmin, or have the ManagedOpenShift-Installer-Role. 14 | - Not authorized (not SRE or OSD managed), posts the appropriate limited support reason and silences the alert. 15 | 6. Adds notes with investigation details to the PagerDuty alert. 16 | 17 | ## CHGM investigation overview 18 | 19 | ![CHGM investigation overview](./images/cad_chgm_investigation/chgm_investigation_dark.png#gh-dark-mode-only) 20 | ![CHGM investigation overview](./images/cad_chgm_investigation/chgm_investigation_light.png#gh-light-mode-only) 21 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/chgm_hibernation_check.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "sort" 5 | "time" 6 | 7 | cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 8 | servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" 9 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 10 | ) 11 | 12 | const recentWakeupTime = 2 * time.Hour 13 | 14 | // 30 Days is always a problem as kubelet certificates will be expired 15 | const hibernationTooLong = 30 * 24 * time.Hour 16 | 17 | const ( 18 | hibernationStartEvent = "cluster_state_hibernating" 19 | hibernationEndEvent = "cluster_state_ready" 20 | ) 21 | 22 | // const hibernationOngoingEvent = "cluster_state_hibernating" 23 | // const hibernationResumeEvent = "cluster_state_resuming" 24 | 25 | type hibernationPeriod struct { 26 | HibernationDuration time.Duration 27 | DehibernationTime time.Time 28 | } 29 | 30 | func hibernatedTooLong(hibernations []*hibernationPeriod, now time.Time) bool { 31 | if len(hibernations) == 0 { 32 | return false 33 | } 34 | latestHibernation := hibernations[len(hibernations)-1] 35 | // The cluster was woken up within the RECENT_WAKEUP_TIME which might 36 | // indicate a CSR problem. 
37 | if now.Sub(latestHibernation.DehibernationTime) >= recentWakeupTime { 38 | return false 39 | } 40 | // Only clusters that have hibernated for a long time are susceptible to 41 | // have cert issues. 42 | if latestHibernation.HibernationDuration >= hibernationTooLong { 43 | return true 44 | } 45 | return false 46 | } 47 | 48 | func getHibernationStatusForCluster(ocmClient ocm.Client, cluster *cmv1.Cluster) ([]*hibernationPeriod, error) { 49 | filter := "log_type='cluster-state-updates'" 50 | clusterStateUpdates, err := ocmClient.GetServiceLog(cluster, filter) 51 | if err != nil { 52 | return nil, err 53 | } 54 | return createHibernationTimeLine(clusterStateUpdates.Items().Slice()), nil 55 | } 56 | 57 | func createHibernationTimeLine(clusterStateUpdates []*servicelogsv1.LogEntry) []*hibernationPeriod { 58 | var hibernations []*hibernationPeriod 59 | 60 | var hibernationStartTime time.Time 61 | var hibernationEndTime time.Time 62 | sort.SliceStable(clusterStateUpdates, func(i, j int) bool { 63 | return clusterStateUpdates[i].Timestamp().Before(clusterStateUpdates[j].Timestamp()) 64 | }) 65 | for _, stateUpdate := range clusterStateUpdates { 66 | event := stateUpdate.Summary() 67 | date := stateUpdate.Timestamp() 68 | if event == hibernationStartEvent { 69 | hibernationStartTime = date 70 | } 71 | if event == hibernationEndEvent { 72 | if (time.Time.Equal(hibernationStartTime, time.Time{})) { 73 | // Cluster became ready after installation 74 | continue 75 | } 76 | hibernationEndTime = date 77 | hibernation := &hibernationPeriod{ 78 | DehibernationTime: hibernationEndTime, 79 | HibernationDuration: hibernationEndTime.Sub(hibernationStartTime), 80 | } 81 | hibernations = append(hibernations, hibernation) 82 | } 83 | } 84 | // Would be an ongoing hibernation 85 | // if (hibernationStartTime != time.Time{} && hibernationEndTime == time.Time{}) { 86 | // hibernations = append(hibernations, &HibernationPeriod{}) 87 | // } 88 | return hibernations 89 | } 90 | 
-------------------------------------------------------------------------------- /pkg/investigations/chgm/chgm_hibernation_check_test.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | "time" 7 | 8 | servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" 9 | ) 10 | 11 | func TestCreateHibernationTimeLine(t *testing.T) { 12 | type args struct { 13 | clusterStateUpdates []*servicelogsv1.LogEntry 14 | } 15 | hibernationStartTime := time.Date(2023, 0o1, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local) 16 | hibernationStopTime := time.Date(2023, 0o2, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local) 17 | hibernationStart, _ := servicelogsv1.NewLogEntry().Timestamp(hibernationStartTime).Summary(hibernationStartEvent).Build() 18 | hibernationEnd, _ := servicelogsv1.NewLogEntry().Timestamp(hibernationStopTime).Summary(hibernationEndEvent).Build() 19 | var emptyHibernationSlice []*hibernationPeriod 20 | tests := []struct { 21 | name string 22 | args args 23 | want []*hibernationPeriod 24 | }{ 25 | { 26 | name: "Hibernation with start and end", 27 | args: args{ 28 | clusterStateUpdates: []*servicelogsv1.LogEntry{ 29 | hibernationStart, 30 | hibernationEnd, 31 | }, 32 | }, 33 | want: []*hibernationPeriod{ 34 | { 35 | HibernationDuration: hibernationStopTime.Sub(hibernationStartTime), 36 | DehibernationTime: hibernationStopTime, 37 | }, 38 | }, 39 | }, 40 | { 41 | name: "Hibernation without end is not part of the return", 42 | args: args{ 43 | clusterStateUpdates: []*servicelogsv1.LogEntry{ 44 | hibernationStart, 45 | }, 46 | }, 47 | want: emptyHibernationSlice, 48 | }, 49 | { 50 | name: "Hibernation without start is not part of the return", 51 | args: args{ 52 | clusterStateUpdates: []*servicelogsv1.LogEntry{ 53 | hibernationEnd, 54 | }, 55 | }, 56 | want: emptyHibernationSlice, 57 | }, 58 | } 59 | for _, tt := range tests { 60 | t.Run(tt.name, func(t *testing.T) { 61 | if got := 
createHibernationTimeLine(tt.args.clusterStateUpdates); !reflect.DeepEqual(got, tt.want) { 62 | t.Errorf("CreateHibernationTimeLine() = %v, want %v", got, tt.want) 63 | } 64 | }) 65 | } 66 | } 67 | 68 | func TestHibernatedTooLong(t *testing.T) { 69 | type args struct { 70 | hibernations []*hibernationPeriod 71 | now time.Time 72 | } 73 | hibernationStartTime := time.Date(2023, 0o1, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local) 74 | hibernationShortStopTime := time.Date(2023, 0o1, 11, 0o0, 0o0, 0o0, 0o0, time.Local) 75 | hibernationLongStopTime := time.Date(2023, 0o2, 11, 0o0, 0o0, 0o0, 0o0, time.Local) 76 | shortHibernation := &hibernationPeriod{ 77 | HibernationDuration: hibernationShortStopTime.Sub(hibernationStartTime), 78 | DehibernationTime: hibernationShortStopTime, 79 | } 80 | longHibernation := &hibernationPeriod{ 81 | HibernationDuration: hibernationLongStopTime.Sub(hibernationStartTime), 82 | DehibernationTime: hibernationLongStopTime, 83 | } 84 | tests := []struct { 85 | name string 86 | args args 87 | want bool 88 | }{ 89 | // TODO: Add test cases. 
90 | { 91 | name: "Cluster that hibernated for 10 days is ok", 92 | args: args{ 93 | hibernations: []*hibernationPeriod{shortHibernation}, 94 | now: hibernationShortStopTime.Add(1 * time.Hour), 95 | }, 96 | want: false, 97 | }, 98 | { 99 | name: "Cluster that hibernated for 30+ days is too long", 100 | args: args{ 101 | hibernations: []*hibernationPeriod{longHibernation}, 102 | now: hibernationLongStopTime.Add(1 * time.Hour), 103 | }, 104 | want: true, 105 | }, 106 | { 107 | name: "Cluster that never hibernated is ok", 108 | args: args{}, 109 | want: false, 110 | }, 111 | { 112 | name: "Cluster that woke up for 2+ hours ago ok", 113 | args: args{ 114 | hibernations: []*hibernationPeriod{longHibernation}, 115 | now: hibernationLongStopTime.Add(3 * time.Hour), 116 | }, 117 | want: false, 118 | }, 119 | } 120 | for _, tt := range tests { 121 | t.Run(tt.name, func(t *testing.T) { 122 | got := hibernatedTooLong(tt.args.hibernations, tt.args.now) 123 | if got != tt.want { 124 | t.Errorf("HibernatedTooLong() = %v, want %v", got, tt.want) 125 | } 126 | }) 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/chgm_suite_test.go: -------------------------------------------------------------------------------- 1 | package chgm_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestChgm(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Chgm Suite") 13 | } 14 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/util.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 7 | ) 8 | 9 | func createEgressSL(blockedUrls string) *ocm.ServiceLog { 10 | description := fmt.Sprintf("Your cluster requires you to take action. 
SRE has observed that there have been changes made to the network configuration which impacts normal working of the cluster, including lack of network egress to these internet-based resources which are required for the cluster operation and support: %s. Please revert changes, and refer to documentation regarding firewall requirements for PrivateLink clusters: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/prepare_your_environment/rosa-sts-aws-prereqs#osd-aws-privatelink-firewall-prerequisites_rosa-sts-aws-prereqs#.", blockedUrls) 11 | 12 | egressSL := ocm.ServiceLog{ 13 | Severity: "Critical", 14 | Summary: "Action required: Network misconfiguration", 15 | ServiceName: "SREManualAction", 16 | Description: description, 17 | InternalOnly: false, 18 | } 19 | 20 | return &egressSL 21 | } 22 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/util_test.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 8 | "gotest.tools/v3/assert" 9 | ) 10 | 11 | // Mock data 12 | var blockedUrls = "example.com, test.com" 13 | 14 | // TestCreateEgressSL tests the createEgressSL function 15 | func TestCreateEgressSL(t *testing.T) { 16 | expectedDescription := fmt.Sprintf( 17 | "Your cluster requires you to take action. SRE has observed that there have been changes made to the network configuration which impacts normal working of the cluster, including lack of network egress to these internet-based resources which are required for the cluster operation and support: %s. 
Please revert changes, and refer to documentation regarding firewall requirements for PrivateLink clusters: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/prepare_your_environment/rosa-sts-aws-prereqs#osd-aws-privatelink-firewall-prerequisites_rosa-sts-aws-prereqs#.", 18 | blockedUrls, 19 | ) 20 | 21 | expected := &ocm.ServiceLog{ 22 | Severity: "Critical", 23 | Summary: "Action required: Network misconfiguration", 24 | ServiceName: "SREManualAction", 25 | Description: expectedDescription, 26 | InternalOnly: false, 27 | } 28 | 29 | result := createEgressSL(blockedUrls) 30 | assert.Equal(t, *expected, *result) 31 | } 32 | -------------------------------------------------------------------------------- /pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn_test.go: -------------------------------------------------------------------------------- 1 | package clustermonitoringerrorbudgetburn 2 | 3 | import ( 4 | "testing" 5 | 6 | configv1 "github.com/openshift/api/config/v1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | var ( 11 | statusConditionAvailable = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "True"} 12 | statusConditionUpgradeable = configv1.ClusterOperatorStatusCondition{Type: "Upgradeable", Status: "True"} 13 | statusConditionUnavailableSymptomsMatch = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "False", Message: `the User Workload Configuration from "config.yaml" key in the "openshift-user-workload-monitoring/user-workload-monitoring-config" ConfigMap could not be parsed`} 14 | ) 15 | 16 | func TestSymptomMatches(t *testing.T) { 17 | monitoringCo := configv1.ClusterOperator{ 18 | ObjectMeta: v1.ObjectMeta{Name: "monitoring"}, 19 | Status: configv1.ClusterOperatorStatus{ 20 | Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionUnavailableSymptomsMatch, statusConditionUpgradeable}, 21 | }, 22 | } 23 | if 
!isUWMConfigInvalid(&monitoringCo) { 24 | t.Fatal("expected symptoms to match") 25 | } 26 | } 27 | 28 | func TestSymptomNoMatch(t *testing.T) { 29 | monitoringCo := configv1.ClusterOperator{ 30 | ObjectMeta: v1.ObjectMeta{Name: "monitoring"}, 31 | Status: configv1.ClusterOperatorStatus{ 32 | Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionAvailable, statusConditionUpgradeable}, 33 | }, 34 | } 35 | if isUWMConfigInvalid(&monitoringCo) { 36 | t.Fatal("expected symptoms to not match") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pkg/investigations/clustermonitoringerrorbudgetburn/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: clustermonitoringerrorbudgetburn 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusteroperators 12 | customerDataAccess: false 13 | -------------------------------------------------------------------------------- /pkg/investigations/insightsoperatordown/insightsoperatordown_test.go: -------------------------------------------------------------------------------- 1 | package insightsoperatordown 2 | 3 | import ( 4 | "testing" 5 | 6 | configv1 "github.com/openshift/api/config/v1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | func TestIsOCPBUG22226(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | co configv1.ClusterOperator 14 | expected bool 15 | }{ 16 | { 17 | name: "SCA certs pull failure detected", 18 | co: configv1.ClusterOperator{ 19 | ObjectMeta: v1.ObjectMeta{Name: "insights"}, 20 | Status: configv1.ClusterOperatorStatus{ 21 | Conditions: []configv1.ClusterOperatorStatusCondition{ 22 | {Type: "SCAAvailable", Message: "Failed to pull SCA certs"}, 23 | }, 24 | }, 25 | }, 26 | expected: true, 27 | }, 28 | { 29 | name: "No SCA certs pull failure", 30 | co: configv1.ClusterOperator{ 
31 | ObjectMeta: v1.ObjectMeta{Name: "insights"}, 32 | Status: configv1.ClusterOperatorStatus{ 33 | Conditions: []configv1.ClusterOperatorStatusCondition{ 34 | {Type: "SCAAvailable", Message: "All systems operational"}, 35 | }, 36 | }, 37 | }, 38 | expected: false, 39 | }, 40 | } 41 | 42 | for _, tt := range tests { 43 | t.Run(tt.name, func(t *testing.T) { 44 | if isOCPBUG22226(&tt.co) != tt.expected { 45 | t.Fatalf("expected %v, got %v", tt.expected, !tt.expected) 46 | } 47 | }) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pkg/investigations/insightsoperatordown/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: insightsoperatordown 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusteroperators 12 | customerDataAccess: false 13 | -------------------------------------------------------------------------------- /pkg/investigations/insightsoperatordown/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing InsightsOperatorDownSRE 2 | 3 | # OCPBUGS-22226 4 | 5 | We can induce the symptom of `Failed to pull SCA certs` on a stage cluster by blocking `https://api.stage.openshift.com` 6 | The provided script creates a Rule Group and associates it with your clusters VPC. 
#!/bin/bash
# Block https://api.stage.openshift.com for a cluster's VPC using a Route 53
# Resolver DNS Firewall rule, to reproduce the insights operator
# "Failed to pull SCA certs" symptom (OCPBUGS-22226).
#
# Usage: ./block-api-openshift.sh <cluster-id>
# Requires: awscli, jq, ocm backplane
set -eo pipefail
set -x

if [ $# -ne 1 ]; then
	echo "usage: $0 <cluster-id>" >&2
	exit 1
fi
CLUSTER_ID="$1"

# Must be exported so the aws subprocesses actually see it (a plain
# assignment would only affect this shell).
export AWS_PAGER=""

# The credentials command prints "export KEY=VALUE" lines; they must be
# eval'd, not executed via bare command substitution, to take effect here.
eval "$(ocm backplane cloud credentials -o env "$CLUSTER_ID")"
export AWS_REGION=$(ocm describe cluster "$CLUSTER_ID" --json | jq -r '.region.id')

# Create a DNS firewall rule group with a single BLOCK rule for the domain.
FW_RULE_GROUP_ID=$(aws route53resolver create-firewall-rule-group --name "api stage openshift com" | jq -r '.FirewallRuleGroup.Id')
FW_DOMAIN_LIST_ID=$(aws route53resolver create-firewall-domain-list --name "api stage openshift com" | jq -r '.FirewallDomainList.Id')
aws route53resolver update-firewall-domains --firewall-domain-list-id "$FW_DOMAIN_LIST_ID" --domains "api.stage.openshift.com" --operation "ADD"
aws route53resolver create-firewall-rule --firewall-rule-group-id "$FW_RULE_GROUP_ID" --firewall-domain-list-id "$FW_DOMAIN_LIST_ID" --priority "1" --action "BLOCK" --block-response "NODATA" --name "api stage openshift com"

# Associate the rule group with the cluster's VPC, found via the infra-id tag.
INFRA_ID=$(ocm describe cluster "$CLUSTER_ID" --json | jq -r '.infra_id')
VPC_ID=$(aws ec2 describe-vpcs --filters "Name=tag-key,Values=kubernetes.io/cluster/$INFRA_ID" | jq -r '.Vpcs[0].VpcId')
aws route53resolver associate-firewall-rule-group --firewall-rule-group-id "$FW_RULE_GROUP_ID" --name "rgassoc-$VPC_ID-$FW_RULE_GROUP_ID" --priority "1001" --vpc-id "$VPC_ID"
// Package investigation contains base functions for investigations
package investigation

import (
	cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
	"github.com/openshift/configuration-anomaly-detection/pkg/aws"
	"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
	"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
	"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
	hivev1 "github.com/openshift/hive/apis/hive/v1"
)

// InvestigationStep records the outcome of a single remediation step taken
// during an investigation.
type InvestigationStep struct {
	// Performed reports whether the step was carried out.
	Performed bool
	// Labels carries free-form tags describing the step (may be nil).
	Labels []string
}

// InvestigationResult summarizes which remediation actions an investigation
// performed or prepared.
type InvestigationResult struct {
	LimitedSupportSet  InvestigationStep
	ServiceLogPrepared InvestigationStep
	ServiceLogSent     InvestigationStep
}

// Investigation is the interface every alert investigation implements.
type Investigation interface {
	// Run executes the investigation against the given resources.
	Run(resources *Resources) (InvestigationResult, error)
	// Please note that when adding an investigation the name and the directory currently need to be the same,
	// so that backplane-api can fetch the metadata.yaml
	Name() string
	Description() string
	// IsExperimental reports whether this investigation should only run when
	// experimental investigations are enabled.
	IsExperimental() bool
	// ShouldInvestigateAlert reports whether this investigation applies to
	// the given alert title.
	ShouldInvestigateAlert(string) bool
}

// Resources holds all resources/tools required for alert investigations
type Resources struct {
	Name              string
	Cluster           *cmv1.Cluster
	ClusterDeployment *hivev1.ClusterDeployment
	AwsClient         aws.Client
	OcmClient         ocm.Client
	PdClient          pagerduty.Client
	// Notes accumulates investigation findings for the pagerduty incident.
	Notes *notewriter.NoteWriter
	// AdditionalResources carries investigation-specific extras — keys and
	// value types vary per investigation; TODO confirm expected contents
	// against each caller.
	AdditionalResources map[string]interface{}
}
-------------------------------------------------------------------------------- 1 | name: MachineHealthCheckUnterminatedShortCircuitSRE 2 | rbac: 3 | roles: 4 | - namespace: "openshift-machine-api" 5 | rules: 6 | - verbs: 7 | - "get" 8 | - "list" 9 | apiGroups: 10 | - "machine.openshift.io" 11 | resources: 12 | - "machines" 13 | - "machinehealthchecks" 14 | clusterRoleRules: 15 | - verbs: 16 | - "get" 17 | - "list" 18 | apiGroups: 19 | - "" 20 | resources: 21 | - "nodes" 22 | customerDataAccess: false 23 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/recommendation.go: -------------------------------------------------------------------------------- 1 | /* 2 | machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert 3 | */ 4 | package machinehealthcheckunterminatedshortcircuitsre 5 | 6 | import ( 7 | "fmt" 8 | 9 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine" 10 | ) 11 | 12 | // machineRecommendations categorizes each machine's individual investigation summary into a recommended course of action 13 | type investigationRecommendations map[recommendedAction][]investigationResult 14 | 15 | func (r investigationRecommendations) addRecommendation(action recommendedAction, object string, notes string) { 16 | recommendation := investigationResult{ 17 | object: object, 18 | notes: notes, 19 | } 20 | r[action] = append(r[action], recommendation) 21 | } 22 | 23 | // summarize prints the machine investigationRecommendations into a human read-able format. 
24 | func (r investigationRecommendations) summarize() string { 25 | msg := "" 26 | for recommendation, investigations := range r { 27 | msg += fmt.Sprintf("%s:\n", recommendation) 28 | 29 | if recommendation == recommendationDeleteMachine { 30 | // Consolidate all machine deletion requests into a single oc command for ease of use 31 | deleteCmd := fmt.Sprintf("oc delete machine -n %s", machine.MachineNamespace) 32 | for _, investigation := range investigations { 33 | msg += fmt.Sprintf("- %s\n", investigation.String()) 34 | deleteCmd += " " + investigation.object 35 | } 36 | msg += fmt.Sprintf("to delete these machines, run:\n\n%s\n", deleteCmd) 37 | } else { 38 | for _, investigation := range investigations { 39 | msg += fmt.Sprintf("- %s\n", investigation.String()) 40 | } 41 | } 42 | 43 | msg += "\n" 44 | } 45 | return msg 46 | } 47 | 48 | type investigationResult struct { 49 | // name indicates which object was investigated 50 | object string 51 | // notes provides a high-level summary of the investigation results 52 | notes string 53 | } 54 | 55 | func (s *investigationResult) String() string { 56 | msg := fmt.Sprintf("%q: %s", s.object, s.notes) 57 | return msg 58 | } 59 | 60 | // recommendedAction acts as both a key in the investigationRecommendations map, as well as a header for pagerduty notes when summarize()-ing 61 | type recommendedAction string 62 | 63 | const ( 64 | // recommendationDeleteMachine indicates that the machine(s) in question should be deleted so the machine-api can reprovision them 65 | recommendationDeleteMachine recommendedAction = "delete the following machines" 66 | // recommendationInvestigateMachine indicates that the machine(s) in question need to be manually investigated 67 | recommendationInvestigateMachine recommendedAction = "investigate the following machines" 68 | // recommendationQuotaServiceLog indicates that the machine(s) in question need to be remediated by the customer, and SRE should notify them 69 | // of that fact 
via servicelog 70 | recommendationQuotaServiceLog recommendedAction = "send a service log regarding quota issues for the following machines" 71 | // recommendationInvestigateNode indicates that the machine's node object is reporting problems which require human intervention to resolve 72 | recommendationInvestigateNode recommendedAction = "investigate the following nodes" 73 | ) 74 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/srep-worker-healthcheck_machinehealthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machine.openshift.io/v1beta1 2 | kind: MachineHealthCheck 3 | metadata: 4 | name: srep-worker-healthcheck 5 | namespace: openshift-machine-api 6 | spec: 7 | maxUnhealthy: 0 8 | nodeStartupTimeout: 25m 9 | selector: 10 | matchExpressions: 11 | - key: machine.openshift.io/cluster-api-machine-role 12 | operator: NotIn 13 | values: 14 | - infra 15 | - master 16 | - key: machine.openshift.io/cluster-api-machineset 17 | operator: Exists 18 | - key: machine.openshift.io/instance-type 19 | operator: NotIn 20 | values: 21 | - m5.metal 22 | - m5d.metal 23 | - m5n.metal 24 | - m5dn.metal 25 | - m5zn.metal 26 | - m6a.metal 27 | - m6i.metal 28 | - m6id.metal 29 | - r5.metal 30 | - r5d.metal 31 | - r5n.metal 32 | - r5dn.metal 33 | - r6a.metal 34 | - r6i.metal 35 | - r6id.metal 36 | - x2iezn.metal 37 | - z1d.metal 38 | - c5.metal 39 | - c5d.metal 40 | - c5n.metal 41 | - c6a.metal 42 | - c6i.metal 43 | - c6id.metal 44 | - i3.metal 45 | - i3en.metal 46 | - r7i.48xlarge 47 | unhealthyConditions: 48 | - status: "False" 49 | timeout: 10s 50 | type: Ready 51 | - status: Unknown 52 | timeout: 10s 53 | type: Ready 54 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_pdb.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | name: test-cad 5 | namespace: default 6 | spec: 7 | maxUnavailable: 0 8 | selector: 9 | matchLabels: 10 | app: "test-cad" 11 | unhealthyPodEvictionPolicy: AlwaysAllow 12 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: "test-cad" 6 | name: test-cad 7 | namespace: default 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: "test-cad" 13 | template: 14 | metadata: 15 | labels: 16 | app: "test-cad" 17 | spec: 18 | affinity: 19 | nodeAffinity: 20 | preferredDuringSchedulingIgnoredDuringExecution: 21 | - preference: 22 | matchExpressions: 23 | - key: node-role.kubernetes.io/worker 24 | operator: Exists 25 | weight: 1 26 | containers: 27 | - command: 28 | - "sleep" 29 | - "infinity" 30 | image: "quay.io/app-sre/ubi8-ubi:latest" 31 | imagePullPolicy: IfNotPresent 32 | name: test 33 | restartPolicy: Always 34 | -------------------------------------------------------------------------------- /pkg/investigations/pruningcronjoberror/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: pruningcronjoberror 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusteroperators 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - pods 16 | - namespaces 17 | verbs: 18 | - get 19 | - list 20 | customerDataAccess: false 21 | -------------------------------------------------------------------------------- /pkg/investigations/registry.go: 
package investigations

import (
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/apierrorbudgetburn"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cpd"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/insightsoperatordown"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/upgradeconfigsyncfailureover4hr"
)

// availableInvestigations holds all Investigation implementations.
// Order matters: GetInvestigation returns the first entry whose
// ShouldInvestigateAlert matches the alert title.
var availableInvestigations = []investigation.Investigation{
	&apierrorbudgetburn.Investigation{},
	&ccam.Investigation{},
	// NOTE(review): "Investiation" mirrors the type name as declared in the
	// chgm package; fixing the spelling requires renaming it there as well.
	&chgm.Investiation{},
	&clustermonitoringerrorbudgetburn.Investigation{},
	&cpd.Investigation{},
	&insightsoperatordown.Investigation{},
	&upgradeconfigsyncfailureover4hr.Investigation{},
	&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
	&cannotretrieveupdatessre.Investigation{},
}

// GetInvestigation returns the first Investigation that applies to the given alert title.
// Experimental investigations are only considered when experimental is true.
// Returns nil when no investigation matches.
// This is a naive version that only returns the first matching investigation and ignores the rest.
// Future improvement is to use the proper mapping that can return multiple investigations
// linked to single alert type.
func GetInvestigation(title string, experimental bool) investigation.Investigation {
	for _, inv := range availableInvestigations {
		if inv.ShouldInvestigateAlert(title) && (experimental || !inv.IsExperimental()) {
			return inv
		}
	}
	return nil
}
Edit this value in a text editor and change the value after the colon. Leave the preceeding value before the colon as it is. 21 | 7. Do the encryption process detailed above backwards. First you'll need to encrypt your new pull-secret.dockerconfigjson.registry.connect.redhat.com.auth value (the one we just changed). Simply echo it on your command line and pipe it into base64. Place the whole value in single quotes to avoid any text parsing issues. 22 | 23 | `echo $changed_value | base64` 24 | 8. Replace that value in the registry.connect.redhat.com.auth value in your decrypted .dockerconfigjson you saved in step 4 then base64 encrypt the whole thing. Take that encrypted value and replace the encrypted .dockerconfigjson value in your broken_pull_secret.json file. 25 | 9. Apply the newly broken pull-secret json file to your cluster using oc apply. 26 | 27 | `oc apply -f broken_pull_secret.json --as backplane-cluster-admin` 28 | 10. Re run your test according to the CAD readme. This should return a warning in the logs `⚠️ Pull secret does not match on cluster and in OCM` and apply the same message to the pagerduty incident. 
-------------------------------------------------------------------------------- /pkg/investigations/upgradeconfigsyncfailureover4hr/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: upgradeconfigsyncfailureover4hr 2 | rbac: 3 | roles: 4 | - namespace: "openshift-config" 5 | rules: 6 | - verbs: 7 | - "get" 8 | apiGroups: 9 | - "" 10 | resources: 11 | - "secrets" 12 | resourceNames: 13 | - "pull-secret" 14 | customerDataAccess: false 15 | -------------------------------------------------------------------------------- /pkg/investigations/upgradeconfigsyncfailureover4hr/upgradeconfigsyncfailureover4hr.go: -------------------------------------------------------------------------------- 1 | // Package upgradeconfigsyncfailureover4hr contains functionality for the UpgradeConfigSyncFailureOver4HrSRE investigation 2 | package upgradeconfigsyncfailureover4hr 3 | 4 | import ( 5 | "context" 6 | "encoding/base64" 7 | "errors" 8 | "fmt" 9 | "strings" 10 | 11 | v1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" 12 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 13 | k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" 14 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 15 | "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" 16 | ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 17 | corev1 "k8s.io/api/core/v1" 18 | "k8s.io/apimachinery/pkg/types" 19 | "sigs.k8s.io/controller-runtime/pkg/client" 20 | ) 21 | 22 | type Investigation struct{} 23 | 24 | const ( 25 | alertname = "UpgradeConfigSyncFailureOver4HrSRE" 26 | remediationName = "upgradeconfigsyncfailureover4hr" 27 | ) 28 | 29 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 30 | result := investigation.InvestigationResult{} 31 | notes := notewriter.New("UpgradeConfigSyncFailureOver4Hr", 
logging.RawLogger) 32 | k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) 33 | if err != nil { 34 | return result, fmt.Errorf("unable to initialize k8s cli: %w", err) 35 | } 36 | defer func() { 37 | deferErr := k8scli.Clean() 38 | if deferErr != nil { 39 | logging.Error(deferErr) 40 | err = errors.Join(err, deferErr) 41 | } 42 | }() 43 | logging.Infof("Checking if user is Banned.") 44 | userBannedStatus, userBannedNotes, err := ocm.CheckIfUserBanned(r.OcmClient, r.Cluster) 45 | if err != nil { 46 | notes.AppendWarning("encountered an issue when checking if the cluster owner is banned: %s\nPlease investigate.", err) 47 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 48 | } 49 | if userBannedStatus { 50 | notes.AppendWarning(userBannedNotes) 51 | } else { 52 | notes.AppendSuccess("User is not banned.") 53 | } 54 | user, err := ocm.GetCreatorFromCluster(r.OcmClient.GetConnection(), r.Cluster) 55 | logging.Infof("User ID is: %v", user.ID()) 56 | clusterSecretToken, note, err := getClusterPullSecret(k8scli) 57 | if err != nil { 58 | notes.AppendWarning("Failre getting ClusterSecret: %s", err) 59 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 60 | } 61 | if note != "" { 62 | notes.AppendWarning(note) 63 | } 64 | registryCredential, err := ocm.GetOCMPullSecret(r.OcmClient.GetConnection(), user.ID()) 65 | if err != nil { 66 | notes.AppendWarning("Error getting OCMPullSecret: %s", err) 67 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 68 | } 69 | if clusterSecretToken == registryCredential { 70 | notes.AppendSuccess("Pull Secret matches on cluster and in OCM. 
Please continue investigation.") 71 | } else { 72 | notes.AppendWarning("Pull secret does not match on cluster and in OCM.") 73 | } 74 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 75 | } 76 | 77 | func getClusterPullSecret(k8scli client.Client) (secretToken string, note string, err error) { 78 | secret := &corev1.Secret{} 79 | err = k8scli.Get(context.TODO(), types.NamespacedName{ 80 | Namespace: "openshift-config", 81 | Name: "pull-secret", 82 | }, secret) 83 | if err != nil { 84 | return "", "", err 85 | } 86 | if secret.Data == nil { 87 | return "", "Cluster pull secret Data is empty.", err 88 | } 89 | secretValue, exists := secret.Data[".dockerconfigjson"] 90 | if !exists { 91 | return "", "Cluster pull secret does not contain the necessary .dockerconfigjson", err 92 | } 93 | 94 | dockerConfigJson, err := v1.UnmarshalAccessToken(secretValue) 95 | if err != nil { 96 | return "", "", err 97 | } 98 | _, exists = dockerConfigJson.Auths()["cloud.openshift.com"] 99 | if !exists { 100 | return "", "cloud.openshift.com value not found in clusterPullSecret. 
This means there is an issue with the pull secret on the cluster.", err 101 | } 102 | 103 | value, err := base64.StdEncoding.DecodeString(dockerConfigJson.Auths()["registry.connect.redhat.com"].Auth()) 104 | if err != nil { 105 | return "", "", err 106 | } 107 | _, splitValue, _ := strings.Cut(string(value), ":") 108 | return splitValue, "", nil 109 | } 110 | 111 | func (c *Investigation) Name() string { 112 | return "UpgradeConfigSyncFailureOver4hr" 113 | } 114 | 115 | func (c *Investigation) Description() string { 116 | return "Investigates the UpgradeConfigSyncFailureOver4hr alert" 117 | } 118 | 119 | func (c *Investigation) ShouldInvestigateAlert(alert string) bool { 120 | return strings.Contains(alert, "UpgradeConfigSyncFailureOver4HrSRE") 121 | } 122 | 123 | func (c *Investigation) IsExperimental() bool { 124 | return false 125 | } 126 | -------------------------------------------------------------------------------- /pkg/investigations/upgradeconfigsyncfailureover4hr/upgradeconfigsyncfailureover4hr_test.go: -------------------------------------------------------------------------------- 1 | package upgradeconfigsyncfailureover4hr 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 10 | ) 11 | 12 | func TestGetClusterPullSecret(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | data string 16 | secretToken string 17 | expectError bool 18 | expectedNote string 19 | }{ 20 | { 21 | name: "happy path", 22 | data: 
"{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"cloud.openshift.com\":{\"auth\":\"TestAuthValue\",\"email\":\"test_fake_email@redhat.com\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"fake-email@redhat.com\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"test_fake_email@redhat.com\"}}}", 23 | secretToken: "CorrectValue\n", 24 | expectError: false, 25 | }, 26 | { 27 | name: "Value mismatch", 28 | data: "{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"cloud.openshift.com\":{\"auth\":\"TestAuthValue\",\"email\":\"test_fake_email@redhat.com\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"fake-email@redhat.com\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"test_fake_email@redhat.com\"}}}", 29 | secretToken: "IncorrectValue\n", 30 | expectError: true, 31 | }, 32 | { 33 | name: "No entry for cloud.openshift.com", 34 | data: 
"{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"MissingValue\":{\"auth\":\"TestAuthValue\",\"email\":\"test_fake_email@redhat.com\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"fake-email@redhat.com\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"test_fake_email@redhat.com\"}}}", 35 | secretToken: "IncorrectValue\n", 36 | expectError: true, 37 | expectedNote: "cloud.openshift.com value not found in clusterPullSecret", 38 | }, 39 | } 40 | 41 | for _, tt := range tests { 42 | t.Run(tt.name, func(t *testing.T) { 43 | secretTest := &corev1.Secret{ 44 | ObjectMeta: v1.ObjectMeta{ 45 | Name: "pull-secret", 46 | Namespace: "openshift-config", 47 | }, 48 | Type: corev1.DockerConfigJsonKey, 49 | Data: map[string][]byte{ 50 | ".dockerconfigjson": []byte(tt.data), 51 | }, 52 | } 53 | k8scli := fake.NewClientBuilder().WithObjects(secretTest).Build() 54 | result, note, _ := getClusterPullSecret(k8scli) 55 | if result != tt.secretToken { 56 | if !strings.Contains(note, tt.expectedNote) { 57 | t.Errorf("Expected note message: %s. 
Got %s", tt.expectedNote, note) 58 | } 59 | if !tt.expectError { 60 | t.Errorf("expected token %s to match %s", result, tt.secretToken) 61 | } 62 | } 63 | }) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /pkg/investigations/utils/machine/machine.go: -------------------------------------------------------------------------------- 1 | package machine 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | machinev1beta1 "github.com/openshift/api/machine/v1beta1" 8 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node" 9 | corev1 "k8s.io/api/core/v1" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | "k8s.io/apimachinery/pkg/types" 12 | "sigs.k8s.io/controller-runtime/pkg/client" 13 | ) 14 | 15 | const ( 16 | MachineNamespace = "openshift-machine-api" 17 | RoleLabelKey = "machine.openshift.io/cluster-api-machine-role" 18 | WorkerRoleLabelValue = "worker" 19 | ) 20 | 21 | // HealthcheckRemediationAllowed searches the status conditions for the machinehealthcheck object and determines if remediation is allowed 22 | func HealthcheckRemediationAllowed(healthcheck machinev1beta1.MachineHealthCheck) bool { 23 | for _, condition := range healthcheck.Status.Conditions { 24 | if condition.Type == machinev1beta1.RemediationAllowedCondition && condition.Status == corev1.ConditionTrue { 25 | // Only rule out that the mhc is failing if we can both find the condition and determine its current status 26 | return true 27 | } 28 | } 29 | return false 30 | } 31 | 32 | // GetMachinesForMHC retrieves the machines managed by the given MachineHealthCheck object 33 | func GetMachinesForMHC(ctx context.Context, kclient client.Client, healthcheck machinev1beta1.MachineHealthCheck) ([]machinev1beta1.Machine, error) { 34 | machines := machinev1beta1.MachineList{} 35 | selector, err := metav1.LabelSelectorAsSelector(&healthcheck.Spec.Selector) 36 | if err != nil { 37 | return []machinev1beta1.Machine{}, 
fmt.Errorf("failed to convert machinehealthcheck %q .spec.selector: %w", healthcheck.Name, err) 38 | } 39 | err = kclient.List(ctx, &machines, client.MatchingLabelsSelector{Selector: selector}, &client.ListOptions{Namespace: MachineNamespace}) 40 | if err != nil { 41 | return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machines from machinehealthcheck %q: %w", healthcheck.Name, err) 42 | } 43 | return machines.Items, nil 44 | } 45 | 46 | // GetMachineRole returns the role of the given machine, if present. If not found, an error is returned 47 | func GetRole(machine machinev1beta1.Machine) (string, error) { 48 | role, found := machine.Labels[RoleLabelKey] 49 | if !found { 50 | return "", fmt.Errorf("expected label key %q not found", RoleLabelKey) 51 | } 52 | return role, nil 53 | } 54 | 55 | // GetNodesForMachines retrieves the nodes for the given machines. Errors encountered are joined, but do not block the retrieval of other machines 56 | func GetNodesForMachines(ctx context.Context, kclient client.Client, machines []machinev1beta1.Machine) ([]corev1.Node, error) { 57 | // Retrieving all nodes initially & filtering out irrelevant objects results in fewer API calls 58 | nodes, err := node.GetAll(ctx, kclient) 59 | if err != nil { 60 | return []corev1.Node{}, fmt.Errorf("failed to retrieve nodes: %w", err) 61 | } 62 | 63 | matches := []corev1.Node{} 64 | for _, machine := range machines { 65 | node, found := findMatchingNode(machine, nodes) 66 | if found { 67 | matches = append(matches, node) 68 | } 69 | } 70 | return matches, nil 71 | } 72 | 73 | // findMatchingNode retrieves the node owned by the provided machine, if one exists, along with a boolean indicating whether 74 | // the search succeeded 75 | func findMatchingNode(machine machinev1beta1.Machine, nodes []corev1.Node) (corev1.Node, bool) { 76 | if machine.Status.NodeRef == nil || machine.Status.NodeRef.Name == "" { 77 | return corev1.Node{}, false 78 | } 79 | for _, node := range nodes { 80 | 
if machine.Status.NodeRef.Name == node.Name { 81 | return node, true 82 | } 83 | } 84 | 85 | return corev1.Node{}, false 86 | } 87 | 88 | // GetNodeForMachine retrieves the node for the given machine. If the provided machine's .Status.NodeRef is empty, 89 | // an error is returned 90 | func GetNodeForMachine(ctx context.Context, kclient client.Client, machine machinev1beta1.Machine) (corev1.Node, error) { 91 | if machine.Status.NodeRef == nil || machine.Status.NodeRef.Name == "" { 92 | return corev1.Node{}, fmt.Errorf("no .Status.NodeRef defined for machine %q", machine.Name) 93 | } 94 | node := &corev1.Node{} 95 | err := kclient.Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name}, node) 96 | return *node, err 97 | } 98 | -------------------------------------------------------------------------------- /pkg/investigations/utils/node/node.go: -------------------------------------------------------------------------------- 1 | /* 2 | node defines investigation utility logic related to node objects 3 | */ 4 | package node 5 | 6 | import ( 7 | "context" 8 | "strings" 9 | 10 | corev1 "k8s.io/api/core/v1" 11 | "sigs.k8s.io/controller-runtime/pkg/client" 12 | ) 13 | 14 | const ( 15 | RoleLabelPrefix = "node-role.kubernetes.io" 16 | WorkerRoleSuffix = "worker" 17 | ) 18 | 19 | // FindNoScheduleTaint searches the node's taints to find one with effect: NoSchedule, if present. 
20 | // 21 | // If none is present, an empty taint and 'false' are returned 22 | func FindNoScheduleTaint(node corev1.Node) (corev1.Taint, bool) { 23 | for _, taint := range node.Spec.Taints { 24 | if taint.Effect == corev1.TaintEffectNoSchedule { 25 | return taint, true 26 | } 27 | } 28 | return corev1.Taint{}, false 29 | } 30 | 31 | // GetAll retrieves all nodes present in the cluster 32 | func GetAll(ctx context.Context, kclient client.Client) ([]corev1.Node, error) { 33 | nodes := corev1.NodeList{} 34 | err := kclient.List(ctx, &nodes) 35 | return nodes.Items, err 36 | } 37 | 38 | // FindReadyCondition searches a node's .Status for the NodeReady condition, and returns it alongside a boolean value which 39 | // indicates whether the condition was found or not 40 | func FindReadyCondition(node corev1.Node) (corev1.NodeCondition, bool) { 41 | for _, condition := range node.Status.Conditions { 42 | if condition.Type == corev1.NodeReady { 43 | return condition, true 44 | } 45 | } 46 | return corev1.NodeCondition{}, false 47 | } 48 | 49 | // GetRole returns the first label key on the provided node matching RoleLabelPrefix, and whether one was found 50 | func GetRole(node corev1.Node) (string, bool) { 51 | for label := range node.Labels { 52 | if strings.Contains(label, RoleLabelPrefix) { 53 | return label, true 54 | } 55 | } 56 | return "", false 57 | } 58 | -------------------------------------------------------------------------------- /pkg/k8s/client.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/openshift/backplane-cli/pkg/cli/config" 9 | bpremediation "github.com/openshift/backplane-cli/pkg/remediation" 10 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 11 | "k8s.io/client-go/rest" 12 | "sigs.k8s.io/controller-runtime/pkg/client" 13 | ) 14 | 15 | type Cleaner interface { 16 | Clean() error 17 | } 18 | 19 | type Client interface { 20 | client.Client 21 | Cleaner 22 | } 23 | 
24 | type clientImpl struct { 25 | client.Client 26 | Cleaner 27 | } 28 | 29 | // New returns a Kubernetes client for the given cluster scoped to a given remediation's permissions. 30 | func New(clusterID string, ocmClient ocm.Client, remediationName string) (kclient Client, err error) { 31 | cfg, err := NewCfg(clusterID, ocmClient, remediationName) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | cfgToClean := cfg 37 | defer func() { 38 | if cfgToClean != nil { 39 | deferErr := cfgToClean.Clean() 40 | if deferErr != nil { 41 | err = errors.Join(err, deferErr) 42 | } 43 | } 44 | }() 45 | 46 | scheme, err := initScheme() 47 | if err != nil { 48 | return nil, err 49 | } 50 | 51 | decoratedClient, err := client.New(&cfg.Config, client.Options{Scheme: scheme}) 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | cfgToClean = nil 57 | return clientImpl{decoratedClient, cfg}, nil 58 | } 59 | 60 | type Config struct { 61 | rest.Config 62 | Cleaner 63 | } 64 | 65 | type remediationCleaner struct { 66 | clusterID string 67 | ocmClient ocm.Client 68 | remediationInstanceId string 69 | } 70 | 71 | func (cleaner remediationCleaner) Clean() error { 72 | return deleteRemediation(cleaner.clusterID, cleaner.ocmClient, cleaner.remediationInstanceId) 73 | } 74 | 75 | // New returns a the k8s rest config for the given cluster scoped to a given remediation's permissions. 
76 | func NewCfg(clusterID string, ocmClient ocm.Client, remediationName string) (cfg *Config, err error) { 77 | backplaneURL := os.Getenv("BACKPLANE_URL") 78 | if backplaneURL == "" { 79 | return nil, fmt.Errorf("could not create new k8sclient: missing environment variable BACKPLANE_URL") 80 | } 81 | 82 | decoratedCfg, remediationInstanceId, err := bpremediation.CreateRemediationWithConn( 83 | config.BackplaneConfiguration{URL: backplaneURL}, 84 | ocmClient.GetConnection(), 85 | clusterID, 86 | remediationName, 87 | ) 88 | if err != nil { 89 | if isAPIServerUnavailable(err) { 90 | return nil, fmt.Errorf("%w: %w", ErrAPIServerUnavailable, err) 91 | } 92 | return nil, err 93 | } 94 | 95 | return &Config{*decoratedCfg, remediationCleaner{clusterID, ocmClient, remediationInstanceId}}, nil 96 | } 97 | 98 | // Cleanup removes the remediation created for the cluster. 99 | func deleteRemediation(clusterID string, ocmClient ocm.Client, remediationInstanceId string) error { 100 | backplaneURL := os.Getenv("BACKPLANE_URL") 101 | if backplaneURL == "" { 102 | return fmt.Errorf("could not clean up k8sclient: missing environment variable BACKPLANE_URL") 103 | } 104 | 105 | return bpremediation.DeleteRemediationWithConn( 106 | config.BackplaneConfiguration{URL: backplaneURL}, 107 | ocmClient.GetConnection(), 108 | clusterID, 109 | remediationInstanceId, 110 | ) 111 | } 112 | -------------------------------------------------------------------------------- /pkg/k8s/errors.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | ) 7 | 8 | var ErrAPIServerUnavailable = errors.New("kubernetes API server unavailable") 9 | 10 | // isAPIServerUnavailable detects common symptoms of an unreachable API server. 
11 | func isAPIServerUnavailable(err error) bool { 12 | errStr := err.Error() 13 | return strings.Contains(errStr, "The cluster could be down or under heavy load") 14 | } 15 | -------------------------------------------------------------------------------- /pkg/k8s/errors_test.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | ) 7 | 8 | func TestIsAPIServerUnavailable(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | err error 12 | expected bool 13 | }{ 14 | { 15 | name: "Cluster down message present", 16 | err: errors.New(`Error: Internal error occurred: failed calling webhook "namespace.operator.tekton.dev": failed to call webhook: Post "https://tekton-operator-proxy-webhook.openshift-pipelines.svc:443/namespace-validation?timeout=10s": context deadline exceeded 17 | The cluster could be down or under heavy load 18 | `), 19 | expected: true, 20 | }, 21 | { 22 | name: "Unrelated error message", 23 | err: errors.New("some other error occurred"), 24 | expected: false, 25 | }, 26 | } 27 | 28 | for _, tt := range tests { 29 | t.Run(tt.name, func(t *testing.T) { 30 | if tt.err == nil && isAPIServerUnavailable(tt.err) { 31 | t.Errorf("Expected false for nil error, but got true") 32 | } else if tt.err != nil && isAPIServerUnavailable(tt.err) != tt.expected { 33 | t.Errorf("For test '%s', expected %v, got %v", tt.name, tt.expected, !tt.expected) 34 | } 35 | }) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pkg/k8s/scheme.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "fmt" 5 | 6 | configv1 "github.com/openshift/api/config/v1" 7 | machinev1beta1 "github.com/openshift/api/machine/v1beta1" 8 | corev1 "k8s.io/api/core/v1" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | ) 11 | 12 | // initScheme initializes the runtime scheme with 
required APIs. 13 | func initScheme() (*runtime.Scheme, error) { 14 | scheme := runtime.NewScheme() 15 | 16 | if err := corev1.AddToScheme(scheme); err != nil { 17 | return nil, fmt.Errorf("unable to add corev1 scheme: %w", err) 18 | } 19 | 20 | if err := configv1.Install(scheme); err != nil { 21 | return nil, fmt.Errorf("unable to add config.openshift.io/v1 scheme: %w", err) 22 | } 23 | 24 | if err := machinev1beta1.AddToScheme(scheme); err != nil { 25 | return nil, fmt.Errorf("unable to add machine.openshift.io/v1beta1 scheme: %w", err) 26 | } 27 | 28 | return scheme, nil 29 | } 30 | -------------------------------------------------------------------------------- /pkg/logging/logging.go: -------------------------------------------------------------------------------- 1 | // Package logging wraps the zap logging package to provide easier access and initialization of the logger 2 | package logging 3 | 4 | import ( 5 | "fmt" 6 | "log" 7 | "os" 8 | 9 | "go.uber.org/zap" 10 | "go.uber.org/zap/zapcore" 11 | ) 12 | 13 | var LogLevelString = getLogLevel() 14 | 15 | // RawLogger is the raw global logger object used for calls wrapped by the logging package 16 | var RawLogger = InitLogger(LogLevelString, "") 17 | 18 | // InitLogger initializes a cluster-id specific child logger 19 | func InitLogger(logLevelString string, clusterID string) *zap.SugaredLogger { 20 | logLevel, err := zap.ParseAtomicLevel(logLevelString) 21 | if err != nil { 22 | log.Fatalln("Invalid log level:", logLevelString) 23 | } 24 | 25 | pipelineName := os.Getenv("PIPELINE_NAME") 26 | if pipelineName == "" { 27 | fmt.Println("Warning: Unable to retrieve the pipeline ID on logger creation. 
Continuing with empty value.") 28 | } 29 | 30 | config := zap.NewProductionConfig() 31 | config.EncoderConfig.TimeKey = "timestamp" 32 | config.Level = logLevel 33 | config.EncoderConfig.EncodeTime = zapcore.RFC3339TimeEncoder 34 | config.EncoderConfig.StacktraceKey = "" // to hide stacktrace info 35 | config.EncoderConfig.CallerKey = "caller" 36 | 37 | logger, err := config.Build() 38 | if err != nil { 39 | log.Fatal(err) 40 | } 41 | 42 | logger = logger.With(zap.Field{Key: "cluster_id", Type: zapcore.StringType, String: clusterID}, 43 | zap.Field{Key: "pipeline_name", Type: zapcore.StringType, String: pipelineName}) 44 | 45 | return logger.Sugar() 46 | } 47 | 48 | // Info wraps zap's SugaredLogger.Info() 49 | func Info(args ...interface{}) { 50 | RawLogger.Info(args...) 51 | } 52 | 53 | // Debug wraps zap's SugaredLogger.Debug() 54 | func Debug(args ...interface{}) { 55 | RawLogger.Debug(args...) 56 | } 57 | 58 | // Warn wraps zap's SugaredLogger.Warn() 59 | func Warn(args ...interface{}) { 60 | RawLogger.Warn(args...) 61 | } 62 | 63 | // Error wraps zap's SugaredLogger.Error() 64 | func Error(args ...interface{}) { 65 | RawLogger.Error(args...) 66 | } 67 | 68 | // Fatal wraps zap's SugaredLogger.Fatal() 69 | func Fatal(args ...interface{}) { 70 | RawLogger.Fatal(args...) 71 | } 72 | 73 | // Infof wraps zap's SugaredLogger.Infof() 74 | func Infof(template string, args ...interface{}) { 75 | RawLogger.Infof(template, args...) 76 | } 77 | 78 | // Debugf wraps zap's SugaredLogger.Debugf() 79 | func Debugf(template string, args ...interface{}) { 80 | RawLogger.Debugf(template, args...) 81 | } 82 | 83 | // Warnf wraps zap's SugaredLogger.Warnf() 84 | func Warnf(template string, args ...interface{}) { 85 | RawLogger.Warnf(template, args...) 86 | } 87 | 88 | // Errorf wraps zap's SugaredLogger.Errorf() 89 | func Errorf(template string, args ...interface{}) { 90 | RawLogger.Errorf(template, args...) 
91 | } 92 | 93 | // Fatalf wraps zap's SugaredLogger.Fatalf() 94 | func Fatalf(template string, args ...interface{}) { 95 | RawLogger.Fatalf(template, args...) 96 | } 97 | 98 | // getLogLevel returns the log level from the environment variable LOG_LEVEL 99 | func getLogLevel() string { 100 | if envLogLevel, exists := os.LookupEnv("LOG_LEVEL"); exists { 101 | return envLogLevel 102 | } 103 | return "info" 104 | } 105 | -------------------------------------------------------------------------------- /pkg/managedcloud/managedcloud.go: -------------------------------------------------------------------------------- 1 | // Package managedcloud contains functionality to access cloud environments of managed clusters 2 | package managedcloud 3 | 4 | import ( 5 | "fmt" 6 | "net/http" 7 | "net/url" 8 | "os" 9 | 10 | cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 11 | bpcloud "github.com/openshift/backplane-cli/cmd/ocm-backplane/cloud" 12 | "github.com/openshift/backplane-cli/pkg/cli/config" 13 | "github.com/openshift/configuration-anomaly-detection/pkg/aws" 14 | ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 15 | ) 16 | 17 | // CreateCustomerAWSClient creates an aws.SdkClient to a cluster's AWS account 18 | func CreateCustomerAWSClient(cluster *cmv1.Cluster, ocmClient ocm.Client) (*aws.SdkClient, error) { 19 | backplaneURL := os.Getenv("BACKPLANE_URL") 20 | if backplaneURL == "" { 21 | return nil, fmt.Errorf("could not create new aws client: missing environment variable BACKPLANE_URL") 22 | } 23 | 24 | backplaneInitialARN := os.Getenv("BACKPLANE_INITIAL_ARN") 25 | if backplaneInitialARN == "" { 26 | return nil, fmt.Errorf("missing environment variable BACKPLANE_INITIAL_ARN") 27 | } 28 | 29 | backplaneProxy := os.Getenv("BACKPLANE_PROXY") 30 | 31 | queryConfig := &bpcloud.QueryConfig{OcmConnection: ocmClient.GetConnection(), BackplaneConfiguration: config.BackplaneConfiguration{URL: backplaneURL, AssumeInitialArn: backplaneInitialARN}, 
Cluster: cluster} 32 | if backplaneProxy != "" { 33 | queryConfig.ProxyURL = &backplaneProxy 34 | } 35 | 36 | config, err := queryConfig.GetAWSV2Config() 37 | if err != nil { 38 | return nil, fmt.Errorf("unable to query aws credentials from backplane: %w", err) 39 | } 40 | 41 | awsProxy := os.Getenv("AWS_PROXY") 42 | if awsProxy != "" { 43 | config.HTTPClient = &http.Client{ 44 | Transport: &http.Transport{ 45 | Proxy: func(*http.Request) (*url.URL, error) { 46 | return url.Parse(awsProxy) 47 | }, 48 | }, 49 | } 50 | } 51 | 52 | return aws.NewClient(config) 53 | } 54 | -------------------------------------------------------------------------------- /pkg/metrics/README.md: -------------------------------------------------------------------------------- 1 | # Metrics 2 | 3 | This package provides metric instrumentation. 4 | 5 | You can test metrics locally by spawning a aggregation pushgateway container and pushing metrics there. 6 | 7 | ```bash 8 | # Spawn local gateway 9 | podman run --name cad-pushgw -e PAG_APILISTEN=:9091 -e PAG_LIFECYCLELISTEN=:9092 -p 9091:9091 -p 9092:9092 -d ghcr.io/zapier/prom-aggregation-gateway:v0.7.0 10 | # Verify you can reach the gateway (expect empty answer until you pushed metrics) 11 | curl http://localhost:9091/metrics 12 | # Point cad to the gateway 13 | export CAD_PROMETHEUS_PUSHGATEWAY="localhost:9091" 14 | # Run cad locally (it is not relevant for cad to succeed to test the metrics) 15 | ./cadctl investigate --payload-path payload.json 16 | # Verify your metrics got pushed and are available on the gateway 17 | curl http://localhost:9091/metrics 18 | ``` 19 | -------------------------------------------------------------------------------- /pkg/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | // Package metrics provides prometheus instrumentation for CAD 2 | package metrics 3 | 4 | import ( 5 | "os" 6 | 7 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 8 
| "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/push" 10 | "github.com/prometheus/common/expfmt" 11 | ) 12 | 13 | // Push collects and pushes metrics to the configured pushgateway 14 | func Push() { 15 | var promPusher *push.Pusher 16 | if pushgateway := os.Getenv("CAD_PROMETHEUS_PUSHGATEWAY"); pushgateway != "" { 17 | promPusher = push.New(pushgateway, "cad").Format(expfmt.NewFormat(expfmt.TypeTextPlain)) 18 | promPusher.Collector(Alerts) 19 | promPusher.Collector(LimitedSupportSet) 20 | promPusher.Collector(ServicelogPrepared) 21 | promPusher.Collector(ServicelogSent) 22 | err := promPusher.Add() 23 | if err != nil { 24 | logging.Errorf("failed to push metrics: %v", err) // %v, not %w: zap's Errorf is Sprintf-style; %w is only valid in fmt.Errorf 25 | } 26 | } else { 27 | logging.Warn("metrics disabled, set env 'CAD_PROMETHEUS_PUSHGATEWAY' to push metrics") 28 | } 29 | } 30 | 31 | // Inc takes a counterVec and a set of label values and increases by one 32 | func Inc(counterVec *prometheus.CounterVec, lsv ...string) { 33 | metric, err := counterVec.GetMetricWithLabelValues(lsv...) 
34 | if err != nil { 35 | logging.Error(err) 36 | } 37 | metric.Inc() 38 | } 39 | 40 | const ( 41 | namespace = "cad" 42 | subsystemInvestigate = "investigate" 43 | alertTypeLabel = "alert_type" 44 | lsSummaryLabel = "ls_summary" 45 | ) 46 | 47 | var ( 48 | // Alerts is a metric counting all alerts CAD received 49 | Alerts = prometheus.NewCounterVec( 50 | prometheus.CounterOpts{ 51 | Namespace: namespace, Subsystem: subsystemInvestigate, 52 | Name: "alerts_total", 53 | Help: "counts investigated alerts by alert and event type", 54 | }, []string{alertTypeLabel}) 55 | // LimitedSupportSet is a counter for limited support reasons set by cad 56 | LimitedSupportSet = prometheus.NewCounterVec( 57 | prometheus.CounterOpts{ 58 | Namespace: namespace, Subsystem: subsystemInvestigate, 59 | Name: "limitedsupport_set_total", 60 | Help: "counts investigations resulting in setting a limited support reason", 61 | }, []string{alertTypeLabel, lsSummaryLabel}) 62 | // ServicelogPrepared is a counter for investigation ending in a prepared servicelog 63 | ServicelogPrepared = prometheus.NewCounterVec( 64 | prometheus.CounterOpts{ 65 | Namespace: namespace, Subsystem: subsystemInvestigate, 66 | Name: "servicelog_prepared_total", 67 | Help: "counts investigations resulting in a prepared servicelog attached to the incident notes", 68 | }, []string{alertTypeLabel}) 69 | // ServicelogSent is a counter for investigation ending in a sent servicelog 70 | ServicelogSent = prometheus.NewCounterVec( 71 | prometheus.CounterOpts{ 72 | Namespace: namespace, Subsystem: subsystemInvestigate, 73 | Name: "servicelog_sent_total", 74 | Help: "counts investigations resulting in a sent servicelog", 75 | }, []string{alertTypeLabel}) 76 | ) 77 | -------------------------------------------------------------------------------- /pkg/networkverifier/networkverifier_suite_test.go: -------------------------------------------------------------------------------- 1 | package networkverifier_test 2 | 3 | import ( 4 | 
"testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestPagerduty(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Network verifier Suite") 13 | } 14 | -------------------------------------------------------------------------------- /pkg/networkverifier/networkverifier_test.go: -------------------------------------------------------------------------------- 1 | package networkverifier_test 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | . "github.com/onsi/ginkgo/v2" 8 | . "github.com/onsi/gomega" 9 | v1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 10 | awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock" 11 | "github.com/openshift/configuration-anomaly-detection/pkg/networkverifier" 12 | hivev1 "github.com/openshift/hive/apis/hive/v1" 13 | "go.uber.org/mock/gomock" 14 | ) 15 | 16 | var _ = Describe("RunVerifier", func() { 17 | Describe("AreAllInstancesRunning", func() { 18 | var ( 19 | mockCtrl *gomock.Controller 20 | clusterBuilder *v1.ClusterBuilder 21 | clusterDeployment *hivev1.ClusterDeployment 22 | awsCli *awsmock.MockClient 23 | ) 24 | BeforeEach(func() { 25 | mockCtrl = gomock.NewController(GinkgoT()) 26 | 27 | awsCli = awsmock.NewMockClient(mockCtrl) 28 | 29 | region := v1.NewCloudRegion().ID("us-east-1") 30 | 31 | clusterBuilder = v1.NewCluster().ID("12345").Nodes(v1.NewClusterNodes().Total(1)).Region(region) 32 | 33 | clusterDeployment = &hivev1.ClusterDeployment{ 34 | Spec: hivev1.ClusterDeploymentSpec{ 35 | ClusterMetadata: &hivev1.ClusterMetadata{ 36 | InfraID: "infra_id", 37 | }, 38 | }, 39 | } 40 | }) 41 | AfterEach(func() { 42 | mockCtrl.Finish() 43 | }) 44 | // This test is pretty useless but illustrates what tests for networkverifier should look like 45 | When("Getting security group ids", func() { 46 | It("Should return the error failed to get SecurityGroupId", func() { 47 | // Finish setup 48 | cluster, err := clusterBuilder.Build() 49 | 50 | 
Expect(err).ToNot(HaveOccurred()) 51 | 52 | // Arrange 53 | expectedError := errors.New("failed to get SecurityGroupId: errormessage") 54 | 55 | awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("", expectedError) 56 | 57 | // Act 58 | result, failures, gotErr := networkverifier.Run(cluster, clusterDeployment, awsCli) 59 | fmt.Printf("result %v, failures %v", result, failures) 60 | 61 | // Assert 62 | Expect(gotErr).To(HaveOccurred()) 63 | Expect(gotErr.Error()).To(ContainSubstring(expectedError.Error())) 64 | }) 65 | }) 66 | 67 | When("Checking input passed to ONV", func() { 68 | It("Should forward the cluster KMS key", func() { 69 | // Finish setup 70 | kmsKey := "some-KMS-key-ARN" 71 | clusterBuilder.AWS(v1.NewAWS().KMSKeyArn(kmsKey)) 72 | 73 | cluster, err := clusterBuilder.Build() 74 | 75 | Expect(err).ToNot(HaveOccurred()) 76 | 77 | // Arrange 78 | awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return(gomock.Any().String(), nil) 79 | awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"string1", "string2"}, nil) 80 | 81 | // Act 82 | input, gotErr := networkverifier.InitializeValidateEgressInput(cluster, clusterDeployment, awsCli) 83 | fmt.Printf("input %v", input) 84 | 85 | // Assert 86 | Expect(gotErr).ToNot(HaveOccurred()) 87 | Expect(input.AWS.KmsKeyID).To(BeIdenticalTo(kmsKey)) 88 | }) 89 | }) 90 | }) 91 | }) 92 | -------------------------------------------------------------------------------- /pkg/notewriter/notewriter.go: -------------------------------------------------------------------------------- 1 | package notewriter 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "go.uber.org/zap" 8 | ) 9 | 10 | type NoteWriter struct { 11 | investigationName string 12 | sb strings.Builder 13 | logger *zap.SugaredLogger 14 | } 15 | 16 | // New initializes a new NoteWriter with an optional logger. 
// The note is initialized with an investigation header in the following format:
// 🤖 Automated %s pre-investigation 🤖
// ===========================
//
// E.g.
// 🤖 Automated CHGM pre-investigation 🤖
// ===========================
//
// A nil logger is valid: the note is still built, only the info-level logging
// of each appended line is skipped (see writeWithLog).
func New(investigationName string, logger *zap.SugaredLogger) *NoteWriter {
	nw := &NoteWriter{investigationName, strings.Builder{}, logger}
	nw.sb.WriteString(fmt.Sprintf("🤖 Automated %s pre-investigation 🤖\n", investigationName))
	nw.sb.WriteString("===========================\n")
	return nw
}

// String returns the current full string format of the built note,
// i.e. the header written by New plus every line appended so far.
func (n *NoteWriter) String() string {
	return n.sb.String()
}

// writeWithLog appends the formatted message to the note and, when a logger
// was supplied to New, also logs the same message at info level.
func (n *NoteWriter) writeWithLog(format string, a ...any) {
	if n.logger != nil {
		n.logger.Infof(format, a...)
	}

	n.sb.WriteString(fmt.Sprintf(format, a...))
}

// AppendSuccess should be used when a CAD check succeeded, e.g.
// ✅ Network Verifier Passed
// Format appended to the note:
// ✅ <formatted message>\n
func (n *NoteWriter) AppendSuccess(format string, a ...any) {
	n.writeWithLog("✅ %s\n", fmt.Sprintf(format, a...))
}

// AppendWarning should be used when a CAD check showed an issue, e.g.
// ⚠️ Network Verifier Failed with the following errors: error1, error2, error3
// Format appended to the note:
// ⚠️ <formatted message>\n
func (n *NoteWriter) AppendWarning(format string, a ...any) {
	n.writeWithLog("⚠️ %s\n", fmt.Sprintf(format, a...))
}

// AppendAutomation should be used to indicate CAD took an automated action, e.g.
61 | // 🤖 Sent service log: "This is the service log message" 62 | // Format appended to the note: 63 | // 🤖 \n 64 | func (n *NoteWriter) AppendAutomation(format string, a ...any) { 65 | n.writeWithLog("🤖 %s\n", fmt.Sprintf(format, a...)) 66 | } 67 | -------------------------------------------------------------------------------- /pkg/notewriter/notewriter_test.go: -------------------------------------------------------------------------------- 1 | package notewriter 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | testInvestigationName = "CHGM" 9 | 10 | expectedOutput = `🤖 Automated CHGM pre-investigation 🤖 11 | =========================== 12 | ✅ Network Verifier Succeeded: 123 13 | ⚠️ Network Verifier Failed: 123 14 | 🤖 Sent servicelog for network misconfiguration: 123 15 | ` 16 | ) 17 | 18 | func TestNoteWriter(t *testing.T) { 19 | notesWriter := New(testInvestigationName, nil) 20 | notesWriter.AppendSuccess("Network Verifier Succeeded: 123") 21 | notesWriter.AppendWarning("Network Verifier Failed: 123") 22 | notesWriter.AppendAutomation("Sent servicelog for network misconfiguration: 123") 23 | 24 | res := notesWriter.String() 25 | 26 | if res != expectedOutput { 27 | t.Fatalf("NoteWriter output does not match expected test output.\n NoteWriter output:\n%s\n\n Expected output:\n%s", res, expectedOutput) 28 | } 29 | } 30 | 31 | func TestNoteWriterFormat(t *testing.T) { 32 | notesWriter := New(testInvestigationName, nil) 33 | notesWriter.AppendSuccess("Network Verifier Succeeded: %s", "123") 34 | notesWriter.AppendWarning("Network Verifier Failed: %s", "123") 35 | notesWriter.AppendAutomation("Sent servicelog for network misconfiguration: %s", "123") 36 | 37 | res := notesWriter.String() 38 | 39 | if res != expectedOutput { 40 | t.Fatalf("NoteWriter output does not match expected test output.\n NoteWriter output:\n%s\n\n Expected output:\n%s", res, expectedOutput) 41 | } 42 | } 43 | 
-------------------------------------------------------------------------------- /pkg/ocm/ocm_config.go: -------------------------------------------------------------------------------- 1 | package ocm 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | 9 | sdk "github.com/openshift-online/ocm-sdk-go" 10 | ) 11 | 12 | // Config is the type used to store the configuration of the client. 13 | // There's no way to line-split or predefine tags, so... 14 | // 15 | //nolint:lll 16 | type Config struct { 17 | // TODO(efried): Better docs for things like AccessToken 18 | // TODO(efried): Dedup with flag docs in cmd/ocm/login/cmd.go:init where possible 19 | AccessToken string `json:"access_token,omitempty" doc:"Bearer access token."` 20 | ClientID string `json:"client_id,omitempty" doc:"OpenID client identifier."` 21 | ClientSecret string `json:"client_secret,omitempty" doc:"OpenID client secret."` 22 | Insecure bool `json:"insecure,omitempty" doc:"Enables insecure communication with the server. This disables verification of TLS certificates and host names."` 23 | Password string `json:"password,omitempty" doc:"User password."` 24 | RefreshToken string `json:"refresh_token,omitempty" doc:"Offline or refresh token."` 25 | Scopes []string `json:"scopes,omitempty" doc:"OpenID scope. If this option is used it will replace completely the default scopes. Can be repeated multiple times to specify multiple scopes."` 26 | TokenURL string `json:"token_url,omitempty" doc:"OpenID token URL."` 27 | URL string `json:"url,omitempty" doc:"URL of the API gateway. The value can be the complete URL or an alias. The valid aliases are 'production', 'staging' and 'integration'."` 28 | User string `json:"user,omitempty" doc:"User name."` 29 | Pager string `json:"pager,omitempty" doc:"Pager command, for example 'less'. If empty no pager will be used."` 30 | } 31 | 32 | // Load loads the configuration from the configuration file. 
If the configuration file doesn't exist
// it will return an empty configuration object.
func Load() (*Config, error) {
	file, err := Location()
	if err != nil {
		return nil, err
	}
	_, err = os.Stat(file)
	if os.IsNotExist(err) {
		// Missing file is not an error per the documented contract above:
		// return an empty configuration and a nil error. (The previous naked
		// return leaked the non-nil os.Stat not-exist error to callers.)
		return &Config{}, nil
	}
	if err != nil {
		return nil, fmt.Errorf("can't check if config file '%s' exists: %w", file, err)
	}
	// #nosec G304
	data, err := os.ReadFile(file)
	if err != nil {
		return nil, fmt.Errorf("can't read config file '%s': %w", file, err)
	}
	cfg := &Config{}
	// An empty file is treated as an empty configuration, not a parse error.
	if len(data) == 0 {
		return cfg, nil
	}
	if err := json.Unmarshal(data, cfg); err != nil {
		return nil, fmt.Errorf("can't parse config file '%s': %w", file, err)
	}
	return cfg, nil
}

// Location returns the location of the configuration file. If a configuration file
// already exists in the HOME directory, it uses that, otherwise it prefers to
// use the XDG config directory. The OCM_CONFIG environment variable overrides both.
func Location() (string, error) {
	if ocmconfig := os.Getenv("OCM_CONFIG"); ocmconfig != "" {
		return ocmconfig, nil
	}

	// Determine home directory to use for the legacy file path
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	path := filepath.Join(home, ".ocm.json")

	if _, err := os.Stat(path); os.IsNotExist(err) {
		// Legacy file absent: fall back to the standard config directory
		configDir, err := os.UserConfigDir()
		if err != nil {
			return path, err
		}

		// Use standard config directory
		path = filepath.Join(configDir, "/ocm/ocm.json")
	}

	return path, nil
}

// Connection creates a connection using this configuration.
98 | func (c *Config) Connection() (connection *sdk.Connection, err error) { 99 | // Prepare the builder for the connection adding only the properties that have explicit 100 | // values in the configuration, so that default values won't be overridden: 101 | builder := sdk.NewConnectionBuilder() 102 | if c.TokenURL != "" { 103 | builder.TokenURL(c.TokenURL) 104 | } 105 | if c.ClientID != "" || c.ClientSecret != "" { 106 | builder.Client(c.ClientID, c.ClientSecret) 107 | } 108 | if c.Scopes != nil { 109 | builder.Scopes(c.Scopes...) 110 | } 111 | if c.URL != "" { 112 | builder.URL(c.URL) 113 | } 114 | if c.User != "" || c.Password != "" { 115 | builder.User(c.User, c.Password) 116 | } 117 | tokens := make([]string, 0, 2) 118 | if c.AccessToken != "" { 119 | tokens = append(tokens, c.AccessToken) 120 | } 121 | if c.RefreshToken != "" { 122 | tokens = append(tokens, c.RefreshToken) 123 | } 124 | if len(tokens) > 0 { 125 | builder.Tokens(tokens...) 126 | } 127 | builder.Insecure(c.Insecure) 128 | 129 | // Create the connection: 130 | connection, err = builder.Build() 131 | if err != nil { 132 | return 133 | } 134 | 135 | return 136 | } 137 | -------------------------------------------------------------------------------- /pkg/pagerduty/errors.go: -------------------------------------------------------------------------------- 1 | package pagerduty 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | // InvalidTokenError wraps the PagerDuty token invalid error 9 | type InvalidTokenError struct { 10 | Err error 11 | } 12 | 13 | // Error prints the wrapped error and the original one 14 | func (i InvalidTokenError) Error() string { 15 | err := fmt.Errorf("the authToken that was provided is invalid: %w", i.Err) 16 | return err.Error() 17 | } 18 | 19 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 20 | func (InvalidTokenError) Is(target error) bool { 21 | return errors.Is(target, InvalidTokenError{}) 22 | } 23 | 
24 | // InvalidInputParamsError wraps the PagerDuty Invalid parameters error 25 | // TODO: the API also returns any other error in here, if this persists, think on renaming to "ClientMisconfiguration" 26 | type InvalidInputParamsError struct { 27 | Err error 28 | } 29 | 30 | // Error prints the wrapped error and the original one 31 | func (i InvalidInputParamsError) Error() string { 32 | err := fmt.Errorf("the escalation policy or incident id are invalid: %w", i.Err) 33 | return err.Error() 34 | } 35 | 36 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 37 | func (InvalidInputParamsError) Is(target error) bool { 38 | return errors.Is(target, InvalidInputParamsError{}) 39 | } 40 | 41 | // IncidentNotFoundError wraps the PagerDuty not found error while adding notes to an incident 42 | type IncidentNotFoundError struct { 43 | Err error 44 | } 45 | 46 | // Error prints the wrapped error and the original one 47 | func (i IncidentNotFoundError) Error() string { 48 | err := fmt.Errorf("the given incident was not found: %w", i.Err) 49 | return err.Error() 50 | } 51 | 52 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 53 | func (IncidentNotFoundError) Is(target error) bool { 54 | return errors.Is(target, IncidentNotFoundError{}) 55 | } 56 | 57 | // ServiceNotFoundError wraps the errors returned when PagerDuty services cannot be retrieved 58 | type ServiceNotFoundError struct { 59 | Err error 60 | } 61 | 62 | // Error prints the wrapped and original error 63 | func (s ServiceNotFoundError) Error() string { 64 | err := fmt.Errorf("the given service was not found: %w", s.Err) 65 | return err.Error() 66 | } 67 | 68 | // Is indicates whether the supplied error is a ServiceNotFoundError 69 | func (ServiceNotFoundError) Is(target error) bool { 70 | return errors.Is(target, ServiceNotFoundError{}) 71 | } 72 | 73 | // IntegrationNotFoundError wraps the 
errors returned when a PagerDuty service's integration cannot be found 74 | type IntegrationNotFoundError struct { 75 | Err error 76 | } 77 | 78 | // Error prints the wrapped and original error 79 | func (i IntegrationNotFoundError) Error() string { 80 | err := fmt.Errorf("the given integration was not found: %w", i.Err) 81 | return err.Error() 82 | } 83 | 84 | // Is indicates whether the supplied error is an IntegrationNotFoundError 85 | func (IntegrationNotFoundError) Is(target error) bool { 86 | return errors.Is(target, IntegrationNotFoundError{}) 87 | } 88 | 89 | // CreateEventError wraps the errors returned when failing to create a PagerDuty event 90 | type CreateEventError struct { 91 | Err error 92 | } 93 | 94 | // Error prints the wrapped and original error 95 | func (c CreateEventError) Error() string { 96 | err := fmt.Errorf("failed to create event: %w", c.Err) 97 | return err.Error() 98 | } 99 | 100 | // Is indicates whether the supplied error is a CreateEventError 101 | func (CreateEventError) Is(target error) bool { 102 | return errors.Is(target, CreateEventError{}) 103 | } 104 | 105 | // FileNotFoundError wraps the filesystem NotFound Error 106 | type FileNotFoundError struct { 107 | Err error 108 | FilePath string 109 | } 110 | 111 | // Error prints the wrapped error and the original one 112 | func (f FileNotFoundError) Error() string { 113 | err := fmt.Errorf("the file '%s' was not found in the filesystem: %w", f.FilePath, f.Err) 114 | return err.Error() 115 | } 116 | 117 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 118 | func (f FileNotFoundError) Is(target error) bool { 119 | return errors.Is(target, FileNotFoundError{}) 120 | } 121 | 122 | // UnmarshalError wraps JSON's json.SyntaxError 123 | type UnmarshalError struct { 124 | Err error 125 | } 126 | 127 | // Error prints the wrapped error and the original one 128 | func (u UnmarshalError) Error() string { 129 | err := 
fmt.Errorf("could not unmarshal the payloadFile: %w", u.Err) 130 | return err.Error() 131 | } 132 | 133 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 134 | func (u UnmarshalError) Is(target error) bool { 135 | return errors.Is(target, UnmarshalError{}) 136 | } 137 | -------------------------------------------------------------------------------- /pkg/pagerduty/mock/pagerdutymock.go: -------------------------------------------------------------------------------- 1 | // Code generated by MockGen. DO NOT EDIT. 2 | // Source: pagerduty.go 3 | // 4 | // Generated by this command: 5 | // 6 | // mockgen --build_flags=--mod=readonly -source pagerduty.go -destination ./mock/pagerdutymock.go -package pdmock 7 | // 8 | 9 | // Package pdmock is a generated GoMock package. 10 | package pdmock 11 | 12 | import ( 13 | reflect "reflect" 14 | 15 | gomock "go.uber.org/mock/gomock" 16 | ) 17 | 18 | // MockClient is a mock of Client interface. 19 | type MockClient struct { 20 | ctrl *gomock.Controller 21 | recorder *MockClientMockRecorder 22 | isgomock struct{} 23 | } 24 | 25 | // MockClientMockRecorder is the mock recorder for MockClient. 26 | type MockClientMockRecorder struct { 27 | mock *MockClient 28 | } 29 | 30 | // NewMockClient creates a new mock instance. 31 | func NewMockClient(ctrl *gomock.Controller) *MockClient { 32 | mock := &MockClient{ctrl: ctrl} 33 | mock.recorder = &MockClientMockRecorder{mock} 34 | return mock 35 | } 36 | 37 | // EXPECT returns an object that allows the caller to indicate expected use. 38 | func (m *MockClient) EXPECT() *MockClientMockRecorder { 39 | return m.recorder 40 | } 41 | 42 | // AddNote mocks base method. 43 | func (m *MockClient) AddNote(notes string) error { 44 | m.ctrl.T.Helper() 45 | ret := m.ctrl.Call(m, "AddNote", notes) 46 | ret0, _ := ret[0].(error) 47 | return ret0 48 | } 49 | 50 | // AddNote indicates an expected call of AddNote. 
51 | func (mr *MockClientMockRecorder) AddNote(notes any) *gomock.Call { 52 | mr.mock.ctrl.T.Helper() 53 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddNote", reflect.TypeOf((*MockClient)(nil).AddNote), notes) 54 | } 55 | 56 | // EscalateIncident mocks base method. 57 | func (m *MockClient) EscalateIncident() error { 58 | m.ctrl.T.Helper() 59 | ret := m.ctrl.Call(m, "EscalateIncident") 60 | ret0, _ := ret[0].(error) 61 | return ret0 62 | } 63 | 64 | // EscalateIncident indicates an expected call of EscalateIncident. 65 | func (mr *MockClientMockRecorder) EscalateIncident() *gomock.Call { 66 | mr.mock.ctrl.T.Helper() 67 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EscalateIncident", reflect.TypeOf((*MockClient)(nil).EscalateIncident)) 68 | } 69 | 70 | // EscalateIncidentWithNote mocks base method. 71 | func (m *MockClient) EscalateIncidentWithNote(notes string) error { 72 | m.ctrl.T.Helper() 73 | ret := m.ctrl.Call(m, "EscalateIncidentWithNote", notes) 74 | ret0, _ := ret[0].(error) 75 | return ret0 76 | } 77 | 78 | // EscalateIncidentWithNote indicates an expected call of EscalateIncidentWithNote. 79 | func (mr *MockClientMockRecorder) EscalateIncidentWithNote(notes any) *gomock.Call { 80 | mr.mock.ctrl.T.Helper() 81 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EscalateIncidentWithNote", reflect.TypeOf((*MockClient)(nil).EscalateIncidentWithNote), notes) 82 | } 83 | 84 | // GetServiceID mocks base method. 85 | func (m *MockClient) GetServiceID() string { 86 | m.ctrl.T.Helper() 87 | ret := m.ctrl.Call(m, "GetServiceID") 88 | ret0, _ := ret[0].(string) 89 | return ret0 90 | } 91 | 92 | // GetServiceID indicates an expected call of GetServiceID. 93 | func (mr *MockClientMockRecorder) GetServiceID() *gomock.Call { 94 | mr.mock.ctrl.T.Helper() 95 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServiceID", reflect.TypeOf((*MockClient)(nil).GetServiceID)) 96 | } 97 | 98 | // SilenceIncident mocks base method. 
99 | func (m *MockClient) SilenceIncident() error { 100 | m.ctrl.T.Helper() 101 | ret := m.ctrl.Call(m, "SilenceIncident") 102 | ret0, _ := ret[0].(error) 103 | return ret0 104 | } 105 | 106 | // SilenceIncident indicates an expected call of SilenceIncident. 107 | func (mr *MockClientMockRecorder) SilenceIncident() *gomock.Call { 108 | mr.mock.ctrl.T.Helper() 109 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SilenceIncident", reflect.TypeOf((*MockClient)(nil).SilenceIncident)) 110 | } 111 | 112 | // SilenceIncidentWithNote mocks base method. 113 | func (m *MockClient) SilenceIncidentWithNote(notes string) error { 114 | m.ctrl.T.Helper() 115 | ret := m.ctrl.Call(m, "SilenceIncidentWithNote", notes) 116 | ret0, _ := ret[0].(error) 117 | return ret0 118 | } 119 | 120 | // SilenceIncidentWithNote indicates an expected call of SilenceIncidentWithNote. 121 | func (mr *MockClientMockRecorder) SilenceIncidentWithNote(notes any) *gomock.Call { 122 | mr.mock.ctrl.T.Helper() 123 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SilenceIncidentWithNote", reflect.TypeOf((*MockClient)(nil).SilenceIncidentWithNote), notes) 124 | } 125 | 126 | // UpdateIncidentTitle mocks base method. 127 | func (m *MockClient) UpdateIncidentTitle(title string) error { 128 | m.ctrl.T.Helper() 129 | ret := m.ctrl.Call(m, "UpdateIncidentTitle", title) 130 | ret0, _ := ret[0].(error) 131 | return ret0 132 | } 133 | 134 | // UpdateIncidentTitle indicates an expected call of UpdateIncidentTitle. 
135 | func (mr *MockClientMockRecorder) UpdateIncidentTitle(title any) *gomock.Call { 136 | mr.mock.ctrl.T.Helper() 137 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateIncidentTitle", reflect.TypeOf((*MockClient)(nil).UpdateIncidentTitle), title) 138 | } 139 | -------------------------------------------------------------------------------- /pkg/pagerduty/pagerduty_suite_test.go: -------------------------------------------------------------------------------- 1 | package pagerduty_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestPagerduty(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Pagerduty Suite") 13 | } 14 | -------------------------------------------------------------------------------- /pkg/pagerduty/types.go: -------------------------------------------------------------------------------- 1 | package pagerduty 2 | 3 | // AlertDetails exposes the required info we need from an alert 4 | type AlertDetails struct { 5 | ID string 6 | ClusterID string // This can be internal or external ID 7 | } 8 | 9 | // NewAlertCustomDetails is a format for the alert details shown in the pagerduty incident 10 | type NewAlertCustomDetails struct { 11 | ClusterID string `json:"Cluster ID"` 12 | Error string `json:"Error"` 13 | Resolution string `json:"Resolution"` 14 | SOP string `json:"SOP"` 15 | } 16 | 17 | // NewAlert is a type for alerts to create on pagerduty 18 | type NewAlert struct { 19 | // The alert description acts as a title for the resulting incident 20 | Description string 21 | Details NewAlertCustomDetails 22 | } 23 | -------------------------------------------------------------------------------- /pkg/utils/utils.go: -------------------------------------------------------------------------------- 1 | // Package utils contains utility functions 2 | package utils 3 | 4 | import ( 5 | "fmt" 6 | "time" 7 | 8 | 
"github.com/openshift/configuration-anomaly-detection/pkg/logging" 9 | ) 10 | 11 | // WithRetries runs a function with up to 10 retries on error 12 | func WithRetries(fn func() error) error { 13 | const defaultRetries = 10 14 | const defaultInitialBackoff = time.Second * 2 15 | 16 | return WithRetriesConfigurable(defaultRetries, defaultInitialBackoff, fn) 17 | } 18 | 19 | // WithRetriesConfigurable runs a function with a configurable retry count and backoff interval on error 20 | func WithRetriesConfigurable(count int, initialBackoff time.Duration, fn func() error) error { 21 | var err error 22 | for i := 0; i < count; i++ { 23 | if i > 0 { 24 | logging.Warnf("Retry %d: %s \n", i, err.Error()) 25 | time.Sleep(initialBackoff) 26 | initialBackoff *= 2 27 | } 28 | err = fn() 29 | if err == nil { 30 | return nil 31 | } 32 | } 33 | return fmt.Errorf("failed after %d retries: %w", count, err) 34 | } 35 | -------------------------------------------------------------------------------- /pkg/utils/utils_suite_test.go: -------------------------------------------------------------------------------- 1 | package utils_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestChgm(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "utils suite") 13 | } 14 | -------------------------------------------------------------------------------- /test/e2e/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.ci.openshift.org/openshift/release:rhel-8-release-golang-1.23-openshift-4.19 as builder 2 | WORKDIR /go/src/github.com/openshift/configuration-anomaly-detection 3 | COPY . . 
4 | RUN CGO_ENABLED=0 GOFLAGS="-mod=mod" go test ./test/e2e -v -c --tags=osde2e -o /e2e.test 5 | 6 | FROM registry.access.redhat.com/ubi8/ubi-minimal:latest 7 | COPY --from=builder ./e2e.test e2e.test 8 | ENTRYPOINT [ "/e2e.test" ] 9 | -------------------------------------------------------------------------------- /test/e2e/configuration_anomaly_detection_runner_test.go: -------------------------------------------------------------------------------- 1 | //go:build osde2e 2 | // +build osde2e 3 | 4 | package osde2etests 5 | 6 | import ( 7 | "os" 8 | "path/filepath" 9 | "testing" 10 | 11 | . "github.com/onsi/ginkgo/v2" 12 | . "github.com/onsi/gomega" 13 | ) 14 | 15 | const ( 16 | testResultsDirectory = "/test-run-results" 17 | jUnitOutputFilename = "junit-configuration-anomaly-detection.xml" 18 | ) 19 | 20 | // Test entrypoint. osde2e runs this as a test suite on test pod. 21 | func TestConfigurationAnomalyDetection(t *testing.T) { 22 | RegisterFailHandler(Fail) 23 | suiteConfig, reporterConfig := GinkgoConfiguration() 24 | if _, ok := os.LookupEnv("DISABLE_JUNIT_REPORT"); !ok { 25 | reporterConfig.JUnitReport = filepath.Join(testResultsDirectory, jUnitOutputFilename) 26 | } 27 | RunSpecs(t, "Configuration Anomaly Detection", suiteConfig, reporterConfig) 28 | } 29 | -------------------------------------------------------------------------------- /test/e2e/project.mk: -------------------------------------------------------------------------------- 1 | # Project specific values 2 | OPERATOR_NAME?=configuration-anomaly-detection 3 | 4 | E2E_SUITE_IMAGE_REGISTRY?=quay.io 5 | E2E_SUITE_IMAGE_REPOSITORY?=app-sre 6 | E2E_SUITE_IMAGE_NAME?=$(OPERATOR_NAME)-e2e 7 | 8 | REGISTRY_USER?=$(QUAY_USER) 9 | REGISTRY_TOKEN?=$(QUAY_TOKEN) 10 | 11 | ###################### 12 | # Targets used by e2e test suite 13 | ###################### 14 | 15 | # create binary 16 | .PHONY: e2e-suite-build 17 | e2e-suite-build: GOFLAGS_MOD=-mod=mod 18 | e2e-suite-build: GOENV=GOOS=${GOOS} 
GOARCH=${GOARCH} CGO_ENABLED=0 GOFLAGS="${GOFLAGS_MOD}" 19 | e2e-suite-build: 20 | go mod tidy 21 | ${GOENV} go test ./test/e2e -v -c --tags=osde2e -o e2e-suite.test 22 | 23 | # TODO: Push to a known image tag and commit id 24 | # push e2e suite image 25 | # Use current commit as e2e suite image tag 26 | CURRENT_COMMIT=$(shell git rev-parse --short=7 HEAD) 27 | E2E_SUITE_IMAGE_TAG=$(CURRENT_COMMIT) 28 | 29 | .PHONY: e2e-image-build-push 30 | e2e-image-build-push: 31 | ${CONTAINER_ENGINE} build --pull -f test/e2e/Dockerfile -t $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):$(E2E_SUITE_IMAGE_TAG) . 32 | ${CONTAINER_ENGINE} tag $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):$(E2E_SUITE_IMAGE_TAG) $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):latest 33 | ${CONTAINER_ENGINE} push $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):$(E2E_SUITE_IMAGE_TAG) 34 | ${CONTAINER_ENGINE} push $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):latest -------------------------------------------------------------------------------- /test/e2e/test-e2e-suite-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: osde2e-focused-tests 5 | 6 | parameters: 7 | - name: OSDE2E_CONFIGS 8 | required: true 9 | - name: TEST_E2E_SUITE_IMAGE 10 | required: true 11 | - name: OCM_TOKEN 12 | required: true 13 | - name: OCM_CCS 14 | required: false 15 | - name: AWS_ACCESS_KEY_ID 16 | required: false 17 | - name: AWS_SECRET_ACCESS_KEY 18 | required: false 19 | - name: CLOUD_PROVIDER_REGION 20 | required: false 21 | - name: GCP_CREDS_JSON 22 | required: false 23 | - name: JOBID 24 | generate: expression 25 | from: "[0-9a-z]{7}" 26 | - name: IMAGE_TAG 27 | value: '' 28 | required: true 29 | - name: LOG_BUCKET 30 | 
value: 'osde2e-logs' 31 | objects: 32 | - apiVersion: batch/v1 33 | kind: Job 34 | metadata: 35 | name: configuration-anomaly-detection-${IMAGE_TAG}-${JOBID} 36 | spec: 37 | backoffLimit: 0 38 | template: 39 | spec: 40 | restartPolicy: Never 41 | containers: 42 | - name: osde2e 43 | image: quay.io/redhat-services-prod/osde2e-cicada-tenant/osde2e:latest 44 | command: 45 | - /osde2e 46 | args: 47 | - test 48 | - --configs 49 | - ${OSDE2E_CONFIGS} 50 | securityContext: 51 | runAsNonRoot: true 52 | allowPrivilegeEscalation: false 53 | capabilities: 54 | drop: ["ALL"] 55 | seccompProfile: 56 | type: RuntimeDefault 57 | env: 58 | - name: TEST_HARNESSES 59 | value: ${TEST_E2E_SUITE_IMAGE}:${IMAGE_TAG} 60 | - name: OCM_TOKEN 61 | value: ${OCM_TOKEN} 62 | - name: OCM_CCS 63 | value: ${OCM_CCS} 64 | - name: AWS_ACCESS_KEY_ID 65 | value: ${AWS_ACCESS_KEY_ID} 66 | - name: AWS_SECRET_ACCESS_KEY 67 | value: ${AWS_SECRET_ACCESS_KEY} 68 | - name: CLOUD_PROVIDER_REGION 69 | value: ${CLOUD_PROVIDER_REGION} 70 | - name: GCP_CREDS_JSON 71 | value: ${GCP_CREDS_JSON} 72 | - name: LOG_BUCKET 73 | value: ${LOG_BUCKET} -------------------------------------------------------------------------------- /test/e2e/utils/aws.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/aws/aws-sdk-go-v2/service/ec2" 8 | "github.com/aws/aws-sdk-go-v2/service/ec2/types" 9 | ) 10 | 11 | // EC2API interface to make testing easier 12 | type EC2API interface { 13 | RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) 14 | AuthorizeSecurityGroupEgress(ctx context.Context, params *ec2.AuthorizeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupEgressOutput, error) 15 | } 16 | 17 | // EC2ClientWrapper wraps the AWS SDK EC2 client to implement our EC2API 
interface, so tests can substitute a fake client.
type EC2ClientWrapper struct {
	Client *ec2.Client
}

// RevokeSecurityGroupEgress implements EC2API by delegating to the wrapped SDK client.
func (w *EC2ClientWrapper) RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) {
	return w.Client.RevokeSecurityGroupEgress(ctx, params, optFns...)
}

// AuthorizeSecurityGroupEgress implements EC2API by delegating to the wrapped SDK client.
func (w *EC2ClientWrapper) AuthorizeSecurityGroupEgress(ctx context.Context, params *ec2.AuthorizeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupEgressOutput, error) {
	return w.Client.AuthorizeSecurityGroupEgress(ctx, params, optFns...)
}

// NewEC2ClientWrapper creates a new EC2ClientWrapper that implements EC2API.
func NewEC2ClientWrapper(client *ec2.Client) *EC2ClientWrapper {
	return &EC2ClientWrapper{Client: client}
}

// allTrafficPermission builds the IP-permission set matching every protocol
// (-1) and every IPv4 destination (0.0.0.0/0). Shared by BlockEgress and
// RestoreEgress so both always operate on the same rule.
func allTrafficPermission() []types.IpPermission {
	return []types.IpPermission{
		{
			IpProtocol: awsString("-1"), // -1 = all protocols
			IpRanges: []types.IpRange{
				{CidrIp: awsString("0.0.0.0/0")},
			},
		},
	}
}

// BlockEgress revokes all outbound traffic from the security group
func BlockEgress(ctx context.Context, ec2Client EC2API, securityGroupID string) error {
	req := &ec2.RevokeSecurityGroupEgressInput{
		GroupId:       &securityGroupID,
		IpPermissions: allTrafficPermission(),
	}
	if _, err := ec2Client.RevokeSecurityGroupEgress(ctx, req); err != nil {
		return fmt.Errorf("failed to revoke egress: %w", err)
	}
	return nil
}

// RestoreEgress allows all outbound traffic from the security group
func RestoreEgress(ctx context.Context, ec2Client EC2API, securityGroupID string) error {
	req := &ec2.AuthorizeSecurityGroupEgressInput{
		GroupId:       &securityGroupID,
		IpPermissions: allTrafficPermission(),
	}
	if _, err := ec2Client.AuthorizeSecurityGroupEgress(ctx, req); err != nil {
		return fmt.Errorf("failed to restore egress: %w", err)
	}
	return nil
}

// awsString helper function to convert a string to a pointer
func awsString(value string) *string {
	return &value
}
// Alert names understood by GetAlertTitle. Keep this list in sync with the
// alert_mapping table in test/generate_incident.sh.
const (
	AlertClusterHasGoneMissing                         = "ClusterHasGoneMissing"
	AlertClusterProvisioningDelay                      = "ClusterProvisioningDelay"
	AlertClusterMonitoringErrorBudgetBurnSRE           = "ClusterMonitoringErrorBudgetBurnSRE"
	AlertInsightsOperatorDown                          = "InsightsOperatorDown"
	AlertMachineHealthCheckUnterminatedShortCircuitSRE = "MachineHealthCheckUnterminatedShortCircuitSRE"
	AlertApiErrorBudgetBurn                            = "ApiErrorBudgetBurn"
	// Added for parity with test/generate_incident.sh, which already supports
	// these two alerts but the Go helper previously did not.
	AlertCannotRetrieveUpdatesSRE           = "CannotRetrieveUpdatesSRE"
	AlertUpgradeConfigSyncFailureOver4HrSRE = "UpgradeConfigSyncFailureOver4HrSRE"
)

// alertTitles maps an alert name to the incident title the e2e suite fires
// into PagerDuty for that alert. Titles mirror test/generate_incident.sh.
var alertTitles = map[string]string{
	AlertClusterHasGoneMissing:                         "cadtest has gone missing",
	AlertClusterProvisioningDelay:                      "ClusterProvisioningDelay -",
	AlertClusterMonitoringErrorBudgetBurnSRE:           "ClusterMonitoringErrorBudgetBurnSRE Critical (1)",
	AlertInsightsOperatorDown:                          "InsightsOperatorDown",
	AlertMachineHealthCheckUnterminatedShortCircuitSRE: "MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)",
	AlertApiErrorBudgetBurn:                            "api-ErrorBudgetBurn k8sgpt test CRITICAL (1)",
	AlertCannotRetrieveUpdatesSRE:                      "CannotRetrieveUpdatesSRE",
	AlertUpgradeConfigSyncFailureOver4HrSRE:            "UpgradeConfigSyncFailureOver4HrSRE Critical (1)",
}

// GetAlertTitle returns the incident title used when triggering the named
// alert, or an error when the alert name is unknown.
func GetAlertTitle(alertName string) (string, error) {
	if title, ok := alertTitles[alertName]; ok {
		return title, nil
	}
	return "", fmt.Errorf("unknown alert name: %s", alertName)
}
interface { 41 | TriggerIncident(alertName, clusterID string) (string, error) 42 | GetIncidentID(dedupKey string) (string, error) 43 | ResolveIncident(incidentID string) error 44 | } 45 | type client struct { 46 | routingKey string 47 | apiClient *sdk.Client 48 | } 49 | 50 | func NewClient(routingKey string) TestPagerDutyClient { 51 | return &client{ 52 | routingKey: routingKey, 53 | apiClient: sdk.NewClient(routingKey), 54 | } 55 | } 56 | 57 | func (c *client) TriggerIncident(alertName, clusterID string) (string, error) { 58 | summary, err := GetAlertTitle(alertName) 59 | if err != nil { 60 | return "", err 61 | } 62 | event := sdk.V2Event{ 63 | RoutingKey: c.routingKey, 64 | Action: "trigger", 65 | DedupKey: generateUUID(), 66 | Payload: &sdk.V2Payload{ 67 | Summary: summary, 68 | Source: "cad-integration-testing", 69 | Severity: "critical", 70 | Timestamp: time.Now().UTC().Format(time.RFC3339), 71 | Details: map[string]interface{}{ 72 | "alertname": alertName, 73 | "cluster_id": clusterID, 74 | }, 75 | }, 76 | } 77 | resp, err := sdk.ManageEventWithContext(context.Background(), event) 78 | if err != nil { 79 | return "", err 80 | } 81 | return resp.DedupKey, nil 82 | } 83 | 84 | func (c *client) GetIncidentID(dedupKey string) (string, error) { 85 | // Implementation can be added if needed 86 | return "", nil 87 | } 88 | 89 | func (c *client) ResolveIncident(incidentID string) error { 90 | // Implementation can be added if needed 91 | return nil 92 | } 93 | 94 | func generateUUID() string { 95 | b := make([]byte, 16) 96 | _, err := rand.Read(b) 97 | if err != nil { 98 | // Fallback to timestamp-based if crypto/rand fails 99 | return fmt.Sprintf("%d", time.Now().UnixNano()) 100 | } 101 | // Set version (4) and variant bits 102 | b[6] = (b[6] & 0x0f) | 0x40 // Version 4 103 | b[8] = (b[8] & 0x3f) | 0x80 // Variant 10 104 | return fmt.Sprintf("%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:16]) 105 | } 106 | 
-------------------------------------------------------------------------------- /test/e2e/utils/utils.go: -------------------------------------------------------------------------------- 1 | //go:build osde2e 2 | // +build osde2e 3 | 4 | package utils 5 | 6 | import ( 7 | "fmt" 8 | 9 | cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 10 | servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" 11 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 12 | ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm" 13 | ) 14 | 15 | func GetLimitedSupportReasons(ocme2eCli *ocme2e.Client, clusterID string) (*cmv1.LimitedSupportReasonsListResponse, error) { 16 | lsResponse, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).LimitedSupportReasons().List().Send() 17 | 18 | if err != nil { 19 | return nil, fmt.Errorf("failed sending service log: %w", err) 20 | } 21 | return lsResponse, nil 22 | } 23 | 24 | func GetServiceLogs(ocmCli ocm.Client, cluster *cmv1.Cluster) (*servicelogsv1.ClusterLogsUUIDListResponse, error) { 25 | filter := "log_type='cluster-state-updates'" 26 | clusterLogsUUIDListResponse, err := ocmCli.GetServiceLog(cluster, filter) 27 | if err != nil { 28 | return nil, fmt.Errorf("Failed to get service log: %w", err) 29 | } 30 | return clusterLogsUUIDListResponse, nil 31 | } 32 | -------------------------------------------------------------------------------- /test/generate_incident.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Define the mapping of alert names to titles 5 | # Add more mappings as needed: for the standard service, we should not need to go by title but by the `alertname` field instead. 
6 | declare -A alert_mapping=( 7 | ["ClusterHasGoneMissing"]="cadtest has gone missing" 8 | ["ClusterProvisioningDelay"]="ClusterProvisioningDelay -" 9 | ["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)" 10 | ["InsightsOperatorDown"]="InsightsOperatorDown" 11 | ["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)" 12 | ["ApiErrorBudgetBurn"]="api-ErrorBudgetBurn k8sgpt test CRITICAL (1)" 13 | ["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE" 14 | ["UpgradeConfigSyncFailureOver4HrSRE"]="UpgradeConfigSyncFailureOver4HrSRE Critical (1)" 15 | ) 16 | 17 | # Function to print help message 18 | print_help() { 19 | echo "Usage: $0 " 20 | echo -n "Available alert names (comma separated): " 21 | for alert_name in "${!alert_mapping[@]}"; do 22 | echo -n "$alert_name, " 23 | done 24 | echo 25 | } 26 | # Check if the correct number of arguments is provided 27 | if [ "$#" -ne 2 ]; then 28 | print_help 29 | exit 1 30 | fi 31 | 32 | alert_name=$1 33 | cluster_id=$2 34 | time_current=$(date -u +"%Y-%m-%dT%H:%M:%SZ") 35 | 36 | # Check if the alert name is in the mapping 37 | if [ -z "${alert_mapping[$alert_name]}" ]; then 38 | echo "Error: Unknown alert name '$alert_name'" 39 | print_help 40 | exit 1 41 | fi 42 | 43 | alert_title="${alert_mapping[$alert_name]}" 44 | 45 | # Load testing routing key and test service url from vault 46 | export VAULT_ADDR="https://vault.devshift.net" 47 | export VAULT_TOKEN="$(vault login -method=oidc -token-only)" 48 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/cad-testing | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 49 | unset VAULT_ADDR VAULT_TOKEN 50 | echo 51 | 52 | dedup_key=$(uuidgen) 53 | 54 | echo "Creating incident for $alert_name" 55 | response=$(curl --silent --request POST \ 56 | --url https://events.pagerduty.com/v2/enqueue \ 57 | --header 'Accept: 
application/json' \ 58 | --header 'Content-Type: application/json' \ 59 | --data '{ 60 | "payload": { 61 | "summary": "'"${alert_title}"'", 62 | "timestamp": "'"${time_current}"'", 63 | "severity": "critical", 64 | "source": "cad-integration-testing", 65 | "custom_details": { 66 | "alertname": "'"${alert_name}"'", 67 | "cluster_id": "'"${cluster_id}"'" 68 | } 69 | }, 70 | "routing_key": "'"${pd_test_routing_key}"'", 71 | "event_action": "trigger", 72 | "dedup_key": "'"${dedup_key}"'" 73 | }') 74 | 75 | if [[ $response != *"Event processed"* ]]; then 76 | echo "Error: Couldn't create the incident" 77 | exit 1 78 | fi 79 | echo 80 | 81 | # Pagerduty seems to need a short while to create the incident 82 | # Added this as we intermittently fail to get the incident id otherwise 83 | sleep 2 84 | 85 | INCIDENT_ID=$(curl --silent --request GET \ 86 | --url "https://api.pagerduty.com/incidents?incident_key=${dedup_key}" \ 87 | --header 'Accept: application/json' \ 88 | --header "Authorization: Token token=${pd_test_token}" \ 89 | --header 'Content-Type: application/json' | jq -r '.incidents[0].id') 90 | echo $INCIDENT_ID 91 | echo '{"__pd_metadata":{"incident":{"id":"'$INCIDENT_ID'"}}}' > ./payload 92 | echo "Created ./payload" 93 | -------------------------------------------------------------------------------- /test/launch_local_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | #assuming we're launched from inside the configuration-anomaly-detection repository 5 | CAD_REPO_PATH=$(git rev-parse --show-toplevel) 6 | echo "Assuming CAD repository root is ${CAD_REPO_PATH}" 7 | 8 | #check presence of binary, assume the dnf package name is the same 9 | check_presence () { 10 | # $1 - name of the binary 11 | echo -n "Checking presence of $1..." 12 | if ! 
which "$1" 2>/dev/null >/dev/null; then
        echo "Not Found"
        echo "Try 'dnf install $1' on Fedora"
        exit 1 # was 'exit -1': POSIX exit status must be 0-255
    else
        echo "Found"
    fi
}

# Clean up all child processes (tinyproxy, proxytunnel, haproxy, backplane-api)
# whenever this script exits for any reason, including Ctrl+C.
# (The trap is on EXIT, not only SIGINT.)
trap "kill -- -$$" EXIT

check_presence "jq"
check_presence "openssl"
check_presence "tinyproxy"
check_presence "haproxy"
check_presence "proxytunnel"

#loading env vars
. ${CAD_REPO_PATH}/test/set_stage_env.sh

#checking env vars
set +u
if [[ -z "${OCM_BACKPLANE_REPO_PATH}" ]]; then
    echo "Please set OCM_BACKPLANE_REPO_PATH variable to the path of the OCM Backplane code repository"
    exit 1
fi
set -u

if ! [ $(cat ${OCM_BACKPLANE_REPO_PATH}/configs/ocm.json | jq -r .client_id) = "ocm-backplane-staging" ]; then
    echo "OCM Backplane ocm.json (${OCM_BACKPLANE_REPO_PATH}/configs/ocm.json) isn't the ocm-backplane-staging config."
    echo "Please get the config from a backplane pod on a staging backplanes0* cluster (in /ocm inside the pod)"
    echo "and place it in the configs subdirectory of the backplane-api repo."
    exit 1
fi

#checking certificate validity
if ! openssl verify ${OCM_BACKPLANE_REPO_PATH}/localhost.crt; then
    echo "Certificate ${OCM_BACKPLANE_REPO_PATH}/localhost.crt not valid, please run make dev-certs in the OCM Backplane directory as root to generate and trust the localhost certificates"
    exit 1
fi

#creating certificate file for the HAProxy
cat ${OCM_BACKPLANE_REPO_PATH}/localhost.crt ${OCM_BACKPLANE_REPO_PATH}/localhost.key > ${CAD_REPO_PATH}/test/testinfra/localhost.pem

#checking BACKPLANE_PROXY reachability
echo "Checking Proxy reachability"
if ! curl ${BACKPLANE_PROXY} -o /dev/null; then
    echo "Proxy ${BACKPLANE_PROXY} not reachable, check VPN connection"
    exit 1
fi

#run the env
echo "Starting tinyproxy on port 8888"
tinyproxy -d -c ${CAD_REPO_PATH}/test/testinfra/tinyproxy.conf > ${CAD_REPO_PATH}/test/testinfra/tinyproxy.log 2> ${CAD_REPO_PATH}/test/testinfra/tinyproxy.error.log&

echo "Starting proxytunnel on port 8091"
proxytunnel -v -p squid.corp.redhat.com:3128 -d api.stage.backplane.openshift.com:443 -a 8091 > ${CAD_REPO_PATH}/test/testinfra/proxytunnel.log 2> ${CAD_REPO_PATH}/test/testinfra/proxytunnel.error.log &

echo "Starting haproxy on port 8443"
pushd ${CAD_REPO_PATH}/test/testinfra/
haproxy -f haproxy.cfg > ${CAD_REPO_PATH}/test/testinfra/haproxy.log 2> ${CAD_REPO_PATH}/test/testinfra/haproxy.error.log &
popd

echo "Starting backplane-api on port 8001"
pushd $OCM_BACKPLANE_REPO_PATH
GIT_REPO=${CAD_REPO_PATH} make run-local-with-testremediation > ${CAD_REPO_PATH}/test/testinfra/backplan-api.log 2> ${CAD_REPO_PATH}/test/testinfra/backplan-api.error.log &
popd

echo "Environment started.
Check ${CAD_REPO_PATH}/test/testinfra/ directory for logs" 82 | echo "Run cadctl with the following command to test against the local backplane-api for remediations" 83 | echo "" 84 | echo "BACKPLANE_URL=https://localhost:8443 HTTP_PROXY=http://127.0.0.1:8888 HTTPS_PROXY=http://127.0.0.1:8888 BACKPLANE_PROXY=http://127.0.0.1:8888 ./bin/cadctl investigate --payload-path ./payload --log-level debug" 85 | echo "" 86 | echo "Send SIGINT (Ctrl+C) to terminate the local infrastructure" 87 | #keep the script alive until all child processes are cleaned up 88 | wait 89 | -------------------------------------------------------------------------------- /test/set_stage_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export VAULT_ADDR="https://vault.devshift.net" 5 | export VAULT_TOKEN="$(vault login -method=oidc -token-only)" 6 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/backplane/stg | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 7 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/ocm/ocm-cad-staging | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 8 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/pd/stg | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 9 | unset VAULT_ADDR VAULT_TOKEN 10 | 11 | 12 | PROXY_URL="http://squid.corp.redhat.com:3128" 13 | 14 | export CAD_EXPERIMENTAL_ENABLED=true 15 | export BACKPLANE_PROXY=${PROXY_URL} 16 | export AWS_PROXY=${PROXY_URL} 17 | 18 | set +euo pipefail 19 | -------------------------------------------------------------------------------- /test/testinfra/haproxy.cfg: -------------------------------------------------------------------------------- 1 | global 2 | log stderr format iso local7 3 | defaults 4 | log global 5 | mode 
http 6 | timeout connect 5000ms 7 | timeout client 50000ms 8 | timeout server 50000ms 9 | 10 | frontend https-in 11 | option httplog 12 | bind *:8443 ssl crt ./localhost.pem 13 | redirect scheme https code 301 if !{ ssl_fc } 14 | use_backend local-ocmb if { path_beg /backplane/remediat } 15 | default_backend upstream-ocmb 16 | 17 | backend upstream-ocmb 18 | http-request set-header Host api.stage.backplane.openshift.com 19 | server upstream 127.0.0.1:8091 ssl verify none 20 | 21 | backend local-ocmb 22 | server local 127.0.0.1:8001 ssl verify none 23 | -------------------------------------------------------------------------------- /test/testinfra/tinyproxy.conf: -------------------------------------------------------------------------------- 1 | Port 8888 2 | Listen 127.0.0.1 3 | Timeout 600 4 | DefaultErrorFile "/usr/share/tinyproxy/default.html" 5 | StatFile "/usr/share/tinyproxy/stats.html" 6 | LogLevel Info 7 | upstream http squid.corp.redhat.com:3128 ".com" 8 | upstream none "localhost" 9 | MaxClients 100 10 | Allow 127.0.0.1 11 | Allow ::1 12 | ViaProxyName "tinyproxy" 13 | 14 | --------------------------------------------------------------------------------