├── .ci-operator.yaml ├── .codecov.yml ├── .editorconfig ├── .gitattributes ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ └── auto-tidy-interceptor.yml ├── .gitignore ├── .golangci.yml ├── LICENSE ├── Makefile ├── OWNERS ├── OWNERS_ALIASES ├── README.md ├── boilerplate ├── _data │ ├── backing-image-tag │ └── last-boilerplate-commit ├── _lib │ ├── boilerplate-commit │ ├── boilerplate.mk │ ├── common.sh │ ├── container-make │ ├── freeze-check │ ├── release.sh │ ├── subscriber │ ├── subscriber-propose │ ├── subscriber-propose-update │ ├── subscriber-report │ ├── subscriber-report-onboarding │ ├── subscriber-report-pr │ ├── subscriber-report-release │ └── subscriber.sh ├── generated-includes.mk ├── openshift │ └── osd-container-image │ │ ├── .ci-operator.yaml │ │ ├── OWNERS_ALIASES │ │ ├── README.md │ │ ├── app-sre-build-push.sh │ │ ├── dependabot.yml │ │ ├── prow-config │ │ ├── standard.mk │ │ └── update ├── update └── update.cfg ├── build └── Dockerfile ├── cadctl ├── .gitignore ├── LICENSE ├── cmd │ ├── investigate │ │ └── investigate.go │ └── root.go └── main.go ├── dashboards └── grafana-dashboard-configuration-anomaly-detection.configmap.yaml ├── go.mod ├── go.sum ├── hack ├── bootstrap-investigation.sh └── codecov.sh ├── images ├── CadCat.png ├── cad_chgm_investigation │ ├── README.md │ ├── chgm_investigation.excalidraw │ ├── chgm_investigation_dark.png │ └── chgm_investigation_light.png └── cad_overview │ ├── cad_architecture.excalidraw │ ├── cad_architecture_dark.png │ └── cad_architecture_light.png ├── interceptor ├── README.md ├── go.mod ├── go.sum ├── main.go ├── pkg │ └── interceptor │ │ ├── metrics.go │ │ └── pdinterceptor.go └── test │ └── e2e.sh ├── openshift ├── PipelinePruning.md ├── README.md ├── assets │ └── cad_pipeline_pruning.drawio.png ├── gateway-template.yaml └── template.yaml ├── pkg ├── ai │ └── k8sgpt │ │ └── k8sgpt.go ├── aws │ ├── aws.go │ ├── aws_test.go │ └── mock │ │ └── aws.go ├── investigations │ ├── 
aitest │ │ ├── README.md │ │ ├── metadata.yaml │ │ └── testing │ │ │ └── README.md │ ├── apierrorbudgetburn │ │ ├── README.md │ │ ├── apierrorbudgetburn.go │ │ ├── metadata.yaml │ │ └── testing │ │ │ └── README.md │ ├── cannotretrieveupdatessre │ │ ├── README.md │ │ ├── cannotretrieveupdatessre.go │ │ ├── cannotretrieveupdatessre_test.go │ │ ├── metadata.yaml │ │ └── testing │ │ │ └── README.md │ ├── ccam │ │ ├── ccam.go │ │ └── ccam_test.go │ ├── chgm │ │ ├── README.md │ │ ├── chgm.go │ │ ├── chgm_hibernation_check.go │ │ ├── chgm_hibernation_check_test.go │ │ ├── chgm_suite_test.go │ │ ├── chgm_test.go │ │ ├── util.go │ │ └── util_test.go │ ├── clustermonitoringerrorbudgetburn │ │ ├── clustermonitoringerrorbudgetburn.go │ │ ├── clustermonitoringerrorbudgetburn_test.go │ │ └── metadata.yaml │ ├── cpd │ │ └── cpd.go │ ├── insightsoperatordown │ │ ├── insightsoperatordown.go │ │ ├── insightsoperatordown_test.go │ │ ├── metadata.yaml │ │ └── testing │ │ │ ├── README.md │ │ │ └── block-api-openshift.sh │ ├── investigation │ │ └── investigation.go │ ├── machinehealthcheckunterminatedshortcircuitsre │ │ ├── machinehealthcheckunterminatedshortcircuitsre.go │ │ ├── machinehealthcheckunterminatedshortcircuitsre_test.go │ │ ├── metadata.yaml │ │ ├── recommendation.go │ │ └── testing │ │ │ ├── README.md │ │ │ ├── srep-worker-healthcheck_machinehealthcheck.yaml │ │ │ ├── unstoppable_pdb.yaml │ │ │ └── unstoppable_workload.yaml │ ├── pruningcronjoberror │ │ └── metadata.yaml │ ├── registry.go │ ├── upgradeconfigsyncfailureover4hr │ │ ├── README.md │ │ ├── metadata.yaml │ │ ├── upgradeconfigsyncfailureover4hr.go │ │ └── upgradeconfigsyncfailureover4hr_test.go │ └── utils │ │ ├── machine │ │ ├── machine.go │ │ └── machine_test.go │ │ └── node │ │ ├── node.go │ │ └── node_test.go ├── k8s │ ├── client.go │ ├── errors.go │ ├── errors_test.go │ └── scheme.go ├── logging │ └── logging.go ├── managedcloud │ └── managedcloud.go ├── metrics │ ├── README.md │ └── metrics.go ├── 
networkverifier │ ├── networkverifier.go │ ├── networkverifier_suite_test.go │ └── networkverifier_test.go ├── notewriter │ ├── notewriter.go │ └── notewriter_test.go ├── ocm │ ├── mock │ │ └── ocmmock.go │ ├── ocm.go │ └── ocm_config.go ├── pagerduty │ ├── errors.go │ ├── mock │ │ └── pagerdutymock.go │ ├── pagerduty.go │ ├── pagerduty_suite_test.go │ ├── pagerduty_test.go │ └── types.go └── utils │ ├── utils.go │ └── utils_suite_test.go └── test ├── e2e ├── Dockerfile ├── configuration_anomaly_detection_runner_test.go ├── configuration_anomaly_detection_test.go ├── project.mk ├── test-e2e-suite-template.yml └── utils │ ├── aws.go │ ├── generate_incident.go │ └── utils.go ├── generate_incident.sh ├── launch_local_env.sh ├── set_stage_env.sh └── testinfra ├── haproxy.cfg └── tinyproxy.conf /.ci-operator.yaml: -------------------------------------------------------------------------------- 1 | build_root_image: 2 | name: boilerplate 3 | namespace: openshift 4 | tag: image-v7.3.0 5 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: no 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "20...100" 9 | 10 | status: 11 | project: no 12 | patch: no 13 | changes: no 14 | 15 | parsers: 16 | gcov: 17 | branch_detection: 18 | conditional: yes 19 | loop: yes 20 | method: no 21 | macro: no 22 | 23 | comment: 24 | layout: "reach,diff,flags,tree" 25 | behavior: default 26 | require_changes: no -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | 9 | [*.go] 10 | indent_style = tab 11 | tab_width = 4 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ### BEGIN BOILERPLATE GENERATED -- DO NOT EDIT ### 2 | ### This block must be the last thing in your ### 3 | ### .gitattributes file; otherwise the 'validate' ### 4 | ### CI check will fail. ### 5 | # Used to ensure nobody mucked with boilerplate files. 6 | boilerplate/_lib/freeze-check linguist-generated=false 7 | # Show the boilerplate commit hash update. It's only one line anyway. 8 | boilerplate/_data/last-boilerplate-commit linguist-generated=false 9 | # Used by freeze-check. Good place for attackers to inject badness. 10 | boilerplate/update linguist-generated=false 11 | # Make sure attackers can't hide changes to this configuration 12 | .gitattributes linguist-generated=false 13 | ### END BOILERPLATE GENERATED ### 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directories: 5 | - "/" 6 | - "interceptor/" 7 | allow: 8 | - dependency-type: all 9 | schedule: 10 | interval: "daily" 11 | - package-ecosystem: "docker" 12 | directory: "/build" 13 | labels: 14 | - "area/dependency" 15 | - "ok-to-test" 16 | schedule: 17 | interval: "weekly" 18 | ignore: 19 | - dependency-name: "redhat-services-prod/openshift/boilerplate" 20 | # don't upgrade boilerplate via these means 21 | - dependency-name: "openshift4/ose-operator-registry" 22 | # don't upgrade ose-operator-registry via these means 23 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### What type of PR is this? 
2 | 3 | (feature/bug/documentation/other) 4 | 5 | ### What this PR does / Why we need it? 6 | 7 | ### Special notes for your reviewer 8 | 9 | ### Test Coverage 10 | #### Guidelines for CAD investigations 11 | - New investigations should be accompanied by unit tests and/or step-by-step manual tests in the investigation README. 12 | - E2E testing is desired for actioning investigations. See README for more info on investigation graduation process. 13 | 14 | #### Test coverage checks 15 | - [ ] Added tests 16 | - [ ] Created jira card to add unit test 17 | - [ ] This PR may not need unit tests 18 | 19 | ### Pre-checks (if applicable) 20 | - [ ] Ran unit tests locally 21 | - [ ] Validated the changes in a cluster 22 | - [ ] Included documentation changes with PR 23 | -------------------------------------------------------------------------------- /.github/workflows/auto-tidy-interceptor.yml: -------------------------------------------------------------------------------- 1 | name: Auto tidy interceptor after cadctl changes 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, synchronize] 6 | paths: 7 | - 'go.mod' 8 | - 'go.sum' 9 | - '**/*.go' 10 | - '!interceptor/**' 11 | 12 | permissions: 13 | contents: write 14 | pull-requests: write 15 | 16 | jobs: 17 | tidy: 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Checkout PR branch 22 | uses: actions/checkout@v4 23 | with: 24 | ref: ${{ github.event.pull_request.head.ref }} 25 | repository: ${{ github.event.pull_request.head.repo.full_name }} 26 | token: ${{ secrets.GITHUB_TOKEN }} 27 | 28 | - name: Set up Go 29 | uses: actions/setup-go@v5 30 | with: 31 | go-version: stable 32 | 33 | - name: Run go mod tidy in interceptor 34 | working-directory: interceptor 35 | run: go mod tidy 36 | 37 | - name: Check for changes when run go mod tidy in interceptor 38 | id: diffcheck 39 | run: | 40 | if [[ -n "$(git status --porcelain interceptor/go.mod interceptor/go.sum)" ]]; then 41 | echo "changes=true" >> $GITHUB_OUTPUT
42 | else 43 | echo "changes=false" >> $GITHUB_OUTPUT 44 | fi 45 | 46 | - name: Commit and push if there are changes 47 | if: steps.diffcheck.outputs.changes == 'true' 48 | uses: EndBug/add-and-commit@v9 49 | with: 50 | message: "On PR: tidy interceptor go.mod after cadctl go.mod update" 51 | add: "interceptor/go.mod interceptor/go.sum" 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | dist 3 | *.out 4 | .docker 5 | .envrc 6 | .idea 7 | .vscode 8 | cad_testing 9 | e2e-suite.test 10 | payload 11 | test/testinfra/*.log 12 | test/testinfra/*.pem 13 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | run: 3 | modules-download-mode: readonly 4 | output: 5 | path-prefix: "" 6 | linters: 7 | default: none 8 | enable: 9 | - asasalint 10 | - asciicheck 11 | - bidichk 12 | - bodyclose 13 | - decorder 14 | - dupword 15 | - durationcheck 16 | - errcheck 17 | - errchkjson 18 | - errname 19 | - errorlint 20 | - ginkgolinter 21 | - gocheckcompilerdirectives 22 | - goconst 23 | - gocritic 24 | - gocyclo 25 | - goheader 26 | - gomodguard 27 | - gosec 28 | - govet 29 | - grouper 30 | - importas 31 | - ineffassign 32 | - loggercheck 33 | - maintidx 34 | - makezero 35 | - misspell 36 | - nestif 37 | - nilerr 38 | - nilnil 39 | - noctx 40 | - nolintlint 41 | - nosprintfhostport 42 | - prealloc 43 | - predeclared 44 | - promlinter 45 | - reassign 46 | - revive 47 | - rowserrcheck 48 | - staticcheck 49 | - thelper 50 | - tparallel 51 | - unconvert 52 | - unused 53 | - usestdlibvars 54 | - wastedassign 55 | - whitespace 56 | settings: 57 | nestif: 58 | min-complexity: 10 59 | revive: 60 | rules: 61 | - name: dot-imports 62 | arguments: 63 | - allowedPackages: 64 | - github.com/onsi/ginkgo/v2 65 | - 
github.com/onsi/gomega 66 | severity: warning 67 | disabled: false 68 | exclude: 69 | - "" 70 | exclusions: 71 | generated: lax 72 | paths: 73 | - third_party$ 74 | - builtin$ 75 | - examples$ 76 | formatters: 77 | enable: 78 | - gofmt 79 | - gofumpt 80 | - goimports 81 | exclusions: 82 | generated: lax 83 | paths: 84 | - third_party$ 85 | - builtin$ 86 | - examples$ 87 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_REGISTRY?=quay.io 2 | IMAGE_REPOSITORY?=app-sre 3 | IMAGE_NAME?=configuration-anomaly-detection 4 | DOCKERFILE?=./build/Dockerfile 5 | define ADDITIONAL_IMAGE_SPECS 6 | ./build/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/$(IMAGE_NAME):$(CURRENT_COMMIT) 7 | endef 8 | 9 | include boilerplate/generated-includes.mk 10 | include test/e2e/project.mk 11 | 12 | GOLANGCI_LINT_VERSION=v2.0.2 13 | MOCKGEN_VERSION=v0.5.0 14 | 15 | .DEFAULT_GOAL := all 16 | 17 | help: # Display this help 18 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[0-9A-Za-z_-]+:.*?##/ { printf " \033[36m%-50s\033[0m %s\n", $$1, $$2 } /^\$$\([0-9A-Za-z_-]+\):.*?##/ { gsub("_","-", $$1); printf " \033[36m%-50s\033[0m %s\n", tolower(substr($$1, 3, length($$1)-7)), $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 19 | 20 | ##@ Global: 21 | .PHONY: all 22 | all: interceptor cadctl ## Generate, build, lint, test all subprojects 23 | 24 | .PHONY: build 25 | build: build-interceptor build-cadctl ## Build all subprojects in this repository 26 | 27 | .PHONY: lint 28 | lint: lint-cadctl lint-interceptor ## Lint all subprojects 29 | 30 | .PHONY: test 31 | test: test-cadctl test-interceptor 32 | 33 | ##@ cadctl: 34 | .PHONY: cadctl 35 | cadctl: generate-cadctl build-cadctl test-cadctl lint-cadctl ## Run all targets for cadctl (generate, build, test, lint, generation) 36 | 37 | .PHONY: generate-cadctl 38 | 
generate-cadctl: check-go121-install install-mockgen ## Generate mocks for cadctl 39 | go generate -mod=readonly ./... 40 | 41 | .PHONY: build-cadctl 42 | build-cadctl: check-go121-install ## Build the cadctl binary 43 | @echo 44 | @echo "Building cadctl..." 45 | cd cadctl && go build -ldflags="-s -w" -mod=readonly -trimpath -o ../bin/cadctl . 46 | 47 | .PHONY: lint-cadctl 48 | lint-cadctl: install-linter ## Lint cadctl subproject 49 | @echo 50 | @echo "Linting cadctl..." 51 | # Explicitly set GOROOT, see https://github.com/golangci/golangci-lint/issues/3107 52 | GOROOT=$$(go env GOROOT) GOLANGCI_LINT_CACHE=$$(mktemp -d) $(GOPATH)/bin/golangci-lint run -c .golangci.yml 53 | 54 | .PHONY: test-cadctl 55 | test-cadctl: check-go121-install ## Run automated tests for cadctl 56 | @echo 57 | @echo "Running unit tests for cadctl..." 58 | go test $(TESTOPTS) -race -mod=readonly ./cadctl/... ./pkg/... 59 | 60 | ##@ Interceptor: 61 | .PHONY: interceptor 62 | interceptor: build-interceptor test-interceptor test-interceptor-e2e lint-interceptor ## Run all targets for interceptor (build, test, lint) 63 | 64 | .PHONY: build-interceptor 65 | build-interceptor: check-go121-install ## Build the interceptor binary 66 | @echo 67 | @echo "Building interceptor..." 68 | cd interceptor && go build -ldflags="-s -w" -mod=readonly -trimpath -o ../bin/interceptor . 69 | 70 | .PHONY: lint-interceptor 71 | lint-interceptor: install-linter ## Lint interceptor subproject 72 | @echo 73 | @echo "Linting interceptor..." 74 | # Explicitly set GOROOT, see https://github.com/golangci/golangci-lint/issues/3107 75 | cd interceptor && GOROOT=$$(go env GOROOT) GOLANGCI_LINT_CACHE=$$(mktemp -d) $(GOPATH)/bin/golangci-lint run -c ../.golangci.yml 76 | 77 | .PHONY: test-interceptor 78 | test-interceptor: check-go121-install check-jq-install build-interceptor ## Run unit tests for interceptor 79 | @echo 80 | @echo "Running unit tests for interceptor..." 81 | cd interceptor && go test -race -mod=readonly ./... 
82 | 83 | .PHONY: test-interceptor-e2e 84 | test-interceptor-e2e: check-go121-install check-jq-install check-vault-install build-interceptor ## Run e2e tests for interceptor 85 | @echo 86 | @echo "Running e2e tests for interceptor..." 87 | cd interceptor && ./test/e2e.sh 88 | 89 | ##@ Boilerplate: 90 | .PHONY: boilerplate 91 | bootstrap-investigation: ## Bootstrap a new boilerplate investigation 92 | @cd hack && ./bootstrap-investigation.sh 93 | 94 | 95 | .PHONY: boilerplate-update 96 | boilerplate-update: ## Update boilerplate version 97 | @boilerplate/update 98 | 99 | ### CI Only 100 | .PHONY: coverage 101 | coverage: 102 | hack/codecov.sh 103 | 104 | .PHONY: validate 105 | validate: isclean 106 | 107 | ### Prerequisites 108 | ### It is assumed that 'make' is already installed 109 | ### Version of go is checked but the version the tools are not checked as this should not matter much. 110 | .PHONY: check-%-install 111 | check-%-install: 112 | @type $* 1> /dev/null || (>&2 echo && echo "'$*' IS NOT INSTALLED - install it manually" && echo && false) 113 | 114 | .PHONY: check-go121-install 115 | check-go121-install: 116 | @(type go 1> /dev/null && go version | grep -q 'go[1-9].[2-9][1-9]') || (>&2 echo && echo "'go' WITH VERSION >= 1.21 IS NOT INSTALLED - install it manually" && echo && false) 117 | 118 | .PHONY: install-linter 119 | install-linter: check-curl-install check-go121-install 120 | @ls $(GOPATH)/bin/golangci-lint 1>/dev/null || (echo && echo "Installing 'golangci-lint'..." && mkdir -p $(GOPATH)/bin && curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GOPATH)/bin $(GOLANGCI_LINT_VERSION)) 121 | 122 | .PHONY: install-mockgen 123 | install-mockgen: check-go121-install 124 | @type mockgen 1> /dev/null || (echo && echo "Installing 'mockgen'..." 
&& go install go.uber.org/mock/mockgen@$(MOCKGEN_VERSION)) 125 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | reviewers: 2 | - Makdaam 3 | - Nikokolas3270 4 | - rafael-azevedo 5 | - RaphaelBut 6 | - bng0y 7 | - typeid 8 | - tnierman 9 | - zmird-r 10 | - joshbranham 11 | - MateSaary 12 | - srep-functional-team-orange 13 | approvers: 14 | - Makdaam 15 | - Nikokolas3270 16 | - rafael-azevedo 17 | - RaphaelBut 18 | - bng0y 19 | - typeid 20 | - tnierman 21 | - zmird-r 22 | - joshbranham 23 | - MateSaary 24 | - srep-functional-team-orange 25 | - srep-team-leads 26 | maintainers: 27 | - rafael-azevedo 28 | -------------------------------------------------------------------------------- /OWNERS_ALIASES: -------------------------------------------------------------------------------- 1 | # ================================ DO NOT EDIT ================================ 2 | # This file is managed in https://github.com/openshift/boilerplate 3 | # See the OWNERS_ALIASES docs: https://git.k8s.io/community/contributors/guide/owners.md#OWNERS_ALIASES 4 | # ============================================================================= 5 | aliases: 6 | srep-functional-team-aurora: 7 | - abyrne55 8 | - dakotalongRH 9 | - joshbranham 10 | - luis-falcon 11 | - reedcort 12 | srep-functional-team-fedramp: 13 | - tonytheleg 14 | - theautoroboto 15 | - rhdedgar 16 | - katherinelc321 17 | - rojasreinold 18 | - fsferraz-rh 19 | srep-functional-team-hulk: 20 | - a7vicky 21 | - ravitri 22 | - shitaljante 23 | - devppratik 24 | - Tafhim 25 | - tkong-redhat 26 | - TheUndeadKing 27 | - vaidehi411 28 | - chamalabey 29 | srep-functional-team-orange: 30 | - bergmannf 31 | - Makdaam 32 | - Nikokolas3270 33 | - RaphaelBut 34 | - MateSaary 35 | - rolandmkunkel 36 | - petrkotas 37 | - zmird-r 38 | - evlin-rh 39 | - hectorakemp 40 | srep-functional-team-rocket: 41 | - aliceh 42 | 
- anispate 43 | - clcollins 44 | - Mhodesty 45 | - nephomaniac 46 | - tnierman 47 | srep-functional-team-security: 48 | - jaybeeunix 49 | - sam-nguyen7 50 | - wshearn 51 | - dem4gus 52 | - npecka 53 | - pshickeydev 54 | - casey-williams-rh 55 | - boranx 56 | srep-functional-team-thor: 57 | - bmeng 58 | - MitaliBhalla 59 | - feichashao 60 | - samanthajayasinghe 61 | - xiaoyu74 62 | - Dee-6777 63 | - Tessg22 64 | - smarthall 65 | srep-infra-cicd: 66 | - mmazur 67 | - mrsantamaria 68 | - ritmun 69 | - jbpratt 70 | - yiqinzhang 71 | srep-functional-leads: 72 | - abyrne55 73 | - clcollins 74 | - Nikokolas3270 75 | - theautoroboto 76 | - smarthall 77 | - sam-nguyen7 78 | - ravitri 79 | srep-team-leads: 80 | - rafael-azevedo 81 | - iamkirkbater 82 | - rogbas 83 | - fahlmant 84 | - dustman9000 85 | - wanghaoran1988 86 | - bng0y 87 | - bmeng 88 | - typeid 89 | sre-group-leads: 90 | - apahim 91 | - maorfr 92 | - rogbas 93 | srep-architects: 94 | - jharrington22 95 | - cblecker 96 | -------------------------------------------------------------------------------- /boilerplate/_data/backing-image-tag: -------------------------------------------------------------------------------- 1 | image-v7.3.0 2 | -------------------------------------------------------------------------------- /boilerplate/_data/last-boilerplate-commit: -------------------------------------------------------------------------------- 1 | 933276b05c4d7c6a049aad2a1b291de3281b1a7b 2 | -------------------------------------------------------------------------------- /boilerplate/_lib/boilerplate.mk: -------------------------------------------------------------------------------- 1 | .PHONY: boilerplate-commit 2 | boilerplate-commit: 3 | @boilerplate/_lib/boilerplate-commit 4 | 5 | .PHONY: boilerplate-freeze-check 6 | boilerplate-freeze-check: 7 | @boilerplate/_lib/freeze-check 8 | -------------------------------------------------------------------------------- /boilerplate/_lib/container-make: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ "$1" == "-h"* ]] || [[ "$1" == "--h"* ]]; then 4 | echo "Usage: $0 {arguments to the real 'make'}" 5 | echo "Runs 'make' in the boilerplate backing container." 6 | echo "If the command fails, starts a shell in the container so you can debug." 7 | exit -1 8 | fi 9 | 10 | source ${0%/*}/common.sh 11 | 12 | CONTAINER_ENGINE="${CONTAINER_ENGINE:-$(command -v podman || command -v docker)}" 13 | [[ -n "$CONTAINER_ENGINE" ]] || err "Couldn't find a container engine. Are you already in a container?" 14 | 15 | # Make sure the mount inside the container is named in such a way that 16 | # - openapi-gen (which relies on GOPATH) produces absolute paths; and 17 | # - other go-ish paths are writeable, e.g. for `go mod download`. 18 | CONTAINER_MOUNT=/go/src/$(repo_import $REPO_ROOT) 19 | 20 | # First set up a detached container with the repo mounted. 21 | banner "Starting the container" 22 | CE_OPTS="--platform=linux/amd64" 23 | if [[ "${CONTAINER_ENGINE##*/}" == "podman" ]]; then 24 | CE_OPTS="${CE_OPTS} --userns keep-id" 25 | fi 26 | if [[ "${CONTAINER_ENGINE##*/}" == "podman" ]] && [[ $OSTYPE == *"linux"* ]]; then 27 | CE_OPTS="${CE_OPTS} -v $REPO_ROOT:$CONTAINER_MOUNT:Z" 28 | else 29 | CE_OPTS="${CE_OPTS} -v $REPO_ROOT:$CONTAINER_MOUNT" 30 | fi 31 | container_id=$($CONTAINER_ENGINE run -d ${CE_OPTS} $IMAGE_PULL_PATH sleep infinity) 32 | 33 | if [[ $? -ne 0 ]] || [[ -z "$container_id" ]]; then 34 | err "Couldn't start detached container" 35 | fi 36 | 37 | # Now run our `make` command in it with the right UID and working directory 38 | args="exec -it -u $(id -u):0 -w $CONTAINER_MOUNT $container_id" 39 | banner "Running: make $@" 40 | $CONTAINER_ENGINE $args make "$@" 41 | rc=$? 42 | 43 | # If it failed, drop into the container in a shell 44 | if [[ $rc -ne 0 ]]; then 45 | banner "The 'make' command failed! Starting a shell in the container for debugging. 
Just 'exit' when done." 46 | $CONTAINER_ENGINE $args /bin/bash 47 | fi 48 | 49 | # Finally, remove the container 50 | banner "Cleaning up the container" 51 | $CONTAINER_ENGINE rm -f $container_id >/dev/null 52 | -------------------------------------------------------------------------------- /boilerplate/_lib/freeze-check: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # NOTE: For security reasons, everything imported or invoked (even 4 | # indirectly) by this script should be audited for vulnerabilities and 5 | # explicitly excluded from `linguist-generated` in the consuming 6 | # repository's .gitattributes. In other words, we want PRs to show 7 | # deltas to this script and all its dependencies by default so that 8 | # attempts to inject or circumvent code are visible. 9 | 10 | set -e 11 | 12 | REPO_ROOT=$(git rev-parse --show-toplevel) 13 | # Hardcoded rather than sourced to reduce attack surface. 14 | BOILERPLATE_GIT_REPO=https://github.com/openshift/boilerplate.git 15 | 16 | # Validate that no subscribed boilerplate artifacts have been changed. 17 | # PR checks may wish to gate on this. 18 | 19 | # This works by grabbing the commit hash of the boilerplate repository 20 | # at which the last update was applied, running the main `update` driver 21 | # against that, and failing if there's a resulting diff. 22 | 23 | # If we can't tell what that commit was, we must assume this is the 24 | # first update, and we'll (noisily) "succeed". 25 | 26 | # Note that this ought to work when you've just committed an update, 27 | # even if you've changed your update.cfg beforehand. We're basically 28 | # making sure you didn't muck with anything after updating. 29 | 30 | # For this to work, you have to be starting from a clean repository 31 | # state (any changes committed). 
32 | # TODO(efried): This is not ideal -- it would be nice if I could check 33 | # this before committing my changes -- but how would that work? Diff to 34 | # a file, create a temporary commit, run the rest, remove the commit, 35 | # and reapply the diff? Messy and error-prone -- and I would be 36 | # seriously ticked off if something went wrong and lost my in-flight 37 | # changes. 38 | if ! [ -z "$(git status --porcelain -- ':!build/Dockerfile*')" ]; then 39 | echo "Can't validate boilerplate in a dirty repository. Please commit your changes and try again." >&2 40 | exit 1 41 | fi 42 | 43 | # We glean the last boilerplate commit from the 44 | # last-boilerplate-commit file, which gets laid down by the main 45 | # `update` driver each time it runs. 46 | LBCF=${REPO_ROOT}/boilerplate/_data/last-boilerplate-commit 47 | if ! [[ -f "$LBCF" ]]; then 48 | echo "Couldn't discover last boilerplate commit! Assuming you're bootstrapping." 49 | exit 0 50 | fi 51 | LBC=$(cat $LBCF) 52 | 53 | # Download just that commit 54 | echo "Fetching $LBC from $BOILERPLATE_GIT_REPO" 55 | # boilerplate/update cleans up this temp dir 56 | TMPD=$(mktemp -d) 57 | cd $TMPD 58 | git init 59 | # TODO(efried): DRY this remote. Make it configurable? 60 | git remote add origin $BOILERPLATE_GIT_REPO 61 | git fetch origin $(cat $LBCF) --tags 62 | git reset --hard FETCH_HEAD 63 | 64 | # Now invoke the update script, overriding the source repository we've 65 | # just downloaded at the appropriate commit. 66 | # We invoke the script explicitly rather than via the make target to 67 | # close a security hole whereby the latter is overridden. 68 | echo "Running update" 69 | cd $REPO_ROOT 70 | BOILERPLATE_GIT_REPO="${TMPD}" boilerplate/update 71 | 72 | # Okay, if anything has changed, that's bad. 73 | if [[ $(git status --porcelain -- ':!build/Dockerfile*' | wc -l) -ne 0 ]]; then 74 | echo "Your boilerplate is dirty!" 
>&2 75 | git status --porcelain -- ':!build/Dockerfile*' 76 | exit 1 77 | fi 78 | 79 | echo "Your boilerplate is clean!" 80 | exit 0 81 | -------------------------------------------------------------------------------- /boilerplate/_lib/release.sh: -------------------------------------------------------------------------------- 1 | # Helpers and variables for dealing with openshift/release 2 | 3 | # NOTE: This library is sourced from user-run scripts. It should not be 4 | # sourced in CI, as it relies on git config that's not necessarily 5 | # present there. 6 | 7 | RELEASE_REPO=openshift/release 8 | 9 | ## Information about the boilerplate consumer 10 | # E.g. "openshift/my-wizbang-operator" 11 | CONSUMER=$(repo_name .) 12 | [[ -z "$CONSUMER" ]] && err " 13 | Failed to determine current repository name" 14 | # 15 | # E.g. "openshift" 16 | CONSUMER_ORG=${CONSUMER%/*} 17 | [[ -z "$CONSUMER_ORG" ]] && err " 18 | Failed to determine consumer org" 19 | # 20 | # E.g. "my-wizbang-operator" 21 | CONSUMER_NAME=${CONSUMER#*/} 22 | [[ -z "$CONSUMER_NAME" ]] && err " 23 | Failed to determine consumer name" 24 | # 25 | # E.g. "master" 26 | # This will produce something like refs/remotes/origin/master 27 | DEFAULT_BRANCH=$(git symbolic-ref refs/remotes/upstream/HEAD 2>/dev/null || git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null || echo defaulting/to/master) 28 | # Strip off refs/remotes/{upstream|origin}/ 29 | DEFAULT_BRANCH=${DEFAULT_BRANCH##*/} 30 | [[ -z "$DEFAULT_BRANCH" ]] && err " 31 | Failed to determine default branch name" 32 | 33 | ## release_process_args "$@" 34 | # 35 | # This is for use by commands expecting one optional argument which is 36 | # the file system path to a clone of the $RELEASE_REPO. 37 | # 38 | # Will invoke `usage` -- which must be defined by the caller -- if 39 | # the wrong number of arguments are received, or if the single argument 40 | # is `help` or a flag. 
41 | # 42 | # If exactly one argument is specified and it is valid, it is assigned 43 | # to the global RELEASE_CLONE variable. 44 | release_process_args() { 45 | if [[ $# -eq 1 ]]; then 46 | # Special cases for usage queries 47 | if [[ "$1" == '-'* ]] || [[ "$1" == help ]]; then 48 | usage 49 | fi 50 | 51 | [[ -d $1 ]] || err " 52 | $1: Not a directory." 53 | 54 | [[ $(repo_name $1) == "$RELEASE_REPO" ]] || err " 55 | $1 is not a clone of $RELEASE_REPO; or its 'origin' remote is not set properly." 56 | 57 | # Got a usable clone of openshift/release 58 | RELEASE_CLONE="$1" 59 | 60 | elif [[ $# -ne 0 ]]; then 61 | usage 62 | fi 63 | } 64 | 65 | ## release_validate_invocation 66 | # 67 | # Make sure we were called from a reasonable place, that being: 68 | # - A boilerplate consumer 69 | # - ...that's actually subscribed to a convention 70 | # - ...containing the script being invoked 71 | release_validate_invocation() { 72 | # Make sure we were invoked from a boilerplate consumer. 73 | [[ -z "$CONVENTION_NAME" ]] && err " 74 | $cmd must be invoked from a consumer of an appropriate convention. Where did you get this script from?" 75 | # Or at least not from boilerplate itself 76 | [[ "$CONSUMER" == "openshift/boilerplate" ]] && err " 77 | $cmd must be invoked from a boilerplate consumer, not from boilerplate itself." 78 | 79 | [[ -s $CONVENTION_ROOT/_data/last-boilerplate-commit ]] || err " 80 | $cmd must be invoked from a boilerplate consumer!" 81 | 82 | grep -E -q "^$CONVENTION_NAME(\s.*)?$" $CONVENTION_ROOT/update.cfg || err " 83 | $CONSUMER is not subscribed to $CONVENTION_NAME!" 84 | } 85 | 86 | ## release_prep_clone 87 | # 88 | # If $RELEASE_CLONE is already set: 89 | # - It should represent a directory containing a clean checkout of the 90 | # release repository; otherwise we error. 91 | # - We checkout and pull master. 92 | # Otherwise: 93 | # - We clone the release repo to a temporary directory. 
94 | # - We set the $RELEASE_CLONE global variable to point to that 95 | # directory. 96 | release_prep_clone() { 97 | # If a release repo clone wasn't specified, create one 98 | if [[ -z "$RELEASE_CLONE" ]]; then 99 | RELEASE_CLONE=$(mktemp -dt openshift_release_XXXXXXX) 100 | git clone --depth=1 git@github.com:${RELEASE_REPO}.git $RELEASE_CLONE 101 | else 102 | [[ -z "$(git -C $RELEASE_CLONE status --porcelain)" ]] || err " 103 | Your release clone must start clean." 104 | # These will blow up if it's misconfigured 105 | git -C $RELEASE_CLONE checkout master 106 | git -C $RELEASE_CLONE pull 107 | fi 108 | } 109 | 110 | ## release_done_msg BRANCH 111 | # 112 | # Print exit instructions for submitting the release PR. 113 | # BRANCH is a suggested branch name. 114 | release_done_msg() { 115 | echo 116 | git status 117 | 118 | cat < $TMPD/$f 46 | echo $TMPD/$f 47 | return 48 | fi 49 | done 50 | } 51 | 52 | ## expected_prow_config ORG PROJ BRANCH 53 | # 54 | # Prints to stdout the expected prow configuration for the specified 55 | # ORG/PROJ. 56 | expected_prow_config() { 57 | local org=$1 58 | local consumer_name=$2 59 | local branch=$3 60 | # TODO: DRY this with what's in prow-config. 61 | # Do it by making it a template in the convention dir. 62 | cat < Note: The repository's main `Makefile` needs to be edited to have the following line: 6 | 7 | ```make 8 | include boilerplate/generated-includes.mk 9 | ``` 10 | 11 | ## `make` targets and functions 12 | 13 | The provided `Makefile` will build and push a container image defined by a Dockerfile at `build/Dockerfile`. 
If multiple containers are contained in the repo, they can also be managed by defining an `ADDITIONAL_IMAGE_SPECS` variable like so: 14 | 15 | ```make 16 | define ADDITIONAL_IMAGE_SPECS 17 | ./path/to/a/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/a-image:v1.2.3 18 | ./path/to/b/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/b-image:v4.5.6 19 | endef 20 | ``` 21 | 22 | | Makefile target | Description | 23 | |---|---| 24 | | `make osd-container-image-build` | Build the default container at `build/Dockerfile` and tag it based on the commit. Specify `DOCKERFILE` and `IMAGE_URI` to build other containers. | 25 | | `make osd-container-image-push` | Push the default container. | 26 | | `make osd-container-image-build-push` | Build and push the default container and `ADDITIONAL_IMAGE_SPECS`. Meant to be run by app-interface. | 27 | | `make isclean` | Ensure the local git checkout is clean. | 28 | | `make prow-config` | Updates the corresponding Prow config file in [openshift/release](https://github.com/openshift/release) to run `make test` on merge requests. This `test` make target should be defined by the consumer. If this is a new repository it should be onboarded to openshift/release first before this is run. | 29 | 30 | ## Linting/Testing 31 | 32 | This boilerplate convention does not contain any linting or testing guidelines to support a variety of containers. Those `Makefile` targets should be defined by the consumer themselves. 
33 | -------------------------------------------------------------------------------- /boilerplate/openshift/osd-container-image/app-sre-build-push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ev 4 | 5 | usage() { 6 | cat < $config_dir/$config 54 | build_root: 55 | from_repository: true 56 | images: 57 | - dockerfile_path: build/Dockerfile 58 | to: unused 59 | resources: 60 | '*': 61 | limits: 62 | memory: 4Gi 63 | requests: 64 | cpu: 100m 65 | memory: 200Mi 66 | tests: 67 | - as: test 68 | commands: make test 69 | container: 70 | from: src 71 | zz_generated_metadata: 72 | branch: ${DEFAULT_BRANCH} 73 | org: ${CONSUMER_ORG} 74 | repo: ${CONSUMER_NAME} 75 | EOF 76 | 77 | make jobs 78 | 79 | release_done_msg $release_branch 80 | -------------------------------------------------------------------------------- /boilerplate/openshift/osd-container-image/standard.mk: -------------------------------------------------------------------------------- 1 | # Validate variables in project.mk exist 2 | IMAGE_REGISTRY?=quay.io 3 | IMAGE_REPOSITORY?=app-sre 4 | REGISTRY_USER?=$(QUAY_USER) 5 | REGISTRY_TOKEN?=$(QUAY_TOKEN) 6 | 7 | VERSION_MAJOR?=0 8 | VERSION_MINOR?=1 9 | 10 | ifndef IMAGE_NAME 11 | $(error IMAGE_NAME is not set) 12 | endif 13 | 14 | ### Accommodate docker or podman 15 | # 16 | # The docker/podman creds cache needs to be in a location unique to this 17 | # invocation; otherwise it could collide across jenkins jobs. We'll use 18 | # a .docker folder relative to pwd (the repo root). 19 | CONTAINER_ENGINE_CONFIG_DIR = .docker 20 | # But docker and podman use different options to configure it :eyeroll: 21 | # ==> Podman uses --authfile=PATH *after* the `login` subcommand; but 22 | # also accepts REGISTRY_AUTH_FILE from the env. 
See 23 | # https://www.mankier.com/1/podman-login#Options---authfile=path 24 | export REGISTRY_AUTH_FILE = ${CONTAINER_ENGINE_CONFIG_DIR}/config.json 25 | # ==> Docker uses --config=PATH *before* (any) subcommand; so we'll glue 26 | # that to the CONTAINER_ENGINE variable itself. (NOTE: I tried half a 27 | # dozen other ways to do this. This was the least ugly one that actually 28 | # works.) 29 | ifndef CONTAINER_ENGINE 30 | CONTAINER_ENGINE=$(shell command -v podman 2>/dev/null || echo docker --config=$(CONTAINER_ENGINE_CONFIG_DIR)) 31 | endif 32 | 33 | # Generate version and tag information from inputs 34 | COMMIT_NUMBER=$(shell git rev-list `git rev-list --parents HEAD | grep -E "^[a-f0-9]{40}$$"`..HEAD --count) 35 | CURRENT_COMMIT=$(shell git rev-parse --short=7 HEAD) 36 | IMAGE_VERSION := $(VERSION_MAJOR).$(VERSION_MINOR).$(COMMIT_NUMBER)-$(CURRENT_COMMIT) 37 | 38 | IMAGE=$(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/$(IMAGE_NAME) 39 | IMAGE_TAG=v$(IMAGE_VERSION) 40 | IMAGE_URI?=$(IMAGE):$(IMAGE_TAG) 41 | DOCKERFILE ?=./build/Dockerfile 42 | 43 | 44 | # Consumer can optionally define ADDITIONAL_IMAGE_SPECS like: 45 | # define ADDITIONAL_IMAGE_SPECS 46 | # ./path/to/a/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/a-image:v1.2.3 47 | # ./path/to/b/Dockerfile $(IMAGE_REGISTRY)/$(IMAGE_REPOSITORY)/b-image:v4.5.6 48 | # endef 49 | # Each will be conditionally built and pushed along with the default image. 50 | define IMAGES_TO_BUILD 51 | $(DOCKERFILE) $(IMAGE_URI) 52 | $(ADDITIONAL_IMAGE_SPECS) 53 | endef 54 | export IMAGES_TO_BUILD 55 | 56 | REGISTRY_USER ?= 57 | REGISTRY_TOKEN ?= 58 | 59 | ALLOW_DIRTY_CHECKOUT?=false 60 | 61 | # TODO: Figure out how to discover this dynamically 62 | CONVENTION_DIR := boilerplate/openshift/osd-container-image 63 | 64 | # Set the default goal in a way that works for older & newer versions of `make`: 65 | # Older versions (<=3.8.0) will pay attention to the `default` target. 
66 | # Newer versions pay attention to .DEFAULT_GOAL, where unsetting it makes the next defined target the default: 67 | # https://www.gnu.org/software/make/manual/make.html#index-_002eDEFAULT_005fGOAL-_0028define-default-goal_0029 68 | .DEFAULT_GOAL := 69 | .PHONY: default 70 | default: osd-container-image-build 71 | 72 | .PHONY: isclean 73 | isclean: 74 | @(test "$(ALLOW_DIRTY_CHECKOUT)" != "false" || test 0 -eq $$(git status --porcelain | wc -l)) || (echo "Local git checkout is not clean, commit changes and try again." >&2 && git --no-pager diff && exit 1) 75 | 76 | .PHONY: osd-container-image-build 77 | osd-container-image-build: isclean 78 | ${CONTAINER_ENGINE} build --pull -f $(DOCKERFILE) -t $(IMAGE_URI) . 79 | 80 | .PHONY: osd-container-image-push 81 | osd-container-image-push: osd-container-image-login osd-container-image-build 82 | ${CONTAINER_ENGINE} push ${IMAGE_URI} 83 | 84 | .PHONY: prow-config 85 | prow-config: 86 | ${CONVENTION_DIR}/prow-config ${RELEASE_CLONE} 87 | 88 | 89 | ######################### 90 | # Targets used by app-sre 91 | ######################### 92 | 93 | .PHONY: osd-container-image-login 94 | osd-container-image-login: 95 | @test "${REGISTRY_USER}" != "" && test "${REGISTRY_TOKEN}" != "" || (echo "REGISTRY_USER and REGISTRY_TOKEN must be defined" && exit 1) 96 | mkdir -p ${CONTAINER_ENGINE_CONFIG_DIR} 97 | @${CONTAINER_ENGINE} login -u="${REGISTRY_USER}" -p="${REGISTRY_TOKEN}" quay.io 98 | 99 | # TODO: figure out how to osd-container-image-login only once across multiple `make` calls 100 | .PHONY: osd-container-image-build-push-one 101 | osd-container-image-build-push-one: isclean osd-container-image-login 102 | @(if [[ -z "${IMAGE_URI}" ]]; then echo "Must specify IMAGE_URI"; exit 1; fi) 103 | @(if [[ -z "${DOCKERFILE_PATH}" ]]; then echo "Must specify DOCKERFILE_PATH"; exit 1; fi) 104 | ${CONTAINER_ENGINE} build --pull -f $(DOCKERFILE_PATH) -t $(IMAGE_URI) .
105 | ${CONTAINER_ENGINE} push ${IMAGE_URI} 106 | 107 | # build-push: Construct, tag, and push all container images. 108 | # TODO: Boilerplate this script. 109 | .PHONY: osd-container-image-build-push 110 | osd-container-image-build-push: 111 | ${CONVENTION_DIR}/app-sre-build-push.sh "$$IMAGES_TO_BUILD" 112 | -------------------------------------------------------------------------------- /boilerplate/openshift/osd-container-image/update: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | source $CONVENTION_ROOT/_lib/common.sh 6 | 7 | # No PRE 8 | [[ "$1" == "PRE" ]] && exit 0 9 | 10 | # Expect POST 11 | [[ "$1" == "POST" ]] || err "Got a parameter I don't understand: '$1'. Did the infrastructure change?" 12 | 13 | # Add OWNERS_ALIASES to $REPO_ROOT 14 | echo "Copying OWNERS_ALIASES to your repository root." 15 | cp ${HERE}/OWNERS_ALIASES $REPO_ROOT 16 | 17 | # Add dependabot configuration 18 | mkdir -p $REPO_ROOT/.github 19 | echo "Copying dependabot.yml to .github/dependabot.yml" 20 | cp ${HERE}/dependabot.yml ${REPO_ROOT}/.github/dependabot.yml 21 | 22 | echo "Writing .ci-operator.yaml in your repository root with:" 23 | echo " namespace: $IMAGE_NAMESPACE" 24 | echo " name: $IMAGE_NAME" 25 | echo " tag: $LATEST_IMAGE_TAG" 26 | ${SED?} "s/__NAMESPACE__/$IMAGE_NAMESPACE/; s/__NAME__/$IMAGE_NAME/; s/__TAG__/$LATEST_IMAGE_TAG/" ${HERE}/.ci-operator.yaml > $REPO_ROOT/.ci-operator.yaml 27 | 28 | cat <<'EOF' 29 | 30 | ===================== 31 | THINGS YOU NEED TO DO 32 | ===================== 33 | - Make sure the following line is in your base Makefile: 34 | 35 | include boilerplate/generated-includes.mk 36 | 37 | - Remove any other 'include' lines, unless they're for things truly 38 | unique to your repository. (Otherwise, consider proposing them to 39 | boilerplate.) 40 | 41 | - Delete any obsolete files you're no longer including. 
42 | 43 | - Define a `make test` target for Prow 44 | 45 | - Have a Dockerfile in ./build/Dockerfile and define IMAGE_NAME for it. 46 | Others container images can be specified with ADDITIONAL_IMAGE_SPECS 47 | 48 | ===================== 49 | 50 | EOF 51 | -------------------------------------------------------------------------------- /boilerplate/update.cfg: -------------------------------------------------------------------------------- 1 | openshift/osd-container-image 2 | -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/redhat-services-prod/openshift/boilerplate:image-v7.0.0 as builder 2 | 3 | ADD . /opt 4 | WORKDIR /opt 5 | 6 | RUN make CGO_ENABLED=0 build-cadctl 7 | RUN make CGO_ENABLED=0 build-interceptor 8 | 9 | 10 | FROM quay.io/app-sre/ubi8-ubi-minimal:8.10 as runner 11 | 12 | COPY --from=builder /opt/bin/cadctl /bin/cadctl 13 | COPY --from=builder /opt/bin/interceptor /bin/interceptor 14 | 15 | ARG BUILD_DATE 16 | ARG VERSION 17 | ARG VCS_REF 18 | ARG DOCKERFILE_PATH 19 | 20 | LABEL vendor="RedHat" \ 21 | name="openshift/configuration-anomaly-detection" \ 22 | description="a CLI tool to detect and mitigate configuration mishaps" \ 23 | io.k8s.display-name="openshift/configuration-anomaly-detection" \ 24 | io.k8s.description="a CLI tool to detect and mitigate configuration mishaps" \ 25 | maintainer="RedHat <>" \ 26 | version="$VERSION" \ 27 | org.label-schema.build-date=$BUILD_DATE \ 28 | org.label-schema.description="a CLI tool to detect and mitigate configuration mishaps" \ 29 | org.label-schema.docker.cmd="docker run --rm openshift/configuration-anomaly-detection" \ 30 | org.label-schema.docker.dockerfile=$DOCKERFILE_PATH \ 31 | org.label-schema.name="openshift/configuration-anomaly-detection" \ 32 | org.label-schema.schema-version="0.1.0" \ 33 | org.label-schema.vcs-branch=$VCS_BRANCH \ 34 | 
org.label-schema.vcs-ref=$VCS_REF \ 35 | org.label-schema.vcs-url="https://github.com/openshift/configuration-anomaly-detection" \ 36 | org.label-schema.vendor="openshift/configuration-anomaly-detection" \ 37 | org.label-schema.version=$VERSION 38 | 39 | RUN microdnf install jq 40 | 41 | ENTRYPOINT ["/bin/cadctl"] 42 | -------------------------------------------------------------------------------- /cadctl/.gitignore: -------------------------------------------------------------------------------- 1 | /cadctl 2 | -------------------------------------------------------------------------------- /cadctl/cmd/root.go: -------------------------------------------------------------------------------- 1 | // Package cmd holds the cadctl cobra data 2 | /* 3 | Copyright © 2022 Red Hat, Inc. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | package cmd 18 | 19 | import ( 20 | investigate "github.com/openshift/configuration-anomaly-detection/cadctl/cmd/investigate" 21 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 22 | "github.com/openshift/configuration-anomaly-detection/pkg/metrics" 23 | "github.com/spf13/cobra" 24 | ) 25 | 26 | // rootCmd represents the base command when called without any subcommands 27 | var rootCmd = &cobra.Command{ 28 | Use: "cadctl", 29 | Short: "A util of configuration-anomaly-detection (CAD) checks", 30 | } 31 | 32 | // Execute adds all child commands to the root command and sets flags appropriately. 
33 | // This is called by main.main(). It only needs to happen once to the rootCmd. 34 | func Execute() { 35 | err := rootCmd.Execute() 36 | metrics.Push() 37 | if err != nil { 38 | logging.Fatal(err) 39 | } 40 | } 41 | 42 | func init() { 43 | rootCmd.AddCommand(investigate.InvestigateCmd) 44 | } 45 | -------------------------------------------------------------------------------- /cadctl/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2022 Red Hat, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package main is the main package 18 | package main 19 | 20 | import "github.com/openshift/configuration-anomaly-detection/cadctl/cmd" 21 | 22 | func main() { 23 | cmd.Execute() 24 | } 25 | -------------------------------------------------------------------------------- /hack/bootstrap-investigation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | read -p "Enter the new investigation (package) name: " INVESTIGATION_NAME 6 | if [[ "${INVESTIGATION_NAME}" == "" ]] ; then 7 | echo "Investigation name cannot be empty." 8 | exit 1 9 | elif [[ "${INVESTIGATION_NAME}" =~ [^a-zA-Z0-9_] ]] ; then 10 | echo "Investigation name must be alphanumeric." 
11 | exit 1 12 | fi 13 | 14 | read -p "Enter new investigation description: " INVESTIGATION_DESCRIPTION 15 | if [[ "${INVESTIGATION_DESCRIPTION}" == "" ]] ; then 16 | INVESTIGATION_DESCRIPTION="TODO" 17 | fi 18 | 19 | read -p "Should Investigate Alert (y/n): " INVESTIGATE_ALERT_BOOL 20 | if [[ "${INVESTIGATE_ALERT_BOOL}" == "y" ]] ; then 21 | read -p "Investigation alert string: " INVESTIGATION_ALERT_STRING 22 | INVESTIGATION_ALERT="strings.Contains(alert, \"${INVESTIGATION_ALERT_STRING}\")" 23 | elif [[ "${INVESTIGATE_ALERT_BOOL}" == "n" ]] ; then 24 | INVESTIGATION_ALERT="false" 25 | else 26 | echo "Invalid input. Please enter 'y' or 'n'." 27 | exit 1 28 | fi 29 | 30 | INVESTIGATION_NAME=$(echo "${INVESTIGATION_NAME}" | tr '[:upper:]' '[:lower:]') 31 | 32 | INVESTIGATION_DIR="../pkg/investigations/${INVESTIGATION_NAME}" 33 | 34 | if [ -d "${INVESTIGATION_DIR}" ]; then 35 | echo "Investigation of name ${INVESTIGATION_NAME} already exists." 36 | exit 1 37 | fi 38 | 39 | mkdir -p "${INVESTIGATION_DIR}" 40 | ls "${INVESTIGATION_DIR}" 41 | 42 | touch "${INVESTIGATION_DIR}/${INVESTIGATION_NAME}.go" 43 | touch "${INVESTIGATION_DIR}/metadata.yaml" 44 | touch "${INVESTIGATION_DIR}/README.md" 45 | mkdir "${INVESTIGATION_DIR}/testing/" 46 | 47 | # Create README.md file 48 | cat < "${INVESTIGATION_DIR}/README.md" 49 | # ${INVESTIGATION_NAME} Investigation 50 | 51 | ${INVESTIGATION_DESCRIPTION} 52 | 53 | ## Testing 54 | 55 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 56 | 57 | EOF 58 | 59 | # Create testing/README.md file 60 | cat < "${INVESTIGATION_DIR}/testing/README.md" 61 | # Testing ${INVESTIGATION_NAME} Investigation 62 | 63 | TODO: 64 | - Add a test script or test objects to this `testing/` directory for future maintainers to use 65 | - Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. 
Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc) 66 | EOF 67 | 68 | 69 | # Create metadata.yaml file 70 | cat < "${INVESTIGATION_DIR}/metadata.yaml" 71 | name: ${INVESTIGATION_NAME} 72 | rbac: 73 | roles: [] 74 | clusterRoleRules: [] 75 | customerDataAccess: false 76 | 77 | EOF 78 | 79 | # Create boilerplate investigation file 80 | cat < "${INVESTIGATION_DIR}/${INVESTIGATION_NAME}.go" 81 | // Package ${INVESTIGATION_NAME} contains...TODO 82 | package ${INVESTIGATION_NAME} 83 | 84 | import ( 85 | "strings" 86 | 87 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 88 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 89 | "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" 90 | ) 91 | 92 | type Investigation struct{} 93 | 94 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 95 | result := investigation.InvestigationResult{} 96 | 97 | // Initialize PagerDuty note writer 98 | notes := notewriter.New(r.Name, logging.RawLogger) 99 | defer func() { r.Notes = notes }() 100 | 101 | // TODO: Implement investigation logic here 102 | 103 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 104 | } 105 | 106 | func (c *Investigation) Name() string { 107 | return "${INVESTIGATION_NAME}" 108 | } 109 | 110 | func (c *Investigation) Description() string { 111 | return "${INVESTIGATION_DESCRIPTION}" 112 | } 113 | 114 | func (c *Investigation) ShouldInvestigateAlert(alert string) bool { 115 | return ${INVESTIGATION_ALERT} 116 | } 117 | 118 | func (c *Investigation) IsExperimental() bool { 119 | // TODO: Update to false when graduating to production. 
120 | return true 121 | } 122 | 123 | EOF 124 | 125 | echo "${INVESTIGATION_NAME} created in ${INVESTIGATION_DIR}" 126 | echo "metadata.yaml file created in ${INVESTIGATION_DIR}" 127 | 128 | # Update registry.go to contain new investigation 129 | if ! grep -q "${INVESTIGATION_NAME}" ../pkg/investigations/registry.go && ! grep -q "${INVESTIGATION_NAME}" ../pkg/investigations/registry.go; then 130 | sed -i "/import (/a \\\t\"github.com/openshift/configuration-anomaly-detection/pkg/investigations/${INVESTIGATION_NAME}\"" ../pkg/investigations/registry.go 131 | sed -i "/var availableInvestigations = \[/a \\\t&${INVESTIGATION_NAME}.Investigation{}," ../pkg/investigations/registry.go 132 | echo "${INVESTIGATION_NAME} added to registry.go" 133 | else 134 | echo "${INVESTIGATION_NAME} already exists in registry.go" 135 | fi 136 | -------------------------------------------------------------------------------- /hack/codecov.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | REPO_ROOT=$(git rev-parse --show-toplevel) 8 | CI_SERVER_URL=https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test 9 | COVER_PROFILE=${COVER_PROFILE:-coverage.out} 10 | JOB_TYPE=${JOB_TYPE:-"local"} 11 | 12 | # Default concurrency to four threads. By default it's the number of procs, 13 | # which seems to be 16 in the CI env. Some consumers' coverage jobs were 14 | # regularly getting OOM-killed; so do this rather than boost the pod resources 15 | # unreasonably. 16 | COV_THREAD_COUNT=${COV_THREAD_COUNT:-4} 17 | make -C "${REPO_ROOT}" test-cadctl TESTOPTS="-coverprofile=${COVER_PROFILE}.tmp -covermode=atomic -coverpkg=./... 
-p ${COV_THREAD_COUNT}" 18 | 19 | # Remove generated files from coverage profile 20 | grep -v "zz_generated" "${COVER_PROFILE}.tmp" > "${COVER_PROFILE}" 21 | rm -f "${COVER_PROFILE}.tmp" 22 | 23 | # Configure the git refs and job link based on how the job was triggered via prow 24 | if [[ "${JOB_TYPE}" == "presubmit" ]]; then 25 | echo "detected PR code coverage job for #${PULL_NUMBER}" 26 | REF_FLAGS="-P ${PULL_NUMBER} -C ${PULL_PULL_SHA}" 27 | JOB_LINK="${CI_SERVER_URL}/pr-logs/pull/${REPO_OWNER}_${REPO_NAME}/${PULL_NUMBER}/${JOB_NAME}/${BUILD_ID}" 28 | elif [[ "${JOB_TYPE}" == "postsubmit" ]]; then 29 | echo "detected branch code coverage job for ${PULL_BASE_REF}" 30 | REF_FLAGS="-B ${PULL_BASE_REF} -C ${PULL_BASE_SHA}" 31 | JOB_LINK="${CI_SERVER_URL}/logs/${JOB_NAME}/${BUILD_ID}" 32 | elif [[ "${JOB_TYPE}" == "local" ]]; then 33 | echo "coverage report available at ${COVER_PROFILE}" 34 | exit 0 35 | else 36 | echo "${JOB_TYPE} jobs not supported" >&2 37 | exit 1 38 | fi 39 | 40 | # Configure certain internal codecov variables with values from prow. 41 | export CI_BUILD_URL="${JOB_LINK}" 42 | export CI_BUILD_ID="${JOB_NAME}" 43 | export CI_JOB_ID="${BUILD_ID}" 44 | 45 | if [[ "${JOB_TYPE}" != "local" ]]; then 46 | if [[ -z "${ARTIFACT_DIR:-}" ]] || [[ ! -d "${ARTIFACT_DIR}" ]] || [[ ! 
-w "${ARTIFACT_DIR}" ]]; then 47 | echo '${ARTIFACT_DIR} must be set for non-local jobs, and must point to a writable directory' >&2 48 | exit 1 49 | fi 50 | curl -sS https://codecov.io/bash -o "${ARTIFACT_DIR}/codecov.sh" 51 | bash <(cat "${ARTIFACT_DIR}/codecov.sh") -Z -K -f "${COVER_PROFILE}" -r "${REPO_OWNER}/${REPO_NAME}" ${REF_FLAGS} 52 | else 53 | bash <(curl -s https://codecov.io/bash) -Z -K -f "${COVER_PROFILE}" -r "${REPO_OWNER}/${REPO_NAME}" ${REF_FLAGS} 54 | fi -------------------------------------------------------------------------------- /images/CadCat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/CadCat.png -------------------------------------------------------------------------------- /images/cad_chgm_investigation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_chgm_investigation/README.md -------------------------------------------------------------------------------- /images/cad_chgm_investigation/chgm_investigation_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_chgm_investigation/chgm_investigation_dark.png -------------------------------------------------------------------------------- /images/cad_chgm_investigation/chgm_investigation_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_chgm_investigation/chgm_investigation_light.png 
-------------------------------------------------------------------------------- /images/cad_overview/cad_architecture_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_overview/cad_architecture_dark.png -------------------------------------------------------------------------------- /images/cad_overview/cad_architecture_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/images/cad_overview/cad_architecture_light.png -------------------------------------------------------------------------------- /interceptor/README.md: -------------------------------------------------------------------------------- 1 | # CAD Tekton Interceptor 2 | 3 | The Tekton interceptor is a component plugged between the event listener and the task runs. The interceptor makes sure we don't start a pipeline for every alert we receive. Instead, alerts are filtered based on whether or not they are handled by CAD. Unhandled alerts are directly escalated and no pipeline is started. 4 | 5 | ## Testing 6 | 7 | ### E2E 8 | 9 | The interceptor has E2E tests starting the HTTP service and checking the HTTP responses. The tests are based on pre-existing PagerDuty alerts. 10 | 11 | ``` bash 12 | 13 | make e2e-interceptor 14 | 15 | # To also print the output of the interceptor service: 16 | CAD_E2E_VERBOSE=true make e2e-interceptor 17 | ``` 18 | 19 | ## Development 20 | 21 | It is possible to run the interceptor locally in a "minimal" state, where E2E is not used, and only the 22 | crucial-to-run env variables (seen below) are set as placeholders. This is useful for *local* development/debugging.
23 | 24 | ``` bash 25 | $ make build-interceptor 26 | 27 | $ CAD_SILENT_POLICY=test 28 | $ CAD_PD_TOKEN=test 29 | $ PD_SIGNATURE=test 30 | 31 | $ ./bin/interceptor 32 | ``` 33 | -------------------------------------------------------------------------------- /interceptor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "net/http" 8 | "os" 9 | "os/signal" 10 | "syscall" 11 | "time" 12 | 13 | "github.com/openshift/configuration-anomaly-detection/interceptor/pkg/interceptor" 14 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 15 | "github.com/prometheus/client_golang/prometheus/promhttp" 16 | "knative.dev/pkg/signals" 17 | "sigs.k8s.io/controller-runtime/pkg/metrics" 18 | ) 19 | 20 | const ( 21 | HTTPPort = 8080 22 | readTimeout = 5 * time.Second 23 | writeTimeout = 20 * time.Second 24 | idleTimeout = 60 * time.Second 25 | ) 26 | 27 | var logger = logging.InitLogger(logging.LogLevelString, "") 28 | 29 | func main() { 30 | // set up signals so we handle the first shutdown signal gracefully 31 | ctx := signals.NewContext() 32 | 33 | stats := interceptor.CreateInterceptorStats() 34 | mux := http.NewServeMux() 35 | mux.Handle("/", interceptor.CreateInterceptorHandler(stats)) 36 | mux.HandleFunc("/ready", readinessHandler) 37 | interceptor.CreateAndRegisterMetricsCollector(stats) 38 | mux.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{Registry: metrics.Registry})) 39 | 40 | srv := &http.Server{ 41 | Addr: fmt.Sprintf(":%d", HTTPPort), 42 | BaseContext: func(listener net.Listener) context.Context { 43 | return ctx 44 | }, 45 | ReadTimeout: readTimeout, 46 | WriteTimeout: writeTimeout, 47 | IdleTimeout: idleTimeout, 48 | Handler: mux, 49 | } 50 | 51 | // Channel to listen for OS signals 52 | stop := make(chan os.Signal, 1) 53 | signal.Notify(stop, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT) 54 | 55 | // Run 
server in a goroutine 56 | go func() { 57 | logger.Infof("Listen and serve on port %d", HTTPPort) 58 | if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { 59 | logger.Fatalf("failed to start interceptors service: %v", err) 60 | } 61 | }() 62 | 63 | // Block until we receive a stop signal 64 | <-stop 65 | 66 | // Create a deadline to wait for. 67 | ctxShutDown, cancel := context.WithTimeout(context.Background(), 5*time.Second) 68 | defer cancel() 69 | 70 | // Attempt to gracefully shutdown the server 71 | if err := srv.Shutdown(ctxShutDown); err != nil { 72 | logger.Fatalf("server forced to shutdown: %v", err) 73 | } 74 | 75 | logger.Infof("Server exiting") 76 | } 77 | 78 | func readinessHandler(w http.ResponseWriter, r *http.Request) { 79 | w.WriteHeader(http.StatusOK) 80 | } 81 | -------------------------------------------------------------------------------- /interceptor/pkg/interceptor/metrics.go: -------------------------------------------------------------------------------- 1 | package interceptor 2 | 3 | import ( 4 | "strconv" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/metrics" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | ) 10 | 11 | const ( 12 | requestsCountMetricName = "cad_interceptor_requests_total" 13 | requestsCountMetricHelp = "Number of times CAD interceptor has been called (through a PagerDuty webhook, normally)" 14 | 15 | errorsCountMetricName = "cad_interceptor_errors_total" 16 | errorsCountMetricHelp = "Number of times CAD interceptor has been failed to process a request" 17 | ) 18 | 19 | var ( 20 | requestsCountMetricDesc = prometheus.NewDesc( 21 | requestsCountMetricName, 22 | requestsCountMetricHelp, 23 | nil, nil) 24 | 25 | errorsCountMetricDesc = prometheus.NewDesc( 26 | errorsCountMetricName, 27 | errorsCountMetricHelp, 28 | []string{"error_code", "reason"}, nil) 29 | ) 30 | 31 | type interceptorMetricsCollector struct { 32 | stats *InterceptorStats 33 | } 34 | 35 | func 
CreateAndRegisterMetricsCollector(stats *InterceptorStats) { 36 | metrics.Registry.MustRegister(&interceptorMetricsCollector{stats}) 37 | } 38 | 39 | func (c *interceptorMetricsCollector) Describe(ch chan<- *prometheus.Desc) { 40 | prometheus.DescribeByCollect(c, ch) 41 | } 42 | 43 | func (c *interceptorMetricsCollector) Collect(ch chan<- prometheus.Metric) { 44 | ch <- prometheus.MustNewConstMetric( 45 | requestsCountMetricDesc, 46 | prometheus.CounterValue, 47 | float64(c.stats.RequestsCount), 48 | ) 49 | 50 | for codeWithReason, errorsCount := range c.stats.CodeWithReasonToErrorsCount { 51 | ch <- prometheus.MustNewConstMetric( 52 | errorsCountMetricDesc, 53 | prometheus.CounterValue, 54 | float64(errorsCount), 55 | strconv.Itoa(codeWithReason.ErrorCode), 56 | codeWithReason.Reason, 57 | ) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /interceptor/test/e2e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Text colors for output 5 | GREEN='\033[0;32m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | # Load pd token from vault - needed by interceptor 10 | export VAULT_ADDR="https://vault.devshift.net" 11 | export VAULT_TOKEN="$(vault login -method=oidc -token-only)" 12 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/cad-testing | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 13 | unset VAULT_ADDR VAULT_TOKEN 14 | echo 15 | 16 | temp_log_file=$(mktemp) 17 | 18 | # Function to send an interceptor request and check the response 19 | function test_interceptor { 20 | 21 | local incident_id=$1 22 | local expected_response=$2 23 | local expected_metrics=$3 24 | local override_signature=$4 25 | 26 | # Run the interceptor and print logs to temporary log file 27 | export PD_SIGNATURE="test" 28 | CAD_PD_TOKEN=$(echo $pd_test_token) CAD_SILENT_POLICY=$(echo 
$pd_test_silence_policy) ./../bin/interceptor > $temp_log_file 2>&1 & 29 | PAYLOAD_BODY="{\\\"__pd_metadata\\\":{\\\"incident\\\":{\\\"id\\\":\\\"$incident_id\\\"}}}" 30 | PAYLOAD_BODY_FORMATTED='{"__pd_metadata":{"incident":{"id":"'$incident_id'"}}}' 31 | 32 | # Allow for test 3; override the signature after correct one has already been added to env 33 | if [[ "$override_signature" != "" ]]; then 34 | export PD_SIGNATURE=$override_signature 35 | fi 36 | 37 | SIGN=$(echo -n "$PAYLOAD_BODY_FORMATTED" | openssl dgst -sha256 -hmac $PD_SIGNATURE | sed 's/^.* //') 38 | 39 | # Store the PID of the interceptor process 40 | INTERCEPTOR_PID=$! 41 | 42 | # Wrap the webhook originating payload (this is the expected format of the payload sent to the interceptor) 43 | WRAPPED_PAYLOAD="{\"header\":{\"Content-Type\":[\"application/json\"],\"X-PagerDuty-Signature\":[\"v1=$SIGN\"]},\"body\":\"$PAYLOAD_BODY\"}" 44 | 45 | # Wait for 1 second to allow the interceptor to start up 46 | sleep 5 47 | 48 | 49 | # Send an interceptor request to localhost:8080 50 | # See https://pkg.go.dev/github.com/tektoncd/triggers/pkg/apis/triggers/v1alpha1#InterceptorRequest 51 | CURL_EXITCODE=0 52 | CURL_OUTPUT=$(curl -s -X POST -H "X-PagerDuty-Signature:v1=${SIGN}" -H "Content-Type: application/json" \ 53 | -d "$WRAPPED_PAYLOAD" \ 54 | http://localhost:8080) || CURL_EXITCODE=$? 
55 | 56 | local return_code=0 57 | 58 | # Check if the curl output differs from the expected response 59 | if [[ "$CURL_OUTPUT" != "$expected_response" ]] || [[ "$CURL_EXITCODE" != "0" ]]; then 60 | echo -e "${RED}Test failed for incident ID $incident_id: Unexpected response.${NC}" 61 | echo -e "${RED}Expected: $expected_response${NC}" 62 | echo -e "${RED}Got: $CURL_OUTPUT${NC}" 63 | echo -e "${RED}Exit code: $CURL_EXITCODE${NC}" 64 | echo -e "" 65 | echo -e "Interceptor logs" 66 | cat $temp_log_file 67 | return_code=1 68 | else 69 | curl_metrics_exitcode=0 70 | curl_metrics_output=$(curl -s http://localhost:8080/metrics | grep '^cad_interceptor_') || curl_metrics_exitcode=$? 71 | 72 | if [[ "$curl_metrics_output" != "$expected_metrics" ]] || [[ "$curl_metrics_exitcode" != "0" ]]; then 73 | echo -e "${RED}Test failed for incident ID $incident_id: Unexpected metrics.${NC}" 74 | echo -e "${RED}Expected: $expected_metrics${NC}" 75 | echo -e "${RED}Got: $curl_metrics_output${NC}" 76 | echo -e "${RED}Exit code: $curl_metrics_exitcode${NC}" 77 | echo -e "" 78 | echo -e "Interceptor logs" 79 | cat $temp_log_file 80 | return_code=1 81 | else 82 | echo -e "${GREEN}Test passed for incident ID $incident_id: Response and metrics are as expected.${NC}" 83 | fi 84 | fi 85 | 86 | # Shut down the interceptor 87 | kill $INTERCEPTOR_PID 88 | 89 | return $return_code 90 | } 91 | 92 | # Expected outputs 93 | # See https://github.com/tektoncd/triggers/blob/v0.27.0/pkg/apis/triggers/v1alpha1/interceptor_types.go#L134 94 | EXPECTED_RESPONSE_CONTINUE='{"continue":true,"status":{}}' 95 | EXPECTED_RESPONSE_STOP='{"continue":false,"status":{}}' 96 | EXPECTED_RESPONSE_SIGNATURE_ERROR='failed to verify signature: invalid webhook signature' 97 | 98 | echo "========= TESTS =============" 99 | # Test for a pre-existing alert we handle (ClusterProvisioningDelay) 100 | echo "Test 1: alert with existing handling returns a 'continue: true' response" 101 | test_interceptor "Q12WO44XJLR3H3" 
"$EXPECTED_RESPONSE_CONTINUE" "cad_interceptor_requests_total 1" 102 | 103 | # Test for an alert we don't handle (alert called unhandled) 104 | echo "Test 2: unhandled alerts returns a 'continue: false' response" 105 | test_interceptor "Q3722KGCG12ZWD" "$EXPECTED_RESPONSE_STOP" "cad_interceptor_requests_total 1" 106 | 107 | # Test for an alert with invalid signature 108 | echo "Test 3: expected failure due to invalid signature" 109 | PD_SIGNATURE="invalid-signature" 110 | test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_SIGNATURE_ERROR" 'cad_interceptor_errors_total{error_code="400",reason="failed to verify signature"} 1'$'\n''cad_interceptor_requests_total 1' "invalid-signature" 111 | -------------------------------------------------------------------------------- /openshift/PipelinePruning.md: -------------------------------------------------------------------------------- 1 | # PipelinePruning 2 | 3 | ![Pipeline Pruning](assets/cad_pipeline_pruning.drawio.png) 4 | 5 | ## Overview 6 | 7 | Prior, we have exploited the AppSRE pipeline pruning via importing their pipeline defaults in service/app-interface. 8 | This has been changed, because it had a few disadvantages. For instance, we have also imported resource constraints 9 | and other defaults that we do not want for our pipeline. 10 | 11 | Instead, we are now using our own PipelinePruner in form of a Cronjob. This Cronjob is defined in [template.yaml](template.yaml). 12 | The Cronjob creates a pod with the following command: `tkn pipelinerun delete --keep=20 -f`. 13 | This command will delete all PipelineRuns except for the last 20. 14 | 15 | For doing this, the cronjob needs permissions, these permissions are set in a role, defined in [template.yaml](template.yaml) as well. 16 | 17 | Note, that we have also defined a ResourceQuota that limits the PipelineRuns to a maximum number of 1000. This does not mean concurrent runs but all the runs that exist for that pipeline. 
18 | -------------------------------------------------------------------------------- /openshift/README.md: -------------------------------------------------------------------------------- 1 | # OpenShift Template 2 | 3 | This folder holds the template used by app-interface to deploy CAD resources on a target cluster. 4 | -------------------------------------------------------------------------------- /openshift/assets/cad_pipeline_pruning.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openshift/configuration-anomaly-detection/a59dd9ae83f9546f9cd373ffe3eb615885f4164b/openshift/assets/cad_pipeline_pruning.drawio.png -------------------------------------------------------------------------------- /openshift/gateway-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: configuration-anomaly-detection-gateway-template 5 | parameters: 6 | 7 | - name: IMAGE_TAG 8 | value: v0.7.0 9 | 10 | - name: REGISTRY_IMG 11 | value: quay.io/app-sre/aggregation-gateway 12 | 13 | - name: MEMORY_REQUEST 14 | description: Memory request for the API pods. 15 | value: "512Mi" 16 | 17 | - name: MEMORY_LIMIT 18 | description: Memory limit for the API pods. 19 | value: "1Gi" 20 | 21 | - name: CPU_REQUEST 22 | description: CPU request for the API pods. 23 | value: "200m" 24 | 25 | - name: CPU_LIMIT 26 | description: CPU limit for the API pods. 
27 | value: "1" 28 | 29 | objects: 30 | - kind: Service 31 | apiVersion: v1 32 | metadata: 33 | name: aggregation-pushgateway 34 | labels: 35 | app: configuration-anomaly-detection 36 | port: metrics 37 | annotations: 38 | description: Exposes and load balances the aggregation-pushgateway pods 39 | spec: 40 | selector: 41 | app: aggregation-pushgateway 42 | ports: 43 | - name: metrics 44 | port: 9091 45 | targetPort: 9091 46 | protocol: TCP 47 | - kind: Deployment 48 | apiVersion: apps/v1 49 | metadata: 50 | name: aggregation-pushgateway 51 | labels: 52 | app: aggregation-pushgateway 53 | spec: 54 | selector: 55 | matchLabels: 56 | app: aggregation-pushgateway 57 | replicas: 2 58 | strategy: 59 | rollingParams: 60 | intervalSeconds: 1 61 | maxSurge: 25% 62 | maxUnavailable: 25% 63 | timeoutSeconds: 600 64 | updatePeriodSeconds: 1 65 | type: Rolling 66 | template: 67 | metadata: 68 | labels: 69 | app: aggregation-pushgateway 70 | spec: 71 | serviceAccountName: pushgateway 72 | containers: 73 | - name: aggregation-pushgateway 74 | image: ${REGISTRY_IMG}:${IMAGE_TAG} 75 | securityContext: 76 | allowPrivilegeEscalation: false 77 | runAsNonRoot: true 78 | capabilities: 79 | drop: ["ALL"] 80 | seccompProfile: 81 | type: RuntimeDefault 82 | imagePullPolicy: IfNotPresent 83 | env: 84 | - name: PAG_APILISTEN 85 | value: :9091 86 | - name: PAG_LIFECYCLELISTEN 87 | value: :9092 88 | resources: 89 | requests: 90 | cpu: ${CPU_REQUEST} 91 | memory: ${MEMORY_REQUEST} 92 | limits: 93 | cpu: ${CPU_LIMIT} 94 | memory: ${MEMORY_LIMIT} 95 | ports: 96 | - name: metrics 97 | protocol: TCP 98 | containerPort: 9091 99 | - name: lifecycle 100 | protocol: TCP 101 | containerPort: 9092 102 | livenessProbe: 103 | httpGet: 104 | path: /healthy 105 | port: 9092 106 | scheme: HTTP 107 | initialDelaySeconds: 15 108 | periodSeconds: 5 109 | readinessProbe: 110 | httpGet: 111 | path: /ready 112 | port: 9092 113 | scheme: HTTP 114 | initialDelaySeconds: 20 115 | periodSeconds: 10 
-------------------------------------------------------------------------------- /pkg/ai/k8sgpt/k8sgpt.go: -------------------------------------------------------------------------------- 1 | package k8sgpt 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "strings" 9 | 10 | k8sgpt_ai "github.com/k8sgpt-ai/k8sgpt/pkg/ai" 11 | "github.com/k8sgpt-ai/k8sgpt/pkg/analysis" 12 | "github.com/k8sgpt-ai/k8sgpt/pkg/cache" 13 | gptK8sClient "github.com/k8sgpt-ai/k8sgpt/pkg/kubernetes" 14 | "k8s.io/apimachinery/pkg/runtime" 15 | "k8s.io/client-go/kubernetes" 16 | "k8s.io/client-go/rest" 17 | "sigs.k8s.io/controller-runtime/pkg/client" 18 | ) 19 | 20 | var model = "mistral-small-maas" 21 | 22 | func K8sGptAnalysis(k8sRestConfig *rest.Config) (string, error) { 23 | ctrlClient, err := client.New(k8sRestConfig, client.Options{Scheme: runtime.NewScheme()}) 24 | if err != nil { 25 | return "", errors.New("unable to init ctrlClient") 26 | } 27 | clientset := kubernetes.NewForConfigOrDie(k8sRestConfig) 28 | 29 | client := &gptK8sClient.Client{CtrlClient: ctrlClient, Config: k8sRestConfig, Client: clientset} 30 | 31 | aiToken := os.Getenv("CAD_HCM_AI_TOKEN") 32 | if aiToken == "" { 33 | return "", errors.New("could not find CAD_HCM_AI_TOKEN env") 34 | } 35 | 36 | aiClient := k8sgpt_ai.NewClient("openai") 37 | aiProvider := &k8sgpt_ai.AIProvider{ 38 | Name: "openai", 39 | Model: model, 40 | BaseURL: "https://mistral-small-maas-maas.apps.rosa.hcmaii01ue1.a9ro.p3.openshiftapps.com/v1", // TODO: Let's not hardcode this. 
41 | Password: aiToken, 42 | } 43 | 44 | if err = aiClient.Configure(aiProvider); err != nil { 45 | return "", fmt.Errorf("unable to configure ai provider: %w", err) 46 | } 47 | 48 | cache, err := cache.GetCacheConfiguration() 49 | if err != nil { 50 | return "", fmt.Errorf("unable to get k8sgpt cache configuration: %w", err) 51 | } 52 | cache.DisableCache() 53 | 54 | a := &analysis.Analysis{ 55 | Context: context.Background(), 56 | Filters: []string{"Pod", "Deployment", "ReplicaSet", "PersistentVolumeClaim", "Service", "Ingress", "StatefulSet", "CronJob", "Node", "ValidatingWebhookConfiguration", "MutatingWebhookConfiguration"}, 57 | Client: client, 58 | Language: "english", 59 | Namespace: "", 60 | LabelSelector: "", 61 | Cache: cache, 62 | Explain: true, 63 | MaxConcurrency: 10, 64 | WithDoc: false, 65 | WithStats: false, 66 | AIClient: aiClient, 67 | } 68 | 69 | a.RunAnalysis() 70 | 71 | var output string 72 | anonymize := false 73 | if err := a.GetAIResults(output, anonymize); err != nil { 74 | return "", fmt.Errorf("unable to get ai results: %w", err) 75 | } 76 | 77 | return formatOutput(a) 78 | } 79 | 80 | func formatOutput(a *analysis.Analysis) (string, error) { 81 | var output strings.Builder 82 | 83 | output.WriteString("🤖🔧 AI Analysis Results 🔧🤖\n") 84 | output.WriteString(fmt.Sprintf("Model: %s\n", model)) 85 | if len(a.Errors) != 0 { 86 | output.WriteString("⚠️ Analysis failures: \n") 87 | for _, aerror := range a.Errors { 88 | output.WriteString(fmt.Sprintf("- %s\n", aerror)) 89 | } 90 | } 91 | if len(a.Results) == 0 { 92 | output.WriteString("✅ No cluster problems detected\n") 93 | return output.String(), nil 94 | } 95 | output.WriteString(fmt.Sprintf("🔍 %d cluster issues detected\n", len(a.Results))) 96 | output.WriteString("================\n\n") 97 | 98 | for _, result := range a.Results { 99 | if result.Kind != "" { 100 | output.WriteString(fmt.Sprintf("Kind: %s\n", result.Kind)) 101 | } 102 | 103 | if result.Name != "" { 104 | 
output.WriteString(fmt.Sprintf("Name: %s\n", result.Name)) 105 | } 106 | 107 | if result.ParentObject != "" { 108 | output.WriteString(fmt.Sprintf("ParentObject: %s\n", result.ParentObject)) 109 | } 110 | 111 | if len(result.Error) > 0 { 112 | output.WriteString("Issues:\n") 113 | for _, err := range result.Error { 114 | output.WriteString(fmt.Sprintf("- %s\n", err.Text)) 115 | if err.KubernetesDoc != "" { 116 | output.WriteString(fmt.Sprintf(" Kubernetes Doc: %s\n", err.KubernetesDoc)) 117 | } 118 | } 119 | } 120 | 121 | if result.Details != "" { 122 | output.WriteString(fmt.Sprintf("Details: %s\n", result.Details)) 123 | } 124 | 125 | output.WriteString("\n------------------------------------------------------------\n\n") 126 | } 127 | 128 | return output.String(), nil 129 | } 130 | -------------------------------------------------------------------------------- /pkg/aws/aws_test.go: -------------------------------------------------------------------------------- 1 | // Package aws contains functions related to aws sdk 2 | package aws 3 | 4 | import ( 5 | "testing" 6 | 7 | awsv2 "github.com/aws/aws-sdk-go-v2/aws" 8 | ec2v2 "github.com/aws/aws-sdk-go-v2/service/ec2" 9 | ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" 10 | "go.uber.org/mock/gomock" 11 | 12 | awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock" 13 | ) 14 | 15 | func setupSubnetMock(t *testing.T, gatewayId *string, mapPublicIps bool) EC2API { 16 | t.Helper() 17 | ctrl := gomock.NewController(t) 18 | rtb := []ec2v2types.Route{ 19 | { 20 | DestinationCidrBlock: awsv2.String("0.0.0.0/0"), 21 | GatewayId: gatewayId, 22 | }, 23 | } 24 | ec2api := awsmock.NewMockEC2API(ctrl) 25 | ec2api.EXPECT().DescribeSubnets(gomock.Any(), gomock.Any()).Return(&ec2v2.DescribeSubnetsOutput{ 26 | Subnets: []ec2v2types.Subnet{ 27 | { 28 | MapPublicIpOnLaunch: awsv2.Bool(mapPublicIps), 29 | SubnetId: awsv2.String("subnet-1"), 30 | }, 31 | }, 32 | }, nil) 33 | 
ec2api.EXPECT().DescribeRouteTables(gomock.Any(), gomock.Any()).Return(&ec2v2.DescribeRouteTablesOutput{ 34 | RouteTables: []ec2v2types.RouteTable{ 35 | { 36 | Routes: rtb, 37 | }, 38 | }, 39 | }, nil) 40 | return ec2api 41 | } 42 | 43 | func TestSdkClient_IsSubnetPrivate(t *testing.T) { 44 | type fields struct { 45 | Region string 46 | StsClient StsAPI 47 | Ec2Client EC2API 48 | CloudTrailClient CloudTrailAPI 49 | BaseConfig awsv2.Config 50 | } 51 | type args struct { 52 | subnet string 53 | } 54 | tests := []struct { 55 | name string 56 | fields fields 57 | args args 58 | want bool 59 | wantErr bool 60 | }{ 61 | { 62 | name: "A subnet without a GatewayID is considered private", 63 | fields: fields{ 64 | Region: "us-east-1", 65 | StsClient: nil, 66 | Ec2Client: setupSubnetMock(t, nil, false), 67 | CloudTrailClient: nil, 68 | BaseConfig: awsv2.Config{}, 69 | }, 70 | args: args{ 71 | subnet: "subnet-1", 72 | }, 73 | want: true, 74 | wantErr: false, 75 | }, 76 | { 77 | name: "A subnet with an internet gateway ID is considered public", 78 | fields: fields{ 79 | Region: "us-east-1", 80 | StsClient: nil, 81 | Ec2Client: setupSubnetMock(t, awsv2.String("igw-1"), true), 82 | CloudTrailClient: nil, 83 | BaseConfig: awsv2.Config{}, 84 | }, 85 | args: args{ 86 | subnet: "subnet-1", 87 | }, 88 | want: false, 89 | wantErr: false, 90 | }, 91 | { 92 | name: "A subnet with an virtual private gateway ID is considered private", 93 | fields: fields{ 94 | Region: "us-east-1", 95 | StsClient: nil, 96 | Ec2Client: setupSubnetMock(t, awsv2.String("vgw-1"), false), 97 | CloudTrailClient: nil, 98 | BaseConfig: awsv2.Config{}, 99 | }, 100 | args: args{ 101 | subnet: "subnet-1", 102 | }, 103 | want: true, 104 | wantErr: false, 105 | }, 106 | } 107 | for _, tt := range tests { 108 | t.Run(tt.name, func(t *testing.T) { 109 | c := &SdkClient{ 110 | Region: tt.fields.Region, 111 | StsClient: tt.fields.StsClient, 112 | Ec2Client: tt.fields.Ec2Client, 113 | CloudtrailClient: 
tt.fields.CloudTrailClient, 114 | BaseConfig: &tt.fields.BaseConfig, 115 | } 116 | got, err := c.IsSubnetPrivate(tt.args.subnet) 117 | if (err != nil) != tt.wantErr { 118 | t.Errorf("SdkClient.IsSubnetPrivate() error = %v, wantErr %v", err, tt.wantErr) 119 | return 120 | } 121 | if got != tt.want { 122 | t.Errorf("SdkClient.IsSubnetPrivate() = %v, want %v", got, tt.want) 123 | } 124 | }) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /pkg/investigations/aitest/README.md: -------------------------------------------------------------------------------- 1 | # aitest Investigation 2 | 3 | Test investigation to run k8sgpt 4 | 5 | ## Testing 6 | 7 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 8 | 9 | -------------------------------------------------------------------------------- /pkg/investigations/aitest/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: aitest 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: ["get", "watch", "list"] 6 | apiGroups: ["*"] 7 | resources: ["*"] 8 | customerDataAccess: true 9 | -------------------------------------------------------------------------------- /pkg/investigations/aitest/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing aitest Investigation 2 | 3 | TODO: 4 | - Add a test script or test objects to this directory for future maintainers to use 5 | - Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. 
Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc) 6 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/README.md: -------------------------------------------------------------------------------- 1 | # apierrorbudgetburn Investigation 2 | 3 | POC Api-ErrorBudgetBurn investigation using k8sgpt. 4 | 5 | ## Testing 6 | 7 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 8 | 9 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/apierrorbudgetburn.go: -------------------------------------------------------------------------------- 1 | // Package apierrorbudgetburn contains the investigation for api-ErrorBudgetBurn alerts 2 | package apierrorbudgetburn 3 | 4 | import ( 5 | "errors" 6 | "fmt" 7 | "strings" 8 | 9 | "github.com/openshift/configuration-anomaly-detection/pkg/ai/k8sgpt" 10 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 11 | k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" 12 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 13 | ) 14 | 15 | type Investigation struct{} 16 | 17 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 18 | result := investigation.InvestigationResult{} 19 | 20 | k8sConfig, err := k8sclient.NewCfg(r.Cluster.ID(), r.OcmClient, r.Name) 21 | if err != nil { 22 | if errors.Is(err, k8sclient.ErrAPIServerUnavailable) { 23 | return result, r.PdClient.EscalateIncidentWithNote("CAD was unable to access cluster's kube-api. 
Please investigate manually.") 24 | } 25 | 26 | return result, fmt.Errorf("unable to initialize k8s cli config: %w", err) 27 | } 28 | defer func() { 29 | deferErr := k8sConfig.Clean() 30 | if deferErr != nil { 31 | logging.Error(deferErr) 32 | err = errors.Join(err, deferErr) 33 | } 34 | }() 35 | 36 | analysis, err := k8sgpt.K8sGptAnalysis(&k8sConfig.Config) 37 | if err != nil { 38 | return result, fmt.Errorf("failed to run K8sGptAnalysis: %w", err) 39 | } 40 | 41 | return result, r.PdClient.EscalateIncidentWithNote(analysis) 42 | } 43 | 44 | func (c *Investigation) Name() string { 45 | return "apierrorbudgetburn" 46 | } 47 | 48 | func (c *Investigation) Description() string { 49 | return "POC Api-ErrorBudgetBurn investigation using k8sgpt." 50 | } 51 | 52 | func (c *Investigation) ShouldInvestigateAlert(alert string) bool { 53 | return strings.Contains(alert, "api-ErrorBudgetBurn") 54 | } 55 | 56 | func (c *Investigation) IsExperimental() bool { 57 | // This is an experimental investigation leveraging k8sgpt. 58 | return true 59 | } 60 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: apierrorbudgetburn 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: ["get", "watch", "list"] 6 | apiGroups: ["*"] 7 | resources: ["*"] 8 | customerDataAccess: true 9 | -------------------------------------------------------------------------------- /pkg/investigations/apierrorbudgetburn/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing apierrorbudgetburn Investigation 2 | 3 | TODO: 4 | - Add a test script or test objects to this directory for future maintainers to use 5 | - Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. 
Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc) 6 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/README.md: -------------------------------------------------------------------------------- 1 | # cannotretrieveupdatessre Investigation 2 | 3 | Investigates the CannotRetrieveUpdatesSRE alert by running network verifier and updating the notes with investigation details to the PagerDuty alert about the cluster version status. 4 | 5 | ## Investigation Logic 6 | 7 | The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks: 8 | 1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints. 9 | 2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`. 
10 | 11 | ## Testing 12 | 13 | Refer to the [testing README](./testing/README.md) for instructions on testing this investigation 14 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/cannotretrieveupdatessre.go: -------------------------------------------------------------------------------- 1 | package cannotretrieveupdatessre 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "strings" 8 | 9 | configv1 "github.com/openshift/api/config/v1" 10 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 11 | k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" 12 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 13 | "github.com/openshift/configuration-anomaly-detection/pkg/networkverifier" 14 | "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" 15 | "sigs.k8s.io/controller-runtime/pkg/client" 16 | ) 17 | 18 | const ( 19 | alertname = "CannotRetrieveUpdatesSRE" 20 | remediationName = "CannotRetrieveUpdatesSRE" 21 | ) 22 | 23 | type Investigation struct{} 24 | 25 | // Run executes the investigation for the CannotRetrieveUpdatesSRE alert 26 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 27 | result := investigation.InvestigationResult{} 28 | notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger) 29 | k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) 30 | if err != nil { 31 | return result, fmt.Errorf("unable to initialize k8s cli: %w", err) 32 | } 33 | defer func() { 34 | deferErr := k8scli.Clean() 35 | if deferErr != nil { 36 | logging.Error(deferErr) 37 | err = errors.Join(err, deferErr) 38 | } 39 | }() 40 | 41 | // Run network verifier 42 | verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient) 43 | if err != nil { 44 | 
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error()) 45 | } else { 46 | switch verifierResult { 47 | case networkverifier.Failure: 48 | result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil} 49 | notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason) 50 | case networkverifier.Success: 51 | notes.AppendSuccess("Network verifier passed") 52 | } 53 | } 54 | 55 | // Check ClusterVersion 56 | clusterVersion, err := getClusterVersion(k8scli) 57 | if err != nil { 58 | notes.AppendWarning("Failed to get ClusterVersion: %s", err.Error()) 59 | } else { 60 | notes.AppendSuccess("ClusterVersion found: %s", clusterVersion.Status.Desired.Version) 61 | 62 | failureReason := getUpdateRetrievalFailures(clusterVersion) 63 | if failureReason != "" { 64 | logging.Warnf("Detected ClusterVersion issue: %s", failureReason) 65 | notes.AppendWarning("ClusterVersion related issue detected: %s. 
Current version %s not found in channel %s", 66 | failureReason, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel) 67 | } 68 | } 69 | notes.AppendWarning("Alert escalated to on-call primary for review and please check the ClusterVersion.") 70 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 71 | } 72 | 73 | func getClusterVersion(k8scli client.Client) (*configv1.ClusterVersion, error) { 74 | clusterVersion := &configv1.ClusterVersion{} 75 | err := k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion) 76 | if err != nil { 77 | return nil, fmt.Errorf("failed to get ClusterVersion: %w", err) 78 | } 79 | return clusterVersion, nil 80 | } 81 | 82 | // getUpdateRetrievalFailures checks for update retrieval failures in the ClusterVersion 83 | func getUpdateRetrievalFailures(clusterVersion *configv1.ClusterVersion) string { 84 | for _, condition := range clusterVersion.Status.Conditions { 85 | msg, found := checkCondition(condition) 86 | if found { 87 | return msg 88 | } 89 | } 90 | return "" 91 | } 92 | 93 | func checkCondition(condition configv1.ClusterOperatorStatusCondition) (string, bool) { 94 | if condition.Type != "RetrievedUpdates" { 95 | return "", false 96 | } 97 | if condition.Status == configv1.ConditionFalse { 98 | return fmt.Sprintf("(Reason: %s). 
%s", condition.Reason, condition.Message), true 99 | } 100 | return "", false 101 | } 102 | 103 | func (i *Investigation) Name() string { 104 | return alertname 105 | } 106 | 107 | func (i *Investigation) Description() string { 108 | return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname) 109 | } 110 | 111 | func (i *Investigation) ShouldInvestigateAlert(alert string) bool { 112 | return strings.Contains(alert, alertname) 113 | } 114 | 115 | func (i *Investigation) IsExperimental() bool { 116 | return true 117 | } 118 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: cannotretrieveupdatessre 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusterversions 12 | customerDataAccess: false 13 | -------------------------------------------------------------------------------- /pkg/investigations/cannotretrieveupdatessre/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing CannotRetrieveUpdatesSRE Investigation 2 | 3 | ### Update the ClusterVersion Channel 4 | - Below script helps to set the test channel to check the clusterversion change. 
5 | ```sh 6 | #!/bin/bash 7 | 8 | # Use test channel for the ClusterVersion 9 | oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18-test"}}' --as backplane-cluster-admin 10 | sleep 30 11 | 12 | # Verify 13 | oc get clusterversion version -o jsonpath='{.spec.channel}' | grep "stable-4.18-test" || { echo "Failed to set the channel"; exit 1; } 14 | 15 | # Optional: Revert back to the original change 16 | #oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18"}}' --as backplane-cluster-admin 17 | ``` 18 | -------------------------------------------------------------------------------- /pkg/investigations/ccam/ccam_test.go: -------------------------------------------------------------------------------- 1 | package ccam 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | 7 | investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 8 | ) 9 | 10 | func TestEvaluateRandomError(t *testing.T) { 11 | timeoutError := errors.New("credentials are there, error is different: timeout") 12 | input := investigation.Resources{ 13 | Cluster: nil, 14 | ClusterDeployment: nil, 15 | AwsClient: nil, 16 | OcmClient: nil, 17 | PdClient: nil, 18 | AdditionalResources: map[string]interface{}{ 19 | "error": errors.New("timeout"), 20 | }, 21 | } 22 | 23 | inv := Investigation{} 24 | 25 | _, err := inv.Run(&input) 26 | if err.Error() != timeoutError.Error() { 27 | t.Fatalf("Expected error %v, but got %v", timeoutError, err) 28 | } 29 | } 30 | 31 | func TestCustomerRemovedPermissions(t *testing.T) { 32 | tests := []struct { 33 | name string 34 | errorMessage string 35 | expectedMatch bool 36 | }{ 37 | { 38 | name: "Matching error 1", 39 | errorMessage: "unable to query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster 28testqvq0jpo1hsrch6gvbc0123test: failed to get STS Support Jump Role for cluster 
28testqvq0jpo1hsrch6gvbc0qgqtest, status is 404, identifier is '404', code is 'CLUSTERS-MGMT-404' and operation identifier is 'teste1d1-3844-46f7-82d4-643c5aeeca53': Failed to find trusted relationship to support role 'RH-Technical-Support-Access'", 40 | expectedMatch: true, 41 | }, 42 | { 43 | name: "Matching error 2", 44 | errorMessage: "unable to query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster test9tm92uu49s29plim5dn1sbc1test: failed to get STS Support Jump Role for cluster test9tm92uu49s29plim5dn1sbc1test, status is 404, identifier is '404', code is 'CLUSTERS-MGMT-404' and operation identifier is 'testf5f3-6591-452f-98cb-3943edf4test': Support role, used with cluster 'test9tm92uu49s29plim5dn1sbc1test', does not exist in the customer's AWS account", 45 | expectedMatch: true, 46 | }, 47 | { 48 | name: "Matching error 3", 49 | errorMessage: "something could not assume support role in customer's account: AccessDenied: something", 50 | expectedMatch: true, 51 | }, 52 | { 53 | name: "Matching error 4", 54 | errorMessage: "unable to query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster : failed to get STS Support Jump Role for cluster , status is 400, identifier is '400', code is 'CLUSTERS-MGMT-400' and operation identifier is '': Please make sure IAM role 'arn:aws:iam:::role/ManagedOpenShift-Installer-Role' exists, and add 'arn:aws:iam:::role/RH-Managed-OpenShift-Installer' to the trust policy on IAM role 'arn:aws:iam:::role/ManagedOpenShift-Installer-Role': Failed to assume role: User: arn:aws:sts:::assumed-role/RH-Managed-OpenShift-Installer/OCM is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam:::role/ManagedOpenShift-Installer-Role", 55 | expectedMatch: true, 56 | }, 57 | { 58 | name: "Matching error 5", 59 | errorMessage: "unable to 
query aws credentials from backplane: failed to determine if cluster is using isolated backlpane access: failed to get sts support jump role ARN for cluster : failed to get STS Support Jump Role for cluster , status is 400, identifier is '400', code is 'CLUSTERS-MGMT-400' and operation identifier is '': Failed to get role: User: arn:aws:sts:::assumed-role/ManagedOpenShift-Installer-Role/OCM is not authorized to perform: iam:GetRole on resource: role ManagedOpenShift-Support-Role because no identity-based policy allows the iam:GetRole action", 60 | expectedMatch: true, 61 | }, 62 | { 63 | name: "Non-matching error", 64 | errorMessage: "Some timeout error", 65 | expectedMatch: false, 66 | }, 67 | } 68 | 69 | for _, tt := range tests { 70 | t.Run(tt.name, func(t *testing.T) { 71 | match := customerRemovedPermissions(tt.errorMessage) 72 | if match != tt.expectedMatch { 73 | t.Errorf("customerRemovedPermissions() = %v, expectedMatch %v", match, tt.expectedMatch) 74 | } 75 | }) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/README.md: -------------------------------------------------------------------------------- 1 | # ClusterHasGoneMissing Investigation 2 | 3 | ## Alert firing investigation 4 | 5 | 1. PagerDuty webhook receives CHGM alert from Dead Man's Snitch. 6 | 2. CAD Tekton pipeline is triggered via PagerDuty sending a webhook to Tekton EventListener. 7 | 3. Logs into AWS account of cluster and checks for stopped/terminated instances. 8 | - If unable to access AWS account, posts "cluster credentials are missing" limited support reason. 9 | 4. If stopped/terminated instances are found, pulls AWS CloudTrail events for those instances. 10 | - If no stopped/terminated instances are found, escalates to SRE for further investigation. 11 | 5. If the user of the event is: 12 | - Authorized (SRE or OSD managed), runs the network verifier and escalates the alert to SRE for futher investigation. 
13 | - **Note:** Authorized users have prefix RH-SRE, osdManagedAdmin, or have the ManagedOpenShift-Installer-Role. 14 | - Not authorized (not SRE or OSD managed), posts the appropriate limited support reason and silences the alert. 15 | 6. Adds notes with investigation details to the PagerDuty alert. 16 | 17 | ## CHGM investigation overview 18 | 19 | ![CHGM investigation overview](./images/cad_chgm_investigation/chgm_investigation_dark.png#gh-dark-mode-only) 20 | ![CHGM investigation overview](./images/cad_chgm_investigation/chgm_investigation_light.png#gh-light-mode-only) 21 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/chgm_hibernation_check.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "sort" 5 | "time" 6 | 7 | cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 8 | servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" 9 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 10 | ) 11 | 12 | const recentWakeupTime = 2 * time.Hour 13 | 14 | // 30 Days is always a problem as kubelet certificates will be expired 15 | const hibernationTooLong = 30 * 24 * time.Hour 16 | 17 | const ( 18 | hibernationStartEvent = "cluster_state_hibernating" 19 | hibernationEndEvent = "cluster_state_ready" 20 | ) 21 | 22 | // const hibernationOngoingEvent = "cluster_state_hibernating" 23 | // const hibernationResumeEvent = "cluster_state_resuming" 24 | 25 | type hibernationPeriod struct { 26 | HibernationDuration time.Duration 27 | DehibernationTime time.Time 28 | } 29 | 30 | func hibernatedTooLong(hibernations []*hibernationPeriod, now time.Time) bool { 31 | if len(hibernations) == 0 { 32 | return false 33 | } 34 | latestHibernation := hibernations[len(hibernations)-1] 35 | // The cluster was woken up within the RECENT_WAKEUP_TIME which might 36 | // indicate a CSR problem. 
37 | if now.Sub(latestHibernation.DehibernationTime) >= recentWakeupTime { 38 | return false 39 | } 40 | // Only clusters that have hibernated for a long time are susceptible to 41 | // have cert issues. 42 | if latestHibernation.HibernationDuration >= hibernationTooLong { 43 | return true 44 | } 45 | return false 46 | } 47 | 48 | func getHibernationStatusForCluster(ocmClient ocm.Client, cluster *cmv1.Cluster) ([]*hibernationPeriod, error) { 49 | filter := "log_type='cluster-state-updates'" 50 | clusterStateUpdates, err := ocmClient.GetServiceLog(cluster, filter) 51 | if err != nil { 52 | return nil, err 53 | } 54 | return createHibernationTimeLine(clusterStateUpdates.Items().Slice()), nil 55 | } 56 | 57 | func createHibernationTimeLine(clusterStateUpdates []*servicelogsv1.LogEntry) []*hibernationPeriod { 58 | var hibernations []*hibernationPeriod 59 | 60 | var hibernationStartTime time.Time 61 | var hibernationEndTime time.Time 62 | sort.SliceStable(clusterStateUpdates, func(i, j int) bool { 63 | return clusterStateUpdates[i].Timestamp().Before(clusterStateUpdates[j].Timestamp()) 64 | }) 65 | for _, stateUpdate := range clusterStateUpdates { 66 | event := stateUpdate.Summary() 67 | date := stateUpdate.Timestamp() 68 | if event == hibernationStartEvent { 69 | hibernationStartTime = date 70 | } 71 | if event == hibernationEndEvent { 72 | if (time.Time.Equal(hibernationStartTime, time.Time{})) { 73 | // Cluster became ready after installation 74 | continue 75 | } 76 | hibernationEndTime = date 77 | hibernation := &hibernationPeriod{ 78 | DehibernationTime: hibernationEndTime, 79 | HibernationDuration: hibernationEndTime.Sub(hibernationStartTime), 80 | } 81 | hibernations = append(hibernations, hibernation) 82 | } 83 | } 84 | // Would be an ongoing hibernation 85 | // if (hibernationStartTime != time.Time{} && hibernationEndTime == time.Time{}) { 86 | // hibernations = append(hibernations, &HibernationPeriod{}) 87 | // } 88 | return hibernations 89 | } 90 | 
-------------------------------------------------------------------------------- /pkg/investigations/chgm/chgm_hibernation_check_test.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | "time" 7 | 8 | servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" 9 | ) 10 | 11 | func TestCreateHibernationTimeLine(t *testing.T) { 12 | type args struct { 13 | clusterStateUpdates []*servicelogsv1.LogEntry 14 | } 15 | hibernationStartTime := time.Date(2023, 0o1, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local) 16 | hibernationStopTime := time.Date(2023, 0o2, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local) 17 | hibernationStart, _ := servicelogsv1.NewLogEntry().Timestamp(hibernationStartTime).Summary(hibernationStartEvent).Build() 18 | hibernationEnd, _ := servicelogsv1.NewLogEntry().Timestamp(hibernationStopTime).Summary(hibernationEndEvent).Build() 19 | var emptyHibernationSlice []*hibernationPeriod 20 | tests := []struct { 21 | name string 22 | args args 23 | want []*hibernationPeriod 24 | }{ 25 | { 26 | name: "Hibernation with start and end", 27 | args: args{ 28 | clusterStateUpdates: []*servicelogsv1.LogEntry{ 29 | hibernationStart, 30 | hibernationEnd, 31 | }, 32 | }, 33 | want: []*hibernationPeriod{ 34 | { 35 | HibernationDuration: hibernationStopTime.Sub(hibernationStartTime), 36 | DehibernationTime: hibernationStopTime, 37 | }, 38 | }, 39 | }, 40 | { 41 | name: "Hibernation without end is not part of the return", 42 | args: args{ 43 | clusterStateUpdates: []*servicelogsv1.LogEntry{ 44 | hibernationStart, 45 | }, 46 | }, 47 | want: emptyHibernationSlice, 48 | }, 49 | { 50 | name: "Hibernation without start is not part of the return", 51 | args: args{ 52 | clusterStateUpdates: []*servicelogsv1.LogEntry{ 53 | hibernationEnd, 54 | }, 55 | }, 56 | want: emptyHibernationSlice, 57 | }, 58 | } 59 | for _, tt := range tests { 60 | t.Run(tt.name, func(t *testing.T) { 61 | if got := 
createHibernationTimeLine(tt.args.clusterStateUpdates); !reflect.DeepEqual(got, tt.want) { 62 | t.Errorf("CreateHibernationTimeLine() = %v, want %v", got, tt.want) 63 | } 64 | }) 65 | } 66 | } 67 | 68 | func TestHibernatedTooLong(t *testing.T) { 69 | type args struct { 70 | hibernations []*hibernationPeriod 71 | now time.Time 72 | } 73 | hibernationStartTime := time.Date(2023, 0o1, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local) 74 | hibernationShortStopTime := time.Date(2023, 0o1, 11, 0o0, 0o0, 0o0, 0o0, time.Local) 75 | hibernationLongStopTime := time.Date(2023, 0o2, 11, 0o0, 0o0, 0o0, 0o0, time.Local) 76 | shortHibernation := &hibernationPeriod{ 77 | HibernationDuration: hibernationShortStopTime.Sub(hibernationStartTime), 78 | DehibernationTime: hibernationShortStopTime, 79 | } 80 | longHibernation := &hibernationPeriod{ 81 | HibernationDuration: hibernationLongStopTime.Sub(hibernationStartTime), 82 | DehibernationTime: hibernationLongStopTime, 83 | } 84 | tests := []struct { 85 | name string 86 | args args 87 | want bool 88 | }{ 89 | // TODO: Add test cases. 
90 | { 91 | name: "Cluster that hibernated for 10 days is ok", 92 | args: args{ 93 | hibernations: []*hibernationPeriod{shortHibernation}, 94 | now: hibernationShortStopTime.Add(1 * time.Hour), 95 | }, 96 | want: false, 97 | }, 98 | { 99 | name: "Cluster that hibernated for 30+ days is too long", 100 | args: args{ 101 | hibernations: []*hibernationPeriod{longHibernation}, 102 | now: hibernationLongStopTime.Add(1 * time.Hour), 103 | }, 104 | want: true, 105 | }, 106 | { 107 | name: "Cluster that never hibernated is ok", 108 | args: args{}, 109 | want: false, 110 | }, 111 | { 112 | name: "Cluster that woke up for 2+ hours ago ok", 113 | args: args{ 114 | hibernations: []*hibernationPeriod{longHibernation}, 115 | now: hibernationLongStopTime.Add(3 * time.Hour), 116 | }, 117 | want: false, 118 | }, 119 | } 120 | for _, tt := range tests { 121 | t.Run(tt.name, func(t *testing.T) { 122 | got := hibernatedTooLong(tt.args.hibernations, tt.args.now) 123 | if got != tt.want { 124 | t.Errorf("HibernatedTooLong() = %v, want %v", got, tt.want) 125 | } 126 | }) 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/chgm_suite_test.go: -------------------------------------------------------------------------------- 1 | package chgm_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestChgm(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Chgm Suite") 13 | } 14 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/util.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 7 | ) 8 | 9 | func createEgressSL(blockedUrls string) *ocm.ServiceLog { 10 | description := fmt.Sprintf("Your cluster requires you to take action. 
SRE has observed that there have been changes made to the network configuration which impacts normal working of the cluster, including lack of network egress to these internet-based resources which are required for the cluster operation and support: %s. Please revert changes, and refer to documentation regarding firewall requirements for PrivateLink clusters: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/prepare_your_environment/rosa-sts-aws-prereqs#osd-aws-privatelink-firewall-prerequisites_rosa-sts-aws-prereqs#.", blockedUrls) 11 | 12 | egressSL := ocm.ServiceLog{ 13 | Severity: "Critical", 14 | Summary: "Action required: Network misconfiguration", 15 | ServiceName: "SREManualAction", 16 | Description: description, 17 | InternalOnly: false, 18 | } 19 | 20 | return &egressSL 21 | } 22 | -------------------------------------------------------------------------------- /pkg/investigations/chgm/util_test.go: -------------------------------------------------------------------------------- 1 | package chgm 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 8 | "gotest.tools/v3/assert" 9 | ) 10 | 11 | // Mock data 12 | var blockedUrls = "example.com, test.com" 13 | 14 | // TestCreateEgressSL tests the createEgressSL function 15 | func TestCreateEgressSL(t *testing.T) { 16 | expectedDescription := fmt.Sprintf( 17 | "Your cluster requires you to take action. SRE has observed that there have been changes made to the network configuration which impacts normal working of the cluster, including lack of network egress to these internet-based resources which are required for the cluster operation and support: %s. 
Please revert changes, and refer to documentation regarding firewall requirements for PrivateLink clusters: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/prepare_your_environment/rosa-sts-aws-prereqs#osd-aws-privatelink-firewall-prerequisites_rosa-sts-aws-prereqs#.", 18 | blockedUrls, 19 | ) 20 | 21 | expected := &ocm.ServiceLog{ 22 | Severity: "Critical", 23 | Summary: "Action required: Network misconfiguration", 24 | ServiceName: "SREManualAction", 25 | Description: expectedDescription, 26 | InternalOnly: false, 27 | } 28 | 29 | result := createEgressSL(blockedUrls) 30 | assert.Equal(t, *expected, *result) 31 | } 32 | -------------------------------------------------------------------------------- /pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn_test.go: -------------------------------------------------------------------------------- 1 | package clustermonitoringerrorbudgetburn 2 | 3 | import ( 4 | "testing" 5 | 6 | configv1 "github.com/openshift/api/config/v1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | var ( 11 | statusConditionAvailable = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "True"} 12 | statusConditionUpgradeable = configv1.ClusterOperatorStatusCondition{Type: "Upgradeable", Status: "True"} 13 | statusConditionUnavailableSymptomsMatch = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "False", Message: `the User Workload Configuration from "config.yaml" key in the "openshift-user-workload-monitoring/user-workload-monitoring-config" ConfigMap could not be parsed`} 14 | ) 15 | 16 | func TestSymptomMatches(t *testing.T) { 17 | monitoringCo := configv1.ClusterOperator{ 18 | ObjectMeta: v1.ObjectMeta{Name: "monitoring"}, 19 | Status: configv1.ClusterOperatorStatus{ 20 | Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionUnavailableSymptomsMatch, statusConditionUpgradeable}, 21 | }, 22 | } 23 | if 
!isUWMConfigInvalid(&monitoringCo) { 24 | t.Fatal("expected symptoms to match") 25 | } 26 | } 27 | 28 | func TestSymptomNoMatch(t *testing.T) { 29 | monitoringCo := configv1.ClusterOperator{ 30 | ObjectMeta: v1.ObjectMeta{Name: "monitoring"}, 31 | Status: configv1.ClusterOperatorStatus{ 32 | Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionAvailable, statusConditionUpgradeable}, 33 | }, 34 | } 35 | if isUWMConfigInvalid(&monitoringCo) { 36 | t.Fatal("expected symptoms to not match") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pkg/investigations/clustermonitoringerrorbudgetburn/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: clustermonitoringerrorbudgetburn 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusteroperators 12 | customerDataAccess: false 13 | -------------------------------------------------------------------------------- /pkg/investigations/insightsoperatordown/insightsoperatordown_test.go: -------------------------------------------------------------------------------- 1 | package insightsoperatordown 2 | 3 | import ( 4 | "testing" 5 | 6 | configv1 "github.com/openshift/api/config/v1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | func TestIsOCPBUG22226(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | co configv1.ClusterOperator 14 | expected bool 15 | }{ 16 | { 17 | name: "SCA certs pull failure detected", 18 | co: configv1.ClusterOperator{ 19 | ObjectMeta: v1.ObjectMeta{Name: "insights"}, 20 | Status: configv1.ClusterOperatorStatus{ 21 | Conditions: []configv1.ClusterOperatorStatusCondition{ 22 | {Type: "SCAAvailable", Message: "Failed to pull SCA certs"}, 23 | }, 24 | }, 25 | }, 26 | expected: true, 27 | }, 28 | { 29 | name: "No SCA certs pull failure", 30 | co: configv1.ClusterOperator{ 
31 | ObjectMeta: v1.ObjectMeta{Name: "insights"}, 32 | Status: configv1.ClusterOperatorStatus{ 33 | Conditions: []configv1.ClusterOperatorStatusCondition{ 34 | {Type: "SCAAvailable", Message: "All systems operational"}, 35 | }, 36 | }, 37 | }, 38 | expected: false, 39 | }, 40 | } 41 | 42 | for _, tt := range tests { 43 | t.Run(tt.name, func(t *testing.T) { 44 | if isOCPBUG22226(&tt.co) != tt.expected { 45 | t.Fatalf("expected %v, got %v", tt.expected, !tt.expected) 46 | } 47 | }) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pkg/investigations/insightsoperatordown/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: insightsoperatordown 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusteroperators 12 | customerDataAccess: false 13 | -------------------------------------------------------------------------------- /pkg/investigations/insightsoperatordown/testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing InsightsOperatorDownSRE 2 | 3 | # OCPBUGS-22226 4 | 5 | We can induce the symptom of `Failed to pull SCA certs` on a stage cluster by blocking `https://api.stage.openshift.com` 6 | The provided script creates a Rule Group and associates it with your clusters VPC. 
#!/bin/bash
# Block https://api.stage.openshift.com for a cluster's VPC using a Route 53
# Resolver DNS Firewall rule, to reproduce the insights operator
# "Failed to pull SCA certs" symptom (OCPBUGS-22226).
#
# Usage: ./block-api-openshift.sh <cluster-id>
# Requires: awscli, jq, ocm backplane
set -eo pipefail
set -x

if [ $# -ne 1 ]; then
	echo "usage: $0 <cluster-id>" >&2
	exit 1
fi
CLUSTER_ID="$1"

# Must be exported so the aws subprocesses actually see it (a plain
# assignment would only affect this shell).
export AWS_PAGER=""

# The credentials command prints "export KEY=VALUE" lines; they must be
# eval'd, not executed via bare command substitution, to take effect here.
eval "$(ocm backplane cloud credentials -o env "$CLUSTER_ID")"
export AWS_REGION=$(ocm describe cluster "$CLUSTER_ID" --json | jq -r '.region.id')

# Create a DNS firewall rule group with a single BLOCK rule for the domain.
FW_RULE_GROUP_ID=$(aws route53resolver create-firewall-rule-group --name "api stage openshift com" | jq -r '.FirewallRuleGroup.Id')
FW_DOMAIN_LIST_ID=$(aws route53resolver create-firewall-domain-list --name "api stage openshift com" | jq -r '.FirewallDomainList.Id')
aws route53resolver update-firewall-domains --firewall-domain-list-id "$FW_DOMAIN_LIST_ID" --domains "api.stage.openshift.com" --operation "ADD"
aws route53resolver create-firewall-rule --firewall-rule-group-id "$FW_RULE_GROUP_ID" --firewall-domain-list-id "$FW_DOMAIN_LIST_ID" --priority "1" --action "BLOCK" --block-response "NODATA" --name "api stage openshift com"

# Associate the rule group with the cluster's VPC, found via the infra-id tag.
INFRA_ID=$(ocm describe cluster "$CLUSTER_ID" --json | jq -r '.infra_id')
VPC_ID=$(aws ec2 describe-vpcs --filters "Name=tag-key,Values=kubernetes.io/cluster/$INFRA_ID" | jq -r '.Vpcs[0].VpcId')
aws route53resolver associate-firewall-rule-group --firewall-rule-group-id "$FW_RULE_GROUP_ID" --name "rgassoc-$VPC_ID-$FW_RULE_GROUP_ID" --priority "1001" --vpc-id "$VPC_ID"
// Package investigation contains base functions for investigations
package investigation

import (
	cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
	"github.com/openshift/configuration-anomaly-detection/pkg/aws"
	"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
	"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
	"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
	hivev1 "github.com/openshift/hive/apis/hive/v1"
)

// InvestigationStep records the outcome of a single remediation step taken
// during an investigation.
type InvestigationStep struct {
	// Performed reports whether the step was carried out.
	Performed bool
	// Labels carries free-form tags describing the step (may be nil).
	Labels []string
}

// InvestigationResult summarizes which remediation actions an investigation
// performed or prepared.
type InvestigationResult struct {
	LimitedSupportSet  InvestigationStep
	ServiceLogPrepared InvestigationStep
	ServiceLogSent     InvestigationStep
}

// Investigation is the interface every alert investigation implements.
type Investigation interface {
	// Run executes the investigation against the given resources.
	Run(resources *Resources) (InvestigationResult, error)
	// Please note that when adding an investigation the name and the directory currently need to be the same,
	// so that backplane-api can fetch the metadata.yaml
	Name() string
	Description() string
	// IsExperimental reports whether this investigation should only run when
	// experimental investigations are enabled.
	IsExperimental() bool
	// ShouldInvestigateAlert reports whether this investigation applies to
	// the given alert title.
	ShouldInvestigateAlert(string) bool
}

// Resources holds all resources/tools required for alert investigations
type Resources struct {
	Name              string
	Cluster           *cmv1.Cluster
	ClusterDeployment *hivev1.ClusterDeployment
	AwsClient         aws.Client
	OcmClient         ocm.Client
	PdClient          pagerduty.Client
	// Notes accumulates investigation findings for the pagerduty incident.
	Notes *notewriter.NoteWriter
	// AdditionalResources carries investigation-specific extras — keys and
	// value types vary per investigation; TODO confirm expected contents
	// against each caller.
	AdditionalResources map[string]interface{}
}
-------------------------------------------------------------------------------- 1 | name: MachineHealthCheckUnterminatedShortCircuitSRE 2 | rbac: 3 | roles: 4 | - namespace: "openshift-machine-api" 5 | rules: 6 | - verbs: 7 | - "get" 8 | - "list" 9 | apiGroups: 10 | - "machine.openshift.io" 11 | resources: 12 | - "machines" 13 | - "machinehealthchecks" 14 | clusterRoleRules: 15 | - verbs: 16 | - "get" 17 | - "list" 18 | apiGroups: 19 | - "" 20 | resources: 21 | - "nodes" 22 | customerDataAccess: false 23 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/recommendation.go: -------------------------------------------------------------------------------- 1 | /* 2 | machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert 3 | */ 4 | package machinehealthcheckunterminatedshortcircuitsre 5 | 6 | import ( 7 | "fmt" 8 | 9 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine" 10 | ) 11 | 12 | // machineRecommendations categorizes each machine's individual investigation summary into a recommended course of action 13 | type investigationRecommendations map[recommendedAction][]investigationResult 14 | 15 | func (r investigationRecommendations) addRecommendation(action recommendedAction, object string, notes string) { 16 | recommendation := investigationResult{ 17 | object: object, 18 | notes: notes, 19 | } 20 | r[action] = append(r[action], recommendation) 21 | } 22 | 23 | // summarize prints the machine investigationRecommendations into a human read-able format. 
24 | func (r investigationRecommendations) summarize() string { 25 | msg := "" 26 | for recommendation, investigations := range r { 27 | msg += fmt.Sprintf("%s:\n", recommendation) 28 | 29 | if recommendation == recommendationDeleteMachine { 30 | // Consolidate all machine deletion requests into a single oc command for ease of use 31 | deleteCmd := fmt.Sprintf("oc delete machine -n %s", machine.MachineNamespace) 32 | for _, investigation := range investigations { 33 | msg += fmt.Sprintf("- %s\n", investigation.String()) 34 | deleteCmd += " " + investigation.object 35 | } 36 | msg += fmt.Sprintf("to delete these machines, run:\n\n%s\n", deleteCmd) 37 | } else { 38 | for _, investigation := range investigations { 39 | msg += fmt.Sprintf("- %s\n", investigation.String()) 40 | } 41 | } 42 | 43 | msg += "\n" 44 | } 45 | return msg 46 | } 47 | 48 | type investigationResult struct { 49 | // name indicates which object was investigated 50 | object string 51 | // notes provides a high-level summary of the investigation results 52 | notes string 53 | } 54 | 55 | func (s *investigationResult) String() string { 56 | msg := fmt.Sprintf("%q: %s", s.object, s.notes) 57 | return msg 58 | } 59 | 60 | // recommendedAction acts as both a key in the investigationRecommendations map, as well as a header for pagerduty notes when summarize()-ing 61 | type recommendedAction string 62 | 63 | const ( 64 | // recommendationDeleteMachine indicates that the machine(s) in question should be deleted so the machine-api can reprovision them 65 | recommendationDeleteMachine recommendedAction = "delete the following machines" 66 | // recommendationInvestigateMachine indicates that the machine(s) in question need to be manually investigated 67 | recommendationInvestigateMachine recommendedAction = "investigate the following machines" 68 | // recommendationQuotaServiceLog indicates that the machine(s) in question need to be remediated by the customer, and SRE should notify them 69 | // of that fact 
via servicelog 70 | recommendationQuotaServiceLog recommendedAction = "send a service log regarding quota issues for the following machines" 71 | // recommendationInvestigateNode indicates that the machine's node object is reporting problems which require human intervention to resolve 72 | recommendationInvestigateNode recommendedAction = "investigate the following nodes" 73 | ) 74 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/srep-worker-healthcheck_machinehealthcheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machine.openshift.io/v1beta1 2 | kind: MachineHealthCheck 3 | metadata: 4 | name: srep-worker-healthcheck 5 | namespace: openshift-machine-api 6 | spec: 7 | maxUnhealthy: 0 8 | nodeStartupTimeout: 25m 9 | selector: 10 | matchExpressions: 11 | - key: machine.openshift.io/cluster-api-machine-role 12 | operator: NotIn 13 | values: 14 | - infra 15 | - master 16 | - key: machine.openshift.io/cluster-api-machineset 17 | operator: Exists 18 | - key: machine.openshift.io/instance-type 19 | operator: NotIn 20 | values: 21 | - m5.metal 22 | - m5d.metal 23 | - m5n.metal 24 | - m5dn.metal 25 | - m5zn.metal 26 | - m6a.metal 27 | - m6i.metal 28 | - m6id.metal 29 | - r5.metal 30 | - r5d.metal 31 | - r5n.metal 32 | - r5dn.metal 33 | - r6a.metal 34 | - r6i.metal 35 | - r6id.metal 36 | - x2iezn.metal 37 | - z1d.metal 38 | - c5.metal 39 | - c5d.metal 40 | - c5n.metal 41 | - c6a.metal 42 | - c6i.metal 43 | - c6id.metal 44 | - i3.metal 45 | - i3en.metal 46 | - r7i.48xlarge 47 | unhealthyConditions: 48 | - status: "False" 49 | timeout: 10s 50 | type: Ready 51 | - status: Unknown 52 | timeout: 10s 53 | type: Ready 54 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_pdb.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | name: test-cad 5 | namespace: default 6 | spec: 7 | maxUnavailable: 0 8 | selector: 9 | matchLabels: 10 | app: "test-cad" 11 | unhealthyPodEvictionPolicy: AlwaysAllow 12 | -------------------------------------------------------------------------------- /pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: "test-cad" 6 | name: test-cad 7 | namespace: default 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: "test-cad" 13 | template: 14 | metadata: 15 | labels: 16 | app: "test-cad" 17 | spec: 18 | affinity: 19 | nodeAffinity: 20 | preferredDuringSchedulingIgnoredDuringExecution: 21 | - preference: 22 | matchExpressions: 23 | - key: node-role.kubernetes.io/worker 24 | operator: Exists 25 | weight: 1 26 | containers: 27 | - command: 28 | - "sleep" 29 | - "infinity" 30 | image: "quay.io/app-sre/ubi8-ubi:latest" 31 | imagePullPolicy: IfNotPresent 32 | name: test 33 | restartPolicy: Always 34 | -------------------------------------------------------------------------------- /pkg/investigations/pruningcronjoberror/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: pruningcronjoberror 2 | rbac: 3 | roles: [] 4 | clusterRoleRules: 5 | - verbs: 6 | - "get" 7 | - "list" 8 | apiGroups: 9 | - "config.openshift.io" 10 | resources: 11 | - clusteroperators 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - pods 16 | - namespaces 17 | verbs: 18 | - get 19 | - list 20 | customerDataAccess: false 21 | -------------------------------------------------------------------------------- /pkg/investigations/registry.go: 
package investigations

import (
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/apierrorbudgetburn"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cpd"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/insightsoperatordown"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre"
	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/upgradeconfigsyncfailureover4hr"
)

// availableInvestigations holds all Investigation implementations.
// Order matters: GetInvestigation returns the first entry whose
// ShouldInvestigateAlert matches the alert title.
var availableInvestigations = []investigation.Investigation{
	&apierrorbudgetburn.Investigation{},
	&ccam.Investigation{},
	// NOTE(review): "Investiation" mirrors the type name as declared in the
	// chgm package; fixing the spelling requires renaming it there as well.
	&chgm.Investiation{},
	&clustermonitoringerrorbudgetburn.Investigation{},
	&cpd.Investigation{},
	&insightsoperatordown.Investigation{},
	&upgradeconfigsyncfailureover4hr.Investigation{},
	&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
	&cannotretrieveupdatessre.Investigation{},
}

// GetInvestigation returns the first Investigation that applies to the given alert title.
// Experimental investigations are only considered when experimental is true.
// Returns nil when no investigation matches.
// This is a naive version that only returns the first matching investigation and ignores the rest.
// Future improvement is to use the proper mapping that can return multiple investigations
// linked to single alert type.
func GetInvestigation(title string, experimental bool) investigation.Investigation {
	for _, inv := range availableInvestigations {
		if inv.ShouldInvestigateAlert(title) && (experimental || !inv.IsExperimental()) {
			return inv
		}
	}
	return nil
}
Edit this value in a text editor and change the value after the colon. Leave the preceeding value before the colon as it is. 21 | 7. Do the encryption process detailed above backwards. First you'll need to encrypt your new pull-secret.dockerconfigjson.registry.connect.redhat.com.auth value (the one we just changed). Simply echo it on your command line and pipe it into base64. Place the whole value in single quotes to avoid any text parsing issues. 22 | 23 | `echo $changed_value | base64` 24 | 8. Replace that value in the registry.connect.redhat.com.auth value in your decrypted .dockerconfigjson you saved in step 4 then base64 encrypt the whole thing. Take that encrypted value and replace the encrypted .dockerconfigjson value in your broken_pull_secret.json file. 25 | 9. Apply the newly broken pull-secret json file to your cluster using oc apply. 26 | 27 | `oc apply -f broken_pull_secret.json --as backplane-cluster-admin` 28 | 10. Re run your test according to the CAD readme. This should return a warning in the logs `⚠️ Pull secret does not match on cluster and in OCM` and apply the same message to the pagerduty incident. 
-------------------------------------------------------------------------------- /pkg/investigations/upgradeconfigsyncfailureover4hr/metadata.yaml: -------------------------------------------------------------------------------- 1 | name: upgradeconfigsyncfailureover4hr 2 | rbac: 3 | roles: 4 | - namespace: "openshift-config" 5 | rules: 6 | - verbs: 7 | - "get" 8 | apiGroups: 9 | - "" 10 | resources: 11 | - "secrets" 12 | resourceNames: 13 | - "pull-secret" 14 | customerDataAccess: false 15 | -------------------------------------------------------------------------------- /pkg/investigations/upgradeconfigsyncfailureover4hr/upgradeconfigsyncfailureover4hr.go: -------------------------------------------------------------------------------- 1 | // Package upgradeconfigsyncfailureover4hr contains functionality for the UpgradeConfigSyncFailureOver4HrSRE investigation 2 | package upgradeconfigsyncfailureover4hr 3 | 4 | import ( 5 | "context" 6 | "encoding/base64" 7 | "errors" 8 | "fmt" 9 | "strings" 10 | 11 | v1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" 12 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" 13 | k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" 14 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 15 | "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" 16 | ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 17 | corev1 "k8s.io/api/core/v1" 18 | "k8s.io/apimachinery/pkg/types" 19 | "sigs.k8s.io/controller-runtime/pkg/client" 20 | ) 21 | 22 | type Investigation struct{} 23 | 24 | const ( 25 | alertname = "UpgradeConfigSyncFailureOver4HrSRE" 26 | remediationName = "upgradeconfigsyncfailureover4hr" 27 | ) 28 | 29 | func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { 30 | result := investigation.InvestigationResult{} 31 | notes := notewriter.New("UpgradeConfigSyncFailureOver4Hr", 
logging.RawLogger) 32 | k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) 33 | if err != nil { 34 | return result, fmt.Errorf("unable to initialize k8s cli: %w", err) 35 | } 36 | defer func() { 37 | deferErr := k8scli.Clean() 38 | if deferErr != nil { 39 | logging.Error(deferErr) 40 | err = errors.Join(err, deferErr) 41 | } 42 | }() 43 | logging.Infof("Checking if user is Banned.") 44 | userBannedStatus, userBannedNotes, err := ocm.CheckIfUserBanned(r.OcmClient, r.Cluster) 45 | if err != nil { 46 | notes.AppendWarning("encountered an issue when checking if the cluster owner is banned: %s\nPlease investigate.", err) 47 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 48 | } 49 | if userBannedStatus { 50 | notes.AppendWarning(userBannedNotes) 51 | } else { 52 | notes.AppendSuccess("User is not banned.") 53 | } 54 | user, err := ocm.GetCreatorFromCluster(r.OcmClient.GetConnection(), r.Cluster) 55 | logging.Infof("User ID is: %v", user.ID()) 56 | clusterSecretToken, note, err := getClusterPullSecret(k8scli) 57 | if err != nil { 58 | notes.AppendWarning("Failre getting ClusterSecret: %s", err) 59 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 60 | } 61 | if note != "" { 62 | notes.AppendWarning(note) 63 | } 64 | registryCredential, err := ocm.GetOCMPullSecret(r.OcmClient.GetConnection(), user.ID()) 65 | if err != nil { 66 | notes.AppendWarning("Error getting OCMPullSecret: %s", err) 67 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 68 | } 69 | if clusterSecretToken == registryCredential { 70 | notes.AppendSuccess("Pull Secret matches on cluster and in OCM. 
Please continue investigation.") 71 | } else { 72 | notes.AppendWarning("Pull secret does not match on cluster and in OCM.") 73 | } 74 | return result, r.PdClient.EscalateIncidentWithNote(notes.String()) 75 | } 76 | 77 | func getClusterPullSecret(k8scli client.Client) (secretToken string, note string, err error) { 78 | secret := &corev1.Secret{} 79 | err = k8scli.Get(context.TODO(), types.NamespacedName{ 80 | Namespace: "openshift-config", 81 | Name: "pull-secret", 82 | }, secret) 83 | if err != nil { 84 | return "", "", err 85 | } 86 | if secret.Data == nil { 87 | return "", "Cluster pull secret Data is empty.", err 88 | } 89 | secretValue, exists := secret.Data[".dockerconfigjson"] 90 | if !exists { 91 | return "", "Cluster pull secret does not contain the necessary .dockerconfigjson", err 92 | } 93 | 94 | dockerConfigJson, err := v1.UnmarshalAccessToken(secretValue) 95 | if err != nil { 96 | return "", "", err 97 | } 98 | _, exists = dockerConfigJson.Auths()["cloud.openshift.com"] 99 | if !exists { 100 | return "", "cloud.openshift.com value not found in clusterPullSecret. 
This means there is an issue with the pull secret on the cluster.", err 101 | } 102 | 103 | value, err := base64.StdEncoding.DecodeString(dockerConfigJson.Auths()["registry.connect.redhat.com"].Auth()) 104 | if err != nil { 105 | return "", "", err 106 | } 107 | _, splitValue, _ := strings.Cut(string(value), ":") 108 | return splitValue, "", nil 109 | } 110 | 111 | func (c *Investigation) Name() string { 112 | return "UpgradeConfigSyncFailureOver4hr" 113 | } 114 | 115 | func (c *Investigation) Description() string { 116 | return "Investigates the UpgradeConfigSyncFailureOver4hr alert" 117 | } 118 | 119 | func (c *Investigation) ShouldInvestigateAlert(alert string) bool { 120 | return strings.Contains(alert, "UpgradeConfigSyncFailureOver4HrSRE") 121 | } 122 | 123 | func (c *Investigation) IsExperimental() bool { 124 | return false 125 | } 126 | -------------------------------------------------------------------------------- /pkg/investigations/upgradeconfigsyncfailureover4hr/upgradeconfigsyncfailureover4hr_test.go: -------------------------------------------------------------------------------- 1 | package upgradeconfigsyncfailureover4hr 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 10 | ) 11 | 12 | func TestGetClusterPullSecret(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | data string 16 | secretToken string 17 | expectError bool 18 | expectedNote string 19 | }{ 20 | { 21 | name: "happy path", 22 | data: 
"{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"cloud.openshift.com\":{\"auth\":\"TestAuthValue\",\"email\":\"test_fake_email@redhat.com\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"fake-email@redhat.com\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"test_fake_email@redhat.com\"}}}", 23 | secretToken: "CorrectValue\n", 24 | expectError: false, 25 | }, 26 | { 27 | name: "Value mismatch", 28 | data: "{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"cloud.openshift.com\":{\"auth\":\"TestAuthValue\",\"email\":\"test_fake_email@redhat.com\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"fake-email@redhat.com\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"test_fake_email@redhat.com\"}}}", 29 | secretToken: "IncorrectValue\n", 30 | expectError: true, 31 | }, 32 | { 33 | name: "No entry for cloud.openshift.com", 34 | data: 
"{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"MissingValue\":{\"auth\":\"TestAuthValue\",\"email\":\"test_fake_email@redhat.com\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"fake-email@redhat.com\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"test_fake_email@redhat.com\"}}}", 35 | secretToken: "IncorrectValue\n", 36 | expectError: true, 37 | expectedNote: "cloud.openshift.com value not found in clusterPullSecret", 38 | }, 39 | } 40 | 41 | for _, tt := range tests { 42 | t.Run(tt.name, func(t *testing.T) { 43 | secretTest := &corev1.Secret{ 44 | ObjectMeta: v1.ObjectMeta{ 45 | Name: "pull-secret", 46 | Namespace: "openshift-config", 47 | }, 48 | Type: corev1.DockerConfigJsonKey, 49 | Data: map[string][]byte{ 50 | ".dockerconfigjson": []byte(tt.data), 51 | }, 52 | } 53 | k8scli := fake.NewClientBuilder().WithObjects(secretTest).Build() 54 | result, note, _ := getClusterPullSecret(k8scli) 55 | if result != tt.secretToken { 56 | if !strings.Contains(note, tt.expectedNote) { 57 | t.Errorf("Expected note message: %s. 
Got %s", tt.expectedNote, note) 58 | } 59 | if !tt.expectError { 60 | t.Errorf("expected token %s to match %s", result, tt.secretToken) 61 | } 62 | } 63 | }) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /pkg/investigations/utils/machine/machine.go: -------------------------------------------------------------------------------- 1 | package machine 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | machinev1beta1 "github.com/openshift/api/machine/v1beta1" 8 | "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node" 9 | corev1 "k8s.io/api/core/v1" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | "k8s.io/apimachinery/pkg/types" 12 | "sigs.k8s.io/controller-runtime/pkg/client" 13 | ) 14 | 15 | const ( 16 | MachineNamespace = "openshift-machine-api" 17 | RoleLabelKey = "machine.openshift.io/cluster-api-machine-role" 18 | WorkerRoleLabelValue = "worker" 19 | ) 20 | 21 | // HealthcheckRemediationAllowed searches the status conditions for the machinehealthcheck object and determines if remediation is allowed 22 | func HealthcheckRemediationAllowed(healthcheck machinev1beta1.MachineHealthCheck) bool { 23 | for _, condition := range healthcheck.Status.Conditions { 24 | if condition.Type == machinev1beta1.RemediationAllowedCondition && condition.Status == corev1.ConditionTrue { 25 | // Only rule out that the mhc is failing if we can both find the condition and determine its current status 26 | return true 27 | } 28 | } 29 | return false 30 | } 31 | 32 | // GetMachinesForMHC retrieves the machines managed by the given MachineHealthCheck object 33 | func GetMachinesForMHC(ctx context.Context, kclient client.Client, healthcheck machinev1beta1.MachineHealthCheck) ([]machinev1beta1.Machine, error) { 34 | machines := machinev1beta1.MachineList{} 35 | selector, err := metav1.LabelSelectorAsSelector(&healthcheck.Spec.Selector) 36 | if err != nil { 37 | return []machinev1beta1.Machine{}, 
fmt.Errorf("failed to convert machinehealthcheck %q .spec.selector: %w", healthcheck.Name, err) 38 | } 39 | err = kclient.List(ctx, &machines, client.MatchingLabelsSelector{Selector: selector}, &client.ListOptions{Namespace: MachineNamespace}) 40 | if err != nil { 41 | return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machines from machinehealthcheck %q: %w", healthcheck.Name, err) 42 | } 43 | return machines.Items, nil 44 | } 45 | 46 | // GetMachineRole returns the role of the given machine, if present. If not found, an error is returned 47 | func GetRole(machine machinev1beta1.Machine) (string, error) { 48 | role, found := machine.Labels[RoleLabelKey] 49 | if !found { 50 | return "", fmt.Errorf("expected label key %q not found", RoleLabelKey) 51 | } 52 | return role, nil 53 | } 54 | 55 | // GetNodesForMachines retrieves the nodes for the given machines. Errors encountered are joined, but do not block the retrieval of other machines 56 | func GetNodesForMachines(ctx context.Context, kclient client.Client, machines []machinev1beta1.Machine) ([]corev1.Node, error) { 57 | // Retrieving all nodes initially & filtering out irrelevant objects results in fewer API calls 58 | nodes, err := node.GetAll(ctx, kclient) 59 | if err != nil { 60 | return []corev1.Node{}, fmt.Errorf("failed to retrieve nodes: %w", err) 61 | } 62 | 63 | matches := []corev1.Node{} 64 | for _, machine := range machines { 65 | node, found := findMatchingNode(machine, nodes) 66 | if found { 67 | matches = append(matches, node) 68 | } 69 | } 70 | return matches, nil 71 | } 72 | 73 | // findMatchingNode retrieves the node owned by the provided machine, if one exists, along with a boolean indicating whether 74 | // the search succeeded 75 | func findMatchingNode(machine machinev1beta1.Machine, nodes []corev1.Node) (corev1.Node, bool) { 76 | if machine.Status.NodeRef == nil || machine.Status.NodeRef.Name == "" { 77 | return corev1.Node{}, false 78 | } 79 | for _, node := range nodes { 80 | 
if machine.Status.NodeRef.Name == node.Name { 81 | return node, true 82 | } 83 | } 84 | 85 | return corev1.Node{}, false 86 | } 87 | 88 | // GetNodeForMachine retrieves the node for the given machine. If the provided machine's .Status.NodeRef is empty, 89 | // an error is returned 90 | func GetNodeForMachine(ctx context.Context, kclient client.Client, machine machinev1beta1.Machine) (corev1.Node, error) { 91 | if machine.Status.NodeRef == nil || machine.Status.NodeRef.Name == "" { 92 | return corev1.Node{}, fmt.Errorf("no .Status.NodeRef defined for machine %q", machine.Name) 93 | } 94 | node := &corev1.Node{} 95 | err := kclient.Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name}, node) 96 | return *node, err 97 | } 98 | -------------------------------------------------------------------------------- /pkg/investigations/utils/node/node.go: -------------------------------------------------------------------------------- 1 | /* 2 | node defines investigation utility logic related to node objects 3 | */ 4 | package node 5 | 6 | import ( 7 | "context" 8 | "strings" 9 | 10 | corev1 "k8s.io/api/core/v1" 11 | "sigs.k8s.io/controller-runtime/pkg/client" 12 | ) 13 | 14 | const ( 15 | RoleLabelPrefix = "node-role.kubernetes.io" 16 | WorkerRoleSuffix = "worker" 17 | ) 18 | 19 | // FindNoScheduleTaint searches the node's taints to find one with effect: NoSchedule, if present. 
20 | // 21 | // If none is present, an empty taint and 'false' are returned 22 | func FindNoScheduleTaint(node corev1.Node) (corev1.Taint, bool) { 23 | for _, taint := range node.Spec.Taints { 24 | if taint.Effect == corev1.TaintEffectNoSchedule { 25 | return taint, true 26 | } 27 | } 28 | return corev1.Taint{}, false 29 | } 30 | 31 | // GetAll retrieves all nodes present in the cluster 32 | func GetAll(ctx context.Context, kclient client.Client) ([]corev1.Node, error) { 33 | nodes := corev1.NodeList{} 34 | err := kclient.List(ctx, &nodes) 35 | return nodes.Items, err 36 | } 37 | 38 | // FindReadyCondition searches a node's .Status for the NodeReady condition, and returns it alongside a boolean value which 39 | // indicates whether the condition was found or not 40 | func FindReadyCondition(node corev1.Node) (corev1.NodeCondition, bool) { 41 | for _, condition := range node.Status.Conditions { 42 | if condition.Type == corev1.NodeReady { 43 | return condition, true 44 | } 45 | } 46 | return corev1.NodeCondition{}, false 47 | } 48 | 49 | // GetRole returns the first label key on the provided node matching RoleLabelPrefix, and whether one was found 50 | func GetRole(node corev1.Node) (string, bool) { 51 | for label := range node.Labels { 52 | if strings.Contains(label, RoleLabelPrefix) { 53 | return label, true 54 | } 55 | } 56 | return "", false 57 | } 58 | -------------------------------------------------------------------------------- /pkg/k8s/client.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/openshift/backplane-cli/pkg/cli/config" 9 | bpremediation "github.com/openshift/backplane-cli/pkg/remediation" 10 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 11 | "k8s.io/client-go/rest" 12 | "sigs.k8s.io/controller-runtime/pkg/client" 13 | ) 14 | 15 | type Cleaner interface { 16 | Clean() error 17 | } 18 | 19 | type Client interface { 20 | client.Client 21 | Cleaner 22 | } 23 | 
24 | type clientImpl struct { 25 | client.Client 26 | Cleaner 27 | } 28 | 29 | // New returns a Kubernetes client for the given cluster scoped to a given remediation's permissions. 30 | func New(clusterID string, ocmClient ocm.Client, remediationName string) (kclient Client, err error) { 31 | cfg, err := NewCfg(clusterID, ocmClient, remediationName) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | cfgToClean := cfg 37 | defer func() { 38 | if cfgToClean != nil { 39 | deferErr := cfgToClean.Clean() 40 | if deferErr != nil { 41 | err = errors.Join(err, deferErr) 42 | } 43 | } 44 | }() 45 | 46 | scheme, err := initScheme() 47 | if err != nil { 48 | return nil, err 49 | } 50 | 51 | decoratedClient, err := client.New(&cfg.Config, client.Options{Scheme: scheme}) 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | cfgToClean = nil 57 | return clientImpl{decoratedClient, cfg}, nil 58 | } 59 | 60 | type Config struct { 61 | rest.Config 62 | Cleaner 63 | } 64 | 65 | type remediationCleaner struct { 66 | clusterID string 67 | ocmClient ocm.Client 68 | remediationInstanceId string 69 | } 70 | 71 | func (cleaner remediationCleaner) Clean() error { 72 | return deleteRemediation(cleaner.clusterID, cleaner.ocmClient, cleaner.remediationInstanceId) 73 | } 74 | 75 | // New returns a the k8s rest config for the given cluster scoped to a given remediation's permissions. 
76 | func NewCfg(clusterID string, ocmClient ocm.Client, remediationName string) (cfg *Config, err error) { 77 | backplaneURL := os.Getenv("BACKPLANE_URL") 78 | if backplaneURL == "" { 79 | return nil, fmt.Errorf("could not create new k8sclient: missing environment variable BACKPLANE_URL") 80 | } 81 | 82 | decoratedCfg, remediationInstanceId, err := bpremediation.CreateRemediationWithConn( 83 | config.BackplaneConfiguration{URL: backplaneURL}, 84 | ocmClient.GetConnection(), 85 | clusterID, 86 | remediationName, 87 | ) 88 | if err != nil { 89 | if isAPIServerUnavailable(err) { 90 | return nil, fmt.Errorf("%w: %w", ErrAPIServerUnavailable, err) 91 | } 92 | return nil, err 93 | } 94 | 95 | return &Config{*decoratedCfg, remediationCleaner{clusterID, ocmClient, remediationInstanceId}}, nil 96 | } 97 | 98 | // Cleanup removes the remediation created for the cluster. 99 | func deleteRemediation(clusterID string, ocmClient ocm.Client, remediationInstanceId string) error { 100 | backplaneURL := os.Getenv("BACKPLANE_URL") 101 | if backplaneURL == "" { 102 | return fmt.Errorf("could not clean up k8sclient: missing environment variable BACKPLANE_URL") 103 | } 104 | 105 | return bpremediation.DeleteRemediationWithConn( 106 | config.BackplaneConfiguration{URL: backplaneURL}, 107 | ocmClient.GetConnection(), 108 | clusterID, 109 | remediationInstanceId, 110 | ) 111 | } 112 | -------------------------------------------------------------------------------- /pkg/k8s/errors.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | ) 7 | 8 | var ErrAPIServerUnavailable = errors.New("kubernetes API server unavailable") 9 | 10 | // isAPIServerUnavailable detects common symptoms of an unreachable API server. 
11 | func isAPIServerUnavailable(err error) bool { 12 | errStr := err.Error() 13 | return strings.Contains(errStr, "The cluster could be down or under heavy load") 14 | } 15 | -------------------------------------------------------------------------------- /pkg/k8s/errors_test.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | ) 7 | 8 | func TestIsAPIServerUnavailable(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | err error 12 | expected bool 13 | }{ 14 | { 15 | name: "Cluster down message present", 16 | err: errors.New(`Error: Internal error occurred: failed calling webhook "namespace.operator.tekton.dev": failed to call webhook: Post "https://tekton-operator-proxy-webhook.openshift-pipelines.svc:443/namespace-validation?timeout=10s": context deadline exceeded 17 | The cluster could be down or under heavy load 18 | `), 19 | expected: true, 20 | }, 21 | { 22 | name: "Unrelated error message", 23 | err: errors.New("some other error occurred"), 24 | expected: false, 25 | }, 26 | } 27 | 28 | for _, tt := range tests { 29 | t.Run(tt.name, func(t *testing.T) { 30 | if tt.err == nil && isAPIServerUnavailable(tt.err) { 31 | t.Errorf("Expected false for nil error, but got true") 32 | } else if tt.err != nil && isAPIServerUnavailable(tt.err) != tt.expected { 33 | t.Errorf("For test '%s', expected %v, got %v", tt.name, tt.expected, !tt.expected) 34 | } 35 | }) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pkg/k8s/scheme.go: -------------------------------------------------------------------------------- 1 | package k8sclient 2 | 3 | import ( 4 | "fmt" 5 | 6 | configv1 "github.com/openshift/api/config/v1" 7 | machinev1beta1 "github.com/openshift/api/machine/v1beta1" 8 | corev1 "k8s.io/api/core/v1" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | ) 11 | 12 | // initScheme initializes the runtime scheme with 
required APIs. 13 | func initScheme() (*runtime.Scheme, error) { 14 | scheme := runtime.NewScheme() 15 | 16 | if err := corev1.AddToScheme(scheme); err != nil { 17 | return nil, fmt.Errorf("unable to add corev1 scheme: %w", err) 18 | } 19 | 20 | if err := configv1.Install(scheme); err != nil { 21 | return nil, fmt.Errorf("unable to add config.openshift.io/v1 scheme: %w", err) 22 | } 23 | 24 | if err := machinev1beta1.AddToScheme(scheme); err != nil { 25 | return nil, fmt.Errorf("unable to add machine.openshift.io/v1beta1 scheme: %w", err) 26 | } 27 | 28 | return scheme, nil 29 | } 30 | -------------------------------------------------------------------------------- /pkg/logging/logging.go: -------------------------------------------------------------------------------- 1 | // Package logging wraps the zap logging package to provide easier access and initialization of the logger 2 | package logging 3 | 4 | import ( 5 | "fmt" 6 | "log" 7 | "os" 8 | 9 | "go.uber.org/zap" 10 | "go.uber.org/zap/zapcore" 11 | ) 12 | 13 | var LogLevelString = getLogLevel() 14 | 15 | // RawLogger is the raw global logger object used for calls wrapped by the logging package 16 | var RawLogger = InitLogger(LogLevelString, "") 17 | 18 | // InitLogger initializes a cluster-id specific child logger 19 | func InitLogger(logLevelString string, clusterID string) *zap.SugaredLogger { 20 | logLevel, err := zap.ParseAtomicLevel(logLevelString) 21 | if err != nil { 22 | log.Fatalln("Invalid log level:", logLevelString) 23 | } 24 | 25 | pipelineName := os.Getenv("PIPELINE_NAME") 26 | if pipelineName == "" { 27 | fmt.Println("Warning: Unable to retrieve the pipeline ID on logger creation. 
Continuing with empty value.") 28 | } 29 | 30 | config := zap.NewProductionConfig() 31 | config.EncoderConfig.TimeKey = "timestamp" 32 | config.Level = logLevel 33 | config.EncoderConfig.EncodeTime = zapcore.RFC3339TimeEncoder 34 | config.EncoderConfig.StacktraceKey = "" // to hide stacktrace info 35 | config.EncoderConfig.CallerKey = "caller" 36 | 37 | logger, err := config.Build() 38 | if err != nil { 39 | log.Fatal(err) 40 | } 41 | 42 | logger = logger.With(zap.Field{Key: "cluster_id", Type: zapcore.StringType, String: clusterID}, 43 | zap.Field{Key: "pipeline_name", Type: zapcore.StringType, String: pipelineName}) 44 | 45 | return logger.Sugar() 46 | } 47 | 48 | // Info wraps zap's SugaredLogger.Info() 49 | func Info(args ...interface{}) { 50 | RawLogger.Info(args...) 51 | } 52 | 53 | // Debug wraps zap's SugaredLogger.Debug() 54 | func Debug(args ...interface{}) { 55 | RawLogger.Debug(args...) 56 | } 57 | 58 | // Warn wraps zap's SugaredLogger.Warn() 59 | func Warn(args ...interface{}) { 60 | RawLogger.Warn(args...) 61 | } 62 | 63 | // Error wraps zap's SugaredLogger.Error() 64 | func Error(args ...interface{}) { 65 | RawLogger.Error(args...) 66 | } 67 | 68 | // Fatal wraps zap's SugaredLogger.Fatal() 69 | func Fatal(args ...interface{}) { 70 | RawLogger.Fatal(args...) 71 | } 72 | 73 | // Infof wraps zap's SugaredLogger.Infof() 74 | func Infof(template string, args ...interface{}) { 75 | RawLogger.Infof(template, args...) 76 | } 77 | 78 | // Debugf wraps zap's SugaredLogger.Debugf() 79 | func Debugf(template string, args ...interface{}) { 80 | RawLogger.Debugf(template, args...) 81 | } 82 | 83 | // Warnf wraps zap's SugaredLogger.Warnf() 84 | func Warnf(template string, args ...interface{}) { 85 | RawLogger.Warnf(template, args...) 86 | } 87 | 88 | // Errorf wraps zap's SugaredLogger.Errorf() 89 | func Errorf(template string, args ...interface{}) { 90 | RawLogger.Errorf(template, args...) 
91 | } 92 | 93 | // Fatalf wraps zap's SugaredLogger.Fatalf() 94 | func Fatalf(template string, args ...interface{}) { 95 | RawLogger.Fatalf(template, args...) 96 | } 97 | 98 | // getLogLevel returns the log level from the environment variable LOG_LEVEL 99 | func getLogLevel() string { 100 | if envLogLevel, exists := os.LookupEnv("LOG_LEVEL"); exists { 101 | return envLogLevel 102 | } 103 | return "info" 104 | } 105 | -------------------------------------------------------------------------------- /pkg/managedcloud/managedcloud.go: -------------------------------------------------------------------------------- 1 | // Package managedcloud contains functionality to access cloud environments of managed clusters 2 | package managedcloud 3 | 4 | import ( 5 | "fmt" 6 | "net/http" 7 | "net/url" 8 | "os" 9 | 10 | cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 11 | bpcloud "github.com/openshift/backplane-cli/cmd/ocm-backplane/cloud" 12 | "github.com/openshift/backplane-cli/pkg/cli/config" 13 | "github.com/openshift/configuration-anomaly-detection/pkg/aws" 14 | ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 15 | ) 16 | 17 | // CreateCustomerAWSClient creates an aws.SdkClient to a cluster's AWS account 18 | func CreateCustomerAWSClient(cluster *cmv1.Cluster, ocmClient ocm.Client) (*aws.SdkClient, error) { 19 | backplaneURL := os.Getenv("BACKPLANE_URL") 20 | if backplaneURL == "" { 21 | return nil, fmt.Errorf("could not create new aws client: missing environment variable BACKPLANE_URL") 22 | } 23 | 24 | backplaneInitialARN := os.Getenv("BACKPLANE_INITIAL_ARN") 25 | if backplaneInitialARN == "" { 26 | return nil, fmt.Errorf("missing environment variable BACKPLANE_INITIAL_ARN") 27 | } 28 | 29 | backplaneProxy := os.Getenv("BACKPLANE_PROXY") 30 | 31 | queryConfig := &bpcloud.QueryConfig{OcmConnection: ocmClient.GetConnection(), BackplaneConfiguration: config.BackplaneConfiguration{URL: backplaneURL, AssumeInitialArn: backplaneInitialARN}, 
Cluster: cluster} 32 | if backplaneProxy != "" { 33 | queryConfig.ProxyURL = &backplaneProxy 34 | } 35 | 36 | config, err := queryConfig.GetAWSV2Config() 37 | if err != nil { 38 | return nil, fmt.Errorf("unable to query aws credentials from backplane: %w", err) 39 | } 40 | 41 | awsProxy := os.Getenv("AWS_PROXY") 42 | if awsProxy != "" { 43 | config.HTTPClient = &http.Client{ 44 | Transport: &http.Transport{ 45 | Proxy: func(*http.Request) (*url.URL, error) { 46 | return url.Parse(awsProxy) 47 | }, 48 | }, 49 | } 50 | } 51 | 52 | return aws.NewClient(config) 53 | } 54 | -------------------------------------------------------------------------------- /pkg/metrics/README.md: -------------------------------------------------------------------------------- 1 | # Metrics 2 | 3 | This package provides metric instrumentation. 4 | 5 | You can test metrics locally by spawning a aggregation pushgateway container and pushing metrics there. 6 | 7 | ```bash 8 | # Spawn local gateway 9 | podman run --name cad-pushgw -e PAG_APILISTEN=:9091 -e PAG_LIFECYCLELISTEN=:9092 -p 9091:9091 -p 9092:9092 -d ghcr.io/zapier/prom-aggregation-gateway:v0.7.0 10 | # Verify you can reach the gateway (expect empty answer until you pushed metrics) 11 | curl http://localhost:9091/metrics 12 | # Point cad to the gateway 13 | export CAD_PROMETHEUS_PUSHGATEWAY="localhost:9091" 14 | # Run cad locally (it is not relevant for cad to succeed to test the metrics) 15 | ./cadctl investigate --payload-path payload.json 16 | # Verify your metrics got pushed and are available on the gateway 17 | curl http://localhost:9091/metrics 18 | ``` 19 | -------------------------------------------------------------------------------- /pkg/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | // Package metrics provides prometheus instrumentation for CAD 2 | package metrics 3 | 4 | import ( 5 | "os" 6 | 7 | "github.com/openshift/configuration-anomaly-detection/pkg/logging" 8 
| "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/push" 10 | "github.com/prometheus/common/expfmt" 11 | ) 12 | 13 | // Push collects and pushes metrics to the configured pushgateway 14 | func Push() { 15 | var promPusher *push.Pusher 16 | if pushgateway := os.Getenv("CAD_PROMETHEUS_PUSHGATEWAY"); pushgateway != "" { 17 | promPusher = push.New(pushgateway, "cad").Format(expfmt.NewFormat(expfmt.TypeTextPlain)) 18 | promPusher.Collector(Alerts) 19 | promPusher.Collector(LimitedSupportSet) 20 | promPusher.Collector(ServicelogPrepared) 21 | promPusher.Collector(ServicelogSent) 22 | err := promPusher.Add() 23 | if err != nil { 24 | logging.Errorf("failed to push metrics: %v", err) // %v, not %w: zap's Errorf is Sprintf-style; %w is only valid in fmt.Errorf 25 | } 26 | } else { 27 | logging.Warn("metrics disabled, set env 'CAD_PROMETHEUS_PUSHGATEWAY' to push metrics") 28 | } 29 | } 30 | 31 | // Inc takes a counterVec and a set of label values and increases by one 32 | func Inc(counterVec *prometheus.CounterVec, lsv ...string) { 33 | metric, err := counterVec.GetMetricWithLabelValues(lsv...) 
34 | if err != nil { 35 | logging.Error(err) 36 | } 37 | metric.Inc() 38 | } 39 | 40 | const ( 41 | namespace = "cad" 42 | subsystemInvestigate = "investigate" 43 | alertTypeLabel = "alert_type" 44 | lsSummaryLabel = "ls_summary" 45 | ) 46 | 47 | var ( 48 | // Alerts is a metric counting all alerts CAD received 49 | Alerts = prometheus.NewCounterVec( 50 | prometheus.CounterOpts{ 51 | Namespace: namespace, Subsystem: subsystemInvestigate, 52 | Name: "alerts_total", 53 | Help: "counts investigated alerts by alert and event type", 54 | }, []string{alertTypeLabel}) 55 | // LimitedSupportSet is a counter for limited support reasons set by cad 56 | LimitedSupportSet = prometheus.NewCounterVec( 57 | prometheus.CounterOpts{ 58 | Namespace: namespace, Subsystem: subsystemInvestigate, 59 | Name: "limitedsupport_set_total", 60 | Help: "counts investigations resulting in setting a limited support reason", 61 | }, []string{alertTypeLabel, lsSummaryLabel}) 62 | // ServicelogPrepared is a counter for investigation ending in a prepared servicelog 63 | ServicelogPrepared = prometheus.NewCounterVec( 64 | prometheus.CounterOpts{ 65 | Namespace: namespace, Subsystem: subsystemInvestigate, 66 | Name: "servicelog_prepared_total", 67 | Help: "counts investigations resulting in a prepared servicelog attached to the incident notes", 68 | }, []string{alertTypeLabel}) 69 | // ServicelogSent is a counter for investigation ending in a sent servicelog 70 | ServicelogSent = prometheus.NewCounterVec( 71 | prometheus.CounterOpts{ 72 | Namespace: namespace, Subsystem: subsystemInvestigate, 73 | Name: "servicelog_sent_total", 74 | Help: "counts investigations resulting in a sent servicelog", 75 | }, []string{alertTypeLabel}) 76 | ) 77 | -------------------------------------------------------------------------------- /pkg/networkverifier/networkverifier_suite_test.go: -------------------------------------------------------------------------------- 1 | package networkverifier_test 2 | 3 | import ( 4 | 
"testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestPagerduty(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Network verifier Suite") 13 | } 14 | -------------------------------------------------------------------------------- /pkg/networkverifier/networkverifier_test.go: -------------------------------------------------------------------------------- 1 | package networkverifier_test 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | . "github.com/onsi/ginkgo/v2" 8 | . "github.com/onsi/gomega" 9 | v1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 10 | awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock" 11 | "github.com/openshift/configuration-anomaly-detection/pkg/networkverifier" 12 | hivev1 "github.com/openshift/hive/apis/hive/v1" 13 | "go.uber.org/mock/gomock" 14 | ) 15 | 16 | var _ = Describe("RunVerifier", func() { 17 | Describe("AreAllInstancesRunning", func() { 18 | var ( 19 | mockCtrl *gomock.Controller 20 | clusterBuilder *v1.ClusterBuilder 21 | clusterDeployment *hivev1.ClusterDeployment 22 | awsCli *awsmock.MockClient 23 | ) 24 | BeforeEach(func() { 25 | mockCtrl = gomock.NewController(GinkgoT()) 26 | 27 | awsCli = awsmock.NewMockClient(mockCtrl) 28 | 29 | region := v1.NewCloudRegion().ID("us-east-1") 30 | 31 | clusterBuilder = v1.NewCluster().ID("12345").Nodes(v1.NewClusterNodes().Total(1)).Region(region) 32 | 33 | clusterDeployment = &hivev1.ClusterDeployment{ 34 | Spec: hivev1.ClusterDeploymentSpec{ 35 | ClusterMetadata: &hivev1.ClusterMetadata{ 36 | InfraID: "infra_id", 37 | }, 38 | }, 39 | } 40 | }) 41 | AfterEach(func() { 42 | mockCtrl.Finish() 43 | }) 44 | // This test is pretty useless but illustrates what tests for networkverifier should look like 45 | When("Getting security group ids", func() { 46 | It("Should return the error failed to get SecurityGroupId", func() { 47 | // Finish setup 48 | cluster, err := clusterBuilder.Build() 49 | 50 | 
Expect(err).ToNot(HaveOccurred()) 51 | 52 | // Arrange 53 | expectedError := errors.New("failed to get SecurityGroupId: errormessage") 54 | 55 | awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("", expectedError) 56 | 57 | // Act 58 | result, failures, gotErr := networkverifier.Run(cluster, clusterDeployment, awsCli) 59 | fmt.Printf("result %v, failures %v", result, failures) 60 | 61 | // Assert 62 | Expect(gotErr).To(HaveOccurred()) 63 | Expect(gotErr.Error()).To(ContainSubstring(expectedError.Error())) 64 | }) 65 | }) 66 | 67 | When("Checking input passed to ONV", func() { 68 | It("Should forward the cluster KMS key", func() { 69 | // Finish setup 70 | kmsKey := "some-KMS-key-ARN" 71 | clusterBuilder.AWS(v1.NewAWS().KMSKeyArn(kmsKey)) 72 | 73 | cluster, err := clusterBuilder.Build() 74 | 75 | Expect(err).ToNot(HaveOccurred()) 76 | 77 | // Arrange 78 | awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return(gomock.Any().String(), nil) 79 | awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"string1", "string2"}, nil) 80 | 81 | // Act 82 | input, gotErr := networkverifier.InitializeValidateEgressInput(cluster, clusterDeployment, awsCli) 83 | fmt.Printf("input %v", input) 84 | 85 | // Assert 86 | Expect(gotErr).ToNot(HaveOccurred()) 87 | Expect(input.AWS.KmsKeyID).To(BeIdenticalTo(kmsKey)) 88 | }) 89 | }) 90 | }) 91 | }) 92 | -------------------------------------------------------------------------------- /pkg/notewriter/notewriter.go: -------------------------------------------------------------------------------- 1 | package notewriter 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "go.uber.org/zap" 8 | ) 9 | 10 | type NoteWriter struct { 11 | investigationName string 12 | sb strings.Builder 13 | logger *zap.SugaredLogger 14 | } 15 | 16 | // New initializes a new NoteWriter with an optional logger. 
// The note is initialized with an investigation header in the following format:
// 🤖 Automated %s pre-investigation 🤖
// ===========================
//
// E.g.
// 🤖 Automated CHGM pre-investigation 🤖
// ===========================
//
// A nil logger is valid: the note is still built, only the info-level logging
// of each appended line is skipped (see writeWithLog).
func New(investigationName string, logger *zap.SugaredLogger) *NoteWriter {
	nw := &NoteWriter{investigationName, strings.Builder{}, logger}
	nw.sb.WriteString(fmt.Sprintf("🤖 Automated %s pre-investigation 🤖\n", investigationName))
	nw.sb.WriteString("===========================\n")
	return nw
}

// String returns the current full string format of the built note,
// i.e. the header written by New plus every line appended so far.
func (n *NoteWriter) String() string {
	return n.sb.String()
}

// writeWithLog appends the formatted message to the note and, when a logger
// was supplied to New, also logs the same message at info level.
func (n *NoteWriter) writeWithLog(format string, a ...any) {
	if n.logger != nil {
		n.logger.Infof(format, a...)
	}

	n.sb.WriteString(fmt.Sprintf(format, a...))
}

// AppendSuccess should be used when a CAD check succeeded, e.g.
// ✅ Network Verifier Passed
// Format appended to the note:
// ✅ <formatted message>\n
func (n *NoteWriter) AppendSuccess(format string, a ...any) {
	n.writeWithLog("✅ %s\n", fmt.Sprintf(format, a...))
}

// AppendWarning should be used when a CAD check showed an issue, e.g.
// ⚠️ Network Verifier Failed with the following errors: error1, error2, error3
// Format appended to the note:
// ⚠️ <formatted message>\n
func (n *NoteWriter) AppendWarning(format string, a ...any) {
	n.writeWithLog("⚠️ %s\n", fmt.Sprintf(format, a...))
}

// AppendAutomation should be used to indicate CAD took an automated action, e.g.
61 | // 🤖 Sent service log: "This is the service log message" 62 | // Format appended to the note: 63 | // 🤖 \n 64 | func (n *NoteWriter) AppendAutomation(format string, a ...any) { 65 | n.writeWithLog("🤖 %s\n", fmt.Sprintf(format, a...)) 66 | } 67 | -------------------------------------------------------------------------------- /pkg/notewriter/notewriter_test.go: -------------------------------------------------------------------------------- 1 | package notewriter 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | testInvestigationName = "CHGM" 9 | 10 | expectedOutput = `🤖 Automated CHGM pre-investigation 🤖 11 | =========================== 12 | ✅ Network Verifier Succeeded: 123 13 | ⚠️ Network Verifier Failed: 123 14 | 🤖 Sent servicelog for network misconfiguration: 123 15 | ` 16 | ) 17 | 18 | func TestNoteWriter(t *testing.T) { 19 | notesWriter := New(testInvestigationName, nil) 20 | notesWriter.AppendSuccess("Network Verifier Succeeded: 123") 21 | notesWriter.AppendWarning("Network Verifier Failed: 123") 22 | notesWriter.AppendAutomation("Sent servicelog for network misconfiguration: 123") 23 | 24 | res := notesWriter.String() 25 | 26 | if res != expectedOutput { 27 | t.Fatalf("NoteWriter output does not match expected test output.\n NoteWriter output:\n%s\n\n Expected output:\n%s", res, expectedOutput) 28 | } 29 | } 30 | 31 | func TestNoteWriterFormat(t *testing.T) { 32 | notesWriter := New(testInvestigationName, nil) 33 | notesWriter.AppendSuccess("Network Verifier Succeeded: %s", "123") 34 | notesWriter.AppendWarning("Network Verifier Failed: %s", "123") 35 | notesWriter.AppendAutomation("Sent servicelog for network misconfiguration: %s", "123") 36 | 37 | res := notesWriter.String() 38 | 39 | if res != expectedOutput { 40 | t.Fatalf("NoteWriter output does not match expected test output.\n NoteWriter output:\n%s\n\n Expected output:\n%s", res, expectedOutput) 41 | } 42 | } 43 | 
-------------------------------------------------------------------------------- /pkg/ocm/ocm_config.go: -------------------------------------------------------------------------------- 1 | package ocm 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | 9 | sdk "github.com/openshift-online/ocm-sdk-go" 10 | ) 11 | 12 | // Config is the type used to store the configuration of the client. 13 | // There's no way to line-split or predefine tags, so... 14 | // 15 | //nolint:lll 16 | type Config struct { 17 | // TODO(efried): Better docs for things like AccessToken 18 | // TODO(efried): Dedup with flag docs in cmd/ocm/login/cmd.go:init where possible 19 | AccessToken string `json:"access_token,omitempty" doc:"Bearer access token."` 20 | ClientID string `json:"client_id,omitempty" doc:"OpenID client identifier."` 21 | ClientSecret string `json:"client_secret,omitempty" doc:"OpenID client secret."` 22 | Insecure bool `json:"insecure,omitempty" doc:"Enables insecure communication with the server. This disables verification of TLS certificates and host names."` 23 | Password string `json:"password,omitempty" doc:"User password."` 24 | RefreshToken string `json:"refresh_token,omitempty" doc:"Offline or refresh token."` 25 | Scopes []string `json:"scopes,omitempty" doc:"OpenID scope. If this option is used it will replace completely the default scopes. Can be repeated multiple times to specify multiple scopes."` 26 | TokenURL string `json:"token_url,omitempty" doc:"OpenID token URL."` 27 | URL string `json:"url,omitempty" doc:"URL of the API gateway. The value can be the complete URL or an alias. The valid aliases are 'production', 'staging' and 'integration'."` 28 | User string `json:"user,omitempty" doc:"User name."` 29 | Pager string `json:"pager,omitempty" doc:"Pager command, for example 'less'. If empty no pager will be used."` 30 | } 31 | 32 | // Load loads the configuration from the configuration file. 
If the configuration file doesn't exist
// it will return an empty configuration object.
func Load() (*Config, error) {
	file, err := Location()
	if err != nil {
		return nil, err
	}
	_, err = os.Stat(file)
	if os.IsNotExist(err) {
		// Missing file is not an error per the documented contract above:
		// return an empty configuration and a nil error. (The previous naked
		// return leaked the non-nil os.Stat not-exist error to callers.)
		return &Config{}, nil
	}
	if err != nil {
		return nil, fmt.Errorf("can't check if config file '%s' exists: %w", file, err)
	}
	// #nosec G304
	data, err := os.ReadFile(file)
	if err != nil {
		return nil, fmt.Errorf("can't read config file '%s': %w", file, err)
	}
	cfg := &Config{}
	// An empty file is treated as an empty configuration, not a parse error.
	if len(data) == 0 {
		return cfg, nil
	}
	if err := json.Unmarshal(data, cfg); err != nil {
		return nil, fmt.Errorf("can't parse config file '%s': %w", file, err)
	}
	return cfg, nil
}

// Location returns the location of the configuration file. If a configuration file
// already exists in the HOME directory, it uses that, otherwise it prefers to
// use the XDG config directory. The OCM_CONFIG environment variable overrides both.
func Location() (string, error) {
	if ocmconfig := os.Getenv("OCM_CONFIG"); ocmconfig != "" {
		return ocmconfig, nil
	}

	// Determine home directory to use for the legacy file path
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	path := filepath.Join(home, ".ocm.json")

	if _, err := os.Stat(path); os.IsNotExist(err) {
		// Legacy file absent: fall back to the standard config directory
		configDir, err := os.UserConfigDir()
		if err != nil {
			return path, err
		}

		// Use standard config directory
		path = filepath.Join(configDir, "/ocm/ocm.json")
	}

	return path, nil
}

// Connection creates a connection using this configuration.
98 | func (c *Config) Connection() (connection *sdk.Connection, err error) { 99 | // Prepare the builder for the connection adding only the properties that have explicit 100 | // values in the configuration, so that default values won't be overridden: 101 | builder := sdk.NewConnectionBuilder() 102 | if c.TokenURL != "" { 103 | builder.TokenURL(c.TokenURL) 104 | } 105 | if c.ClientID != "" || c.ClientSecret != "" { 106 | builder.Client(c.ClientID, c.ClientSecret) 107 | } 108 | if c.Scopes != nil { 109 | builder.Scopes(c.Scopes...) 110 | } 111 | if c.URL != "" { 112 | builder.URL(c.URL) 113 | } 114 | if c.User != "" || c.Password != "" { 115 | builder.User(c.User, c.Password) 116 | } 117 | tokens := make([]string, 0, 2) 118 | if c.AccessToken != "" { 119 | tokens = append(tokens, c.AccessToken) 120 | } 121 | if c.RefreshToken != "" { 122 | tokens = append(tokens, c.RefreshToken) 123 | } 124 | if len(tokens) > 0 { 125 | builder.Tokens(tokens...) 126 | } 127 | builder.Insecure(c.Insecure) 128 | 129 | // Create the connection: 130 | connection, err = builder.Build() 131 | if err != nil { 132 | return 133 | } 134 | 135 | return 136 | } 137 | -------------------------------------------------------------------------------- /pkg/pagerduty/errors.go: -------------------------------------------------------------------------------- 1 | package pagerduty 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | // InvalidTokenError wraps the PagerDuty token invalid error 9 | type InvalidTokenError struct { 10 | Err error 11 | } 12 | 13 | // Error prints the wrapped error and the original one 14 | func (i InvalidTokenError) Error() string { 15 | err := fmt.Errorf("the authToken that was provided is invalid: %w", i.Err) 16 | return err.Error() 17 | } 18 | 19 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 20 | func (InvalidTokenError) Is(target error) bool { 21 | return errors.Is(target, InvalidTokenError{}) 22 | } 23 | 
24 | // InvalidInputParamsError wraps the PagerDuty Invalid parameters error 25 | // TODO: the API also returns any other error in here, if this persists, think on renaming to "ClientMisconfiguration" 26 | type InvalidInputParamsError struct { 27 | Err error 28 | } 29 | 30 | // Error prints the wrapped error and the original one 31 | func (i InvalidInputParamsError) Error() string { 32 | err := fmt.Errorf("the escalation policy or incident id are invalid: %w", i.Err) 33 | return err.Error() 34 | } 35 | 36 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 37 | func (InvalidInputParamsError) Is(target error) bool { 38 | return errors.Is(target, InvalidInputParamsError{}) 39 | } 40 | 41 | // IncidentNotFoundError wraps the PagerDuty not found error while adding notes to an incident 42 | type IncidentNotFoundError struct { 43 | Err error 44 | } 45 | 46 | // Error prints the wrapped error and the original one 47 | func (i IncidentNotFoundError) Error() string { 48 | err := fmt.Errorf("the given incident was not found: %w", i.Err) 49 | return err.Error() 50 | } 51 | 52 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 53 | func (IncidentNotFoundError) Is(target error) bool { 54 | return errors.Is(target, IncidentNotFoundError{}) 55 | } 56 | 57 | // ServiceNotFoundError wraps the errors returned when PagerDuty services cannot be retrieved 58 | type ServiceNotFoundError struct { 59 | Err error 60 | } 61 | 62 | // Error prints the wrapped and original error 63 | func (s ServiceNotFoundError) Error() string { 64 | err := fmt.Errorf("the given service was not found: %w", s.Err) 65 | return err.Error() 66 | } 67 | 68 | // Is indicates whether the supplied error is a ServiceNotFoundError 69 | func (ServiceNotFoundError) Is(target error) bool { 70 | return errors.Is(target, ServiceNotFoundError{}) 71 | } 72 | 73 | // IntegrationNotFoundError wraps the 
errors returned when a PagerDuty service's integration cannot be found 74 | type IntegrationNotFoundError struct { 75 | Err error 76 | } 77 | 78 | // Error prints the wrapped and original error 79 | func (i IntegrationNotFoundError) Error() string { 80 | err := fmt.Errorf("the given integration was not found: %w", i.Err) 81 | return err.Error() 82 | } 83 | 84 | // Is indicates whether the supplied error is an IntegrationNotFoundError 85 | func (IntegrationNotFoundError) Is(target error) bool { 86 | return errors.Is(target, IntegrationNotFoundError{}) 87 | } 88 | 89 | // CreateEventError wraps the errors returned when failing to create a PagerDuty event 90 | type CreateEventError struct { 91 | Err error 92 | } 93 | 94 | // Error prints the wrapped and original error 95 | func (c CreateEventError) Error() string { 96 | err := fmt.Errorf("failed to create event: %w", c.Err) 97 | return err.Error() 98 | } 99 | 100 | // Is indicates whether the supplied error is a CreateEventError 101 | func (CreateEventError) Is(target error) bool { 102 | return errors.Is(target, CreateEventError{}) 103 | } 104 | 105 | // FileNotFoundError wraps the filesystem NotFound Error 106 | type FileNotFoundError struct { 107 | Err error 108 | FilePath string 109 | } 110 | 111 | // Error prints the wrapped error and the original one 112 | func (f FileNotFoundError) Error() string { 113 | err := fmt.Errorf("the file '%s' was not found in the filesystem: %w", f.FilePath, f.Err) 114 | return err.Error() 115 | } 116 | 117 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 118 | func (f FileNotFoundError) Is(target error) bool { 119 | return errors.Is(target, FileNotFoundError{}) 120 | } 121 | 122 | // UnmarshalError wraps JSON's json.SyntaxError 123 | type UnmarshalError struct { 124 | Err error 125 | } 126 | 127 | // Error prints the wrapped error and the original one 128 | func (u UnmarshalError) Error() string { 129 | err := 
fmt.Errorf("could not unmarshal the payloadFile: %w", u.Err) 130 | return err.Error() 131 | } 132 | 133 | // Is ignores the internal error, thus making errors.Is work (as by default it compares the internal objects) 134 | func (u UnmarshalError) Is(target error) bool { 135 | return errors.Is(target, UnmarshalError{}) 136 | } 137 | -------------------------------------------------------------------------------- /pkg/pagerduty/mock/pagerdutymock.go: -------------------------------------------------------------------------------- 1 | // Code generated by MockGen. DO NOT EDIT. 2 | // Source: pagerduty.go 3 | // 4 | // Generated by this command: 5 | // 6 | // mockgen --build_flags=--mod=readonly -source pagerduty.go -destination ./mock/pagerdutymock.go -package pdmock 7 | // 8 | 9 | // Package pdmock is a generated GoMock package. 10 | package pdmock 11 | 12 | import ( 13 | reflect "reflect" 14 | 15 | gomock "go.uber.org/mock/gomock" 16 | ) 17 | 18 | // MockClient is a mock of Client interface. 19 | type MockClient struct { 20 | ctrl *gomock.Controller 21 | recorder *MockClientMockRecorder 22 | isgomock struct{} 23 | } 24 | 25 | // MockClientMockRecorder is the mock recorder for MockClient. 26 | type MockClientMockRecorder struct { 27 | mock *MockClient 28 | } 29 | 30 | // NewMockClient creates a new mock instance. 31 | func NewMockClient(ctrl *gomock.Controller) *MockClient { 32 | mock := &MockClient{ctrl: ctrl} 33 | mock.recorder = &MockClientMockRecorder{mock} 34 | return mock 35 | } 36 | 37 | // EXPECT returns an object that allows the caller to indicate expected use. 38 | func (m *MockClient) EXPECT() *MockClientMockRecorder { 39 | return m.recorder 40 | } 41 | 42 | // AddNote mocks base method. 43 | func (m *MockClient) AddNote(notes string) error { 44 | m.ctrl.T.Helper() 45 | ret := m.ctrl.Call(m, "AddNote", notes) 46 | ret0, _ := ret[0].(error) 47 | return ret0 48 | } 49 | 50 | // AddNote indicates an expected call of AddNote. 
51 | func (mr *MockClientMockRecorder) AddNote(notes any) *gomock.Call { 52 | mr.mock.ctrl.T.Helper() 53 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddNote", reflect.TypeOf((*MockClient)(nil).AddNote), notes) 54 | } 55 | 56 | // EscalateIncident mocks base method. 57 | func (m *MockClient) EscalateIncident() error { 58 | m.ctrl.T.Helper() 59 | ret := m.ctrl.Call(m, "EscalateIncident") 60 | ret0, _ := ret[0].(error) 61 | return ret0 62 | } 63 | 64 | // EscalateIncident indicates an expected call of EscalateIncident. 65 | func (mr *MockClientMockRecorder) EscalateIncident() *gomock.Call { 66 | mr.mock.ctrl.T.Helper() 67 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EscalateIncident", reflect.TypeOf((*MockClient)(nil).EscalateIncident)) 68 | } 69 | 70 | // EscalateIncidentWithNote mocks base method. 71 | func (m *MockClient) EscalateIncidentWithNote(notes string) error { 72 | m.ctrl.T.Helper() 73 | ret := m.ctrl.Call(m, "EscalateIncidentWithNote", notes) 74 | ret0, _ := ret[0].(error) 75 | return ret0 76 | } 77 | 78 | // EscalateIncidentWithNote indicates an expected call of EscalateIncidentWithNote. 79 | func (mr *MockClientMockRecorder) EscalateIncidentWithNote(notes any) *gomock.Call { 80 | mr.mock.ctrl.T.Helper() 81 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EscalateIncidentWithNote", reflect.TypeOf((*MockClient)(nil).EscalateIncidentWithNote), notes) 82 | } 83 | 84 | // GetServiceID mocks base method. 85 | func (m *MockClient) GetServiceID() string { 86 | m.ctrl.T.Helper() 87 | ret := m.ctrl.Call(m, "GetServiceID") 88 | ret0, _ := ret[0].(string) 89 | return ret0 90 | } 91 | 92 | // GetServiceID indicates an expected call of GetServiceID. 93 | func (mr *MockClientMockRecorder) GetServiceID() *gomock.Call { 94 | mr.mock.ctrl.T.Helper() 95 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServiceID", reflect.TypeOf((*MockClient)(nil).GetServiceID)) 96 | } 97 | 98 | // SilenceIncident mocks base method. 
99 | func (m *MockClient) SilenceIncident() error { 100 | m.ctrl.T.Helper() 101 | ret := m.ctrl.Call(m, "SilenceIncident") 102 | ret0, _ := ret[0].(error) 103 | return ret0 104 | } 105 | 106 | // SilenceIncident indicates an expected call of SilenceIncident. 107 | func (mr *MockClientMockRecorder) SilenceIncident() *gomock.Call { 108 | mr.mock.ctrl.T.Helper() 109 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SilenceIncident", reflect.TypeOf((*MockClient)(nil).SilenceIncident)) 110 | } 111 | 112 | // SilenceIncidentWithNote mocks base method. 113 | func (m *MockClient) SilenceIncidentWithNote(notes string) error { 114 | m.ctrl.T.Helper() 115 | ret := m.ctrl.Call(m, "SilenceIncidentWithNote", notes) 116 | ret0, _ := ret[0].(error) 117 | return ret0 118 | } 119 | 120 | // SilenceIncidentWithNote indicates an expected call of SilenceIncidentWithNote. 121 | func (mr *MockClientMockRecorder) SilenceIncidentWithNote(notes any) *gomock.Call { 122 | mr.mock.ctrl.T.Helper() 123 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SilenceIncidentWithNote", reflect.TypeOf((*MockClient)(nil).SilenceIncidentWithNote), notes) 124 | } 125 | 126 | // UpdateIncidentTitle mocks base method. 127 | func (m *MockClient) UpdateIncidentTitle(title string) error { 128 | m.ctrl.T.Helper() 129 | ret := m.ctrl.Call(m, "UpdateIncidentTitle", title) 130 | ret0, _ := ret[0].(error) 131 | return ret0 132 | } 133 | 134 | // UpdateIncidentTitle indicates an expected call of UpdateIncidentTitle. 
135 | func (mr *MockClientMockRecorder) UpdateIncidentTitle(title any) *gomock.Call { 136 | mr.mock.ctrl.T.Helper() 137 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateIncidentTitle", reflect.TypeOf((*MockClient)(nil).UpdateIncidentTitle), title) 138 | } 139 | -------------------------------------------------------------------------------- /pkg/pagerduty/pagerduty_suite_test.go: -------------------------------------------------------------------------------- 1 | package pagerduty_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestPagerduty(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Pagerduty Suite") 13 | } 14 | -------------------------------------------------------------------------------- /pkg/pagerduty/types.go: -------------------------------------------------------------------------------- 1 | package pagerduty 2 | 3 | // AlertDetails exposes the required info we need from an alert 4 | type AlertDetails struct { 5 | ID string 6 | ClusterID string // This can be internal or external ID 7 | } 8 | 9 | // NewAlertCustomDetails is a format for the alert details shown in the pagerduty incident 10 | type NewAlertCustomDetails struct { 11 | ClusterID string `json:"Cluster ID"` 12 | Error string `json:"Error"` 13 | Resolution string `json:"Resolution"` 14 | SOP string `json:"SOP"` 15 | } 16 | 17 | // NewAlert is a type for alerts to create on pagerduty 18 | type NewAlert struct { 19 | // The alert description acts as a title for the resulting incident 20 | Description string 21 | Details NewAlertCustomDetails 22 | } 23 | -------------------------------------------------------------------------------- /pkg/utils/utils.go: -------------------------------------------------------------------------------- 1 | // Package utils contains utility functions 2 | package utils 3 | 4 | import ( 5 | "fmt" 6 | "time" 7 | 8 | 
"github.com/openshift/configuration-anomaly-detection/pkg/logging" 9 | ) 10 | 11 | // WithRetries runs a function with up to 10 retries on error 12 | func WithRetries(fn func() error) error { 13 | const defaultRetries = 10 14 | const defaultInitialBackoff = time.Second * 2 15 | 16 | return WithRetriesConfigurable(defaultRetries, defaultInitialBackoff, fn) 17 | } 18 | 19 | // WithRetriesConfigurable runs a function with a configurable retry count and backoff interval on error 20 | func WithRetriesConfigurable(count int, initialBackoff time.Duration, fn func() error) error { 21 | var err error 22 | for i := 0; i < count; i++ { 23 | if i > 0 { 24 | logging.Warnf("Retry %d: %s \n", i, err.Error()) 25 | time.Sleep(initialBackoff) 26 | initialBackoff *= 2 27 | } 28 | err = fn() 29 | if err == nil { 30 | return nil 31 | } 32 | } 33 | return fmt.Errorf("failed after %d retries: %w", count, err) 34 | } 35 | -------------------------------------------------------------------------------- /pkg/utils/utils_suite_test.go: -------------------------------------------------------------------------------- 1 | package utils_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | ) 9 | 10 | func TestChgm(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "utils suite") 13 | } 14 | -------------------------------------------------------------------------------- /test/e2e/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.ci.openshift.org/openshift/release:rhel-8-release-golang-1.23-openshift-4.19 as builder 2 | WORKDIR /go/src/github.com/openshift/configuration-anomaly-detection 3 | COPY . . 
4 | RUN CGO_ENABLED=0 GOFLAGS="-mod=mod" go test ./test/e2e -v -c --tags=osde2e -o /e2e.test 5 | 6 | FROM registry.access.redhat.com/ubi8/ubi-minimal:latest 7 | COPY --from=builder ./e2e.test e2e.test 8 | ENTRYPOINT [ "/e2e.test" ] 9 | -------------------------------------------------------------------------------- /test/e2e/configuration_anomaly_detection_runner_test.go: -------------------------------------------------------------------------------- 1 | //go:build osde2e 2 | // +build osde2e 3 | 4 | package osde2etests 5 | 6 | import ( 7 | "os" 8 | "path/filepath" 9 | "testing" 10 | 11 | . "github.com/onsi/ginkgo/v2" 12 | . "github.com/onsi/gomega" 13 | ) 14 | 15 | const ( 16 | testResultsDirectory = "/test-run-results" 17 | jUnitOutputFilename = "junit-configuration-anomaly-detection.xml" 18 | ) 19 | 20 | // Test entrypoint. osde2e runs this as a test suite on test pod. 21 | func TestConfigurationAnomalyDetection(t *testing.T) { 22 | RegisterFailHandler(Fail) 23 | suiteConfig, reporterConfig := GinkgoConfiguration() 24 | if _, ok := os.LookupEnv("DISABLE_JUNIT_REPORT"); !ok { 25 | reporterConfig.JUnitReport = filepath.Join(testResultsDirectory, jUnitOutputFilename) 26 | } 27 | RunSpecs(t, "Configuration Anomaly Detection", suiteConfig, reporterConfig) 28 | } 29 | -------------------------------------------------------------------------------- /test/e2e/project.mk: -------------------------------------------------------------------------------- 1 | # Project specific values 2 | OPERATOR_NAME?=configuration-anomaly-detection 3 | 4 | E2E_SUITE_IMAGE_REGISTRY?=quay.io 5 | E2E_SUITE_IMAGE_REPOSITORY?=app-sre 6 | E2E_SUITE_IMAGE_NAME?=$(OPERATOR_NAME)-e2e 7 | 8 | REGISTRY_USER?=$(QUAY_USER) 9 | REGISTRY_TOKEN?=$(QUAY_TOKEN) 10 | 11 | ###################### 12 | # Targets used by e2e test suite 13 | ###################### 14 | 15 | # create binary 16 | .PHONY: e2e-suite-build 17 | e2e-suite-build: GOFLAGS_MOD=-mod=mod 18 | e2e-suite-build: GOENV=GOOS=${GOOS} 
GOARCH=${GOARCH} CGO_ENABLED=0 GOFLAGS="${GOFLAGS_MOD}" 19 | e2e-suite-build: 20 | go mod tidy 21 | ${GOENV} go test ./test/e2e -v -c --tags=osde2e -o e2e-suite.test 22 | 23 | # TODO: Push to a known image tag and commit id 24 | # push e2e suite image 25 | # Use current commit as e2e suite image tag 26 | CURRENT_COMMIT=$(shell git rev-parse --short=7 HEAD) 27 | E2E_SUITE_IMAGE_TAG=$(CURRENT_COMMIT) 28 | 29 | .PHONY: e2e-image-build-push 30 | e2e-image-build-push: 31 | ${CONTAINER_ENGINE} build --pull -f test/e2e/Dockerfile -t $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):$(E2E_SUITE_IMAGE_TAG) . 32 | ${CONTAINER_ENGINE} tag $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):$(E2E_SUITE_IMAGE_TAG) $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):latest 33 | ${CONTAINER_ENGINE} push $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):$(E2E_SUITE_IMAGE_TAG) 34 | ${CONTAINER_ENGINE} push $(E2E_SUITE_IMAGE_REGISTRY)/$(E2E_SUITE_IMAGE_REPOSITORY)/$(E2E_SUITE_IMAGE_NAME):latest -------------------------------------------------------------------------------- /test/e2e/test-e2e-suite-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: osde2e-focused-tests 5 | 6 | parameters: 7 | - name: OSDE2E_CONFIGS 8 | required: true 9 | - name: TEST_E2E_SUITE_IMAGE 10 | required: true 11 | - name: OCM_TOKEN 12 | required: true 13 | - name: OCM_CCS 14 | required: false 15 | - name: AWS_ACCESS_KEY_ID 16 | required: false 17 | - name: AWS_SECRET_ACCESS_KEY 18 | required: false 19 | - name: CLOUD_PROVIDER_REGION 20 | required: false 21 | - name: GCP_CREDS_JSON 22 | required: false 23 | - name: JOBID 24 | generate: expression 25 | from: "[0-9a-z]{7}" 26 | - name: IMAGE_TAG 27 | value: '' 28 | required: true 29 | - name: LOG_BUCKET 30 | 
value: 'osde2e-logs' 31 | objects: 32 | - apiVersion: batch/v1 33 | kind: Job 34 | metadata: 35 | name: configuration-anomaly-detection-${IMAGE_TAG}-${JOBID} 36 | spec: 37 | backoffLimit: 0 38 | template: 39 | spec: 40 | restartPolicy: Never 41 | containers: 42 | - name: osde2e 43 | image: quay.io/redhat-services-prod/osde2e-cicada-tenant/osde2e:latest 44 | command: 45 | - /osde2e 46 | args: 47 | - test 48 | - --configs 49 | - ${OSDE2E_CONFIGS} 50 | securityContext: 51 | runAsNonRoot: true 52 | allowPrivilegeEscalation: false 53 | capabilities: 54 | drop: ["ALL"] 55 | seccompProfile: 56 | type: RuntimeDefault 57 | env: 58 | - name: TEST_HARNESSES 59 | value: ${TEST_E2E_SUITE_IMAGE}:${IMAGE_TAG} 60 | - name: OCM_TOKEN 61 | value: ${OCM_TOKEN} 62 | - name: OCM_CCS 63 | value: ${OCM_CCS} 64 | - name: AWS_ACCESS_KEY_ID 65 | value: ${AWS_ACCESS_KEY_ID} 66 | - name: AWS_SECRET_ACCESS_KEY 67 | value: ${AWS_SECRET_ACCESS_KEY} 68 | - name: CLOUD_PROVIDER_REGION 69 | value: ${CLOUD_PROVIDER_REGION} 70 | - name: GCP_CREDS_JSON 71 | value: ${GCP_CREDS_JSON} 72 | - name: LOG_BUCKET 73 | value: ${LOG_BUCKET} -------------------------------------------------------------------------------- /test/e2e/utils/aws.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/aws/aws-sdk-go-v2/service/ec2" 8 | "github.com/aws/aws-sdk-go-v2/service/ec2/types" 9 | ) 10 | 11 | // EC2API interface to make testing easier 12 | type EC2API interface { 13 | RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) 14 | AuthorizeSecurityGroupEgress(ctx context.Context, params *ec2.AuthorizeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupEgressOutput, error) 15 | } 16 | 17 | // EC2ClientWrapper wraps the AWS SDK EC2 client to implement our EC2API 
interface, so tests can substitute a fake client.
type EC2ClientWrapper struct {
	Client *ec2.Client
}

// RevokeSecurityGroupEgress implements EC2API by delegating to the wrapped SDK client.
func (w *EC2ClientWrapper) RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) {
	return w.Client.RevokeSecurityGroupEgress(ctx, params, optFns...)
}

// AuthorizeSecurityGroupEgress implements EC2API by delegating to the wrapped SDK client.
func (w *EC2ClientWrapper) AuthorizeSecurityGroupEgress(ctx context.Context, params *ec2.AuthorizeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupEgressOutput, error) {
	return w.Client.AuthorizeSecurityGroupEgress(ctx, params, optFns...)
}

// NewEC2ClientWrapper creates a new EC2ClientWrapper that implements EC2API.
func NewEC2ClientWrapper(client *ec2.Client) *EC2ClientWrapper {
	return &EC2ClientWrapper{Client: client}
}

// allTrafficPermission builds the IP-permission set matching every protocol
// (-1) and every IPv4 destination (0.0.0.0/0). Shared by BlockEgress and
// RestoreEgress so both always operate on the same rule.
func allTrafficPermission() []types.IpPermission {
	return []types.IpPermission{
		{
			IpProtocol: awsString("-1"), // -1 = all protocols
			IpRanges: []types.IpRange{
				{CidrIp: awsString("0.0.0.0/0")},
			},
		},
	}
}

// BlockEgress revokes all outbound traffic from the security group
func BlockEgress(ctx context.Context, ec2Client EC2API, securityGroupID string) error {
	req := &ec2.RevokeSecurityGroupEgressInput{
		GroupId:       &securityGroupID,
		IpPermissions: allTrafficPermission(),
	}
	if _, err := ec2Client.RevokeSecurityGroupEgress(ctx, req); err != nil {
		return fmt.Errorf("failed to revoke egress: %w", err)
	}
	return nil
}

// RestoreEgress allows all outbound traffic from the security group
func RestoreEgress(ctx context.Context, ec2Client EC2API, securityGroupID string) error {
	req := &ec2.AuthorizeSecurityGroupEgressInput{
		GroupId:       &securityGroupID,
		IpPermissions: allTrafficPermission(),
	}
	if _, err := ec2Client.AuthorizeSecurityGroupEgress(ctx, req); err != nil {
		return fmt.Errorf("failed to restore egress: %w", err)
	}
	return nil
}

// awsString helper function to convert a string to a pointer
func awsString(value string) *string {
	return &value
}
// Alert names understood by GetAlertTitle. Keep this list in sync with the
// alert_mapping table in test/generate_incident.sh.
const (
	AlertClusterHasGoneMissing                         = "ClusterHasGoneMissing"
	AlertClusterProvisioningDelay                      = "ClusterProvisioningDelay"
	AlertClusterMonitoringErrorBudgetBurnSRE           = "ClusterMonitoringErrorBudgetBurnSRE"
	AlertInsightsOperatorDown                          = "InsightsOperatorDown"
	AlertMachineHealthCheckUnterminatedShortCircuitSRE = "MachineHealthCheckUnterminatedShortCircuitSRE"
	AlertApiErrorBudgetBurn                            = "ApiErrorBudgetBurn"
	// Added for parity with test/generate_incident.sh, which already supports
	// these two alerts but the Go helper previously did not.
	AlertCannotRetrieveUpdatesSRE           = "CannotRetrieveUpdatesSRE"
	AlertUpgradeConfigSyncFailureOver4HrSRE = "UpgradeConfigSyncFailureOver4HrSRE"
)

// alertTitles maps an alert name to the incident title the e2e suite fires
// into PagerDuty for that alert. Titles mirror test/generate_incident.sh.
var alertTitles = map[string]string{
	AlertClusterHasGoneMissing:                         "cadtest has gone missing",
	AlertClusterProvisioningDelay:                      "ClusterProvisioningDelay -",
	AlertClusterMonitoringErrorBudgetBurnSRE:           "ClusterMonitoringErrorBudgetBurnSRE Critical (1)",
	AlertInsightsOperatorDown:                          "InsightsOperatorDown",
	AlertMachineHealthCheckUnterminatedShortCircuitSRE: "MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)",
	AlertApiErrorBudgetBurn:                            "api-ErrorBudgetBurn k8sgpt test CRITICAL (1)",
	AlertCannotRetrieveUpdatesSRE:                      "CannotRetrieveUpdatesSRE",
	AlertUpgradeConfigSyncFailureOver4HrSRE:            "UpgradeConfigSyncFailureOver4HrSRE Critical (1)",
}

// GetAlertTitle returns the incident title used when triggering the named
// alert, or an error when the alert name is unknown.
func GetAlertTitle(alertName string) (string, error) {
	if title, ok := alertTitles[alertName]; ok {
		return title, nil
	}
	return "", fmt.Errorf("unknown alert name: %s", alertName)
}
interface { 41 | TriggerIncident(alertName, clusterID string) (string, error) 42 | GetIncidentID(dedupKey string) (string, error) 43 | ResolveIncident(incidentID string) error 44 | } 45 | type client struct { 46 | routingKey string 47 | apiClient *sdk.Client 48 | } 49 | 50 | func NewClient(routingKey string) TestPagerDutyClient { 51 | return &client{ 52 | routingKey: routingKey, 53 | apiClient: sdk.NewClient(routingKey), 54 | } 55 | } 56 | 57 | func (c *client) TriggerIncident(alertName, clusterID string) (string, error) { 58 | summary, err := GetAlertTitle(alertName) 59 | if err != nil { 60 | return "", err 61 | } 62 | event := sdk.V2Event{ 63 | RoutingKey: c.routingKey, 64 | Action: "trigger", 65 | DedupKey: generateUUID(), 66 | Payload: &sdk.V2Payload{ 67 | Summary: summary, 68 | Source: "cad-integration-testing", 69 | Severity: "critical", 70 | Timestamp: time.Now().UTC().Format(time.RFC3339), 71 | Details: map[string]interface{}{ 72 | "alertname": alertName, 73 | "cluster_id": clusterID, 74 | }, 75 | }, 76 | } 77 | resp, err := sdk.ManageEventWithContext(context.Background(), event) 78 | if err != nil { 79 | return "", err 80 | } 81 | return resp.DedupKey, nil 82 | } 83 | 84 | func (c *client) GetIncidentID(dedupKey string) (string, error) { 85 | // Implementation can be added if needed 86 | return "", nil 87 | } 88 | 89 | func (c *client) ResolveIncident(incidentID string) error { 90 | // Implementation can be added if needed 91 | return nil 92 | } 93 | 94 | func generateUUID() string { 95 | b := make([]byte, 16) 96 | _, err := rand.Read(b) 97 | if err != nil { 98 | // Fallback to timestamp-based if crypto/rand fails 99 | return fmt.Sprintf("%d", time.Now().UnixNano()) 100 | } 101 | // Set version (4) and variant bits 102 | b[6] = (b[6] & 0x0f) | 0x40 // Version 4 103 | b[8] = (b[8] & 0x3f) | 0x80 // Variant 10 104 | return fmt.Sprintf("%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:16]) 105 | } 106 | 
-------------------------------------------------------------------------------- /test/e2e/utils/utils.go: -------------------------------------------------------------------------------- 1 | //go:build osde2e 2 | // +build osde2e 3 | 4 | package utils 5 | 6 | import ( 7 | "fmt" 8 | 9 | cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" 10 | servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" 11 | "github.com/openshift/configuration-anomaly-detection/pkg/ocm" 12 | ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm" 13 | ) 14 | 15 | func GetLimitedSupportReasons(ocme2eCli *ocme2e.Client, clusterID string) (*cmv1.LimitedSupportReasonsListResponse, error) { 16 | lsResponse, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).LimitedSupportReasons().List().Send() 17 | 18 | if err != nil { 19 | return nil, fmt.Errorf("failed sending service log: %w", err) 20 | } 21 | return lsResponse, nil 22 | } 23 | 24 | func GetServiceLogs(ocmCli ocm.Client, cluster *cmv1.Cluster) (*servicelogsv1.ClusterLogsUUIDListResponse, error) { 25 | filter := "log_type='cluster-state-updates'" 26 | clusterLogsUUIDListResponse, err := ocmCli.GetServiceLog(cluster, filter) 27 | if err != nil { 28 | return nil, fmt.Errorf("Failed to get service log: %w", err) 29 | } 30 | return clusterLogsUUIDListResponse, nil 31 | } 32 | -------------------------------------------------------------------------------- /test/generate_incident.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Define the mapping of alert names to titles 5 | # Add more mappings as needed: for the standard service, we should not need to go by title but by the `alertname` field instead. 
6 | declare -A alert_mapping=( 7 | ["ClusterHasGoneMissing"]="cadtest has gone missing" 8 | ["ClusterProvisioningDelay"]="ClusterProvisioningDelay -" 9 | ["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)" 10 | ["InsightsOperatorDown"]="InsightsOperatorDown" 11 | ["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)" 12 | ["ApiErrorBudgetBurn"]="api-ErrorBudgetBurn k8sgpt test CRITICAL (1)" 13 | ["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE" 14 | ["UpgradeConfigSyncFailureOver4HrSRE"]="UpgradeConfigSyncFailureOver4HrSRE Critical (1)" 15 | ) 16 | 17 | # Function to print help message 18 | print_help() { 19 | echo "Usage: $0 " 20 | echo -n "Available alert names (comma separated): " 21 | for alert_name in "${!alert_mapping[@]}"; do 22 | echo -n "$alert_name, " 23 | done 24 | echo 25 | } 26 | # Check if the correct number of arguments is provided 27 | if [ "$#" -ne 2 ]; then 28 | print_help 29 | exit 1 30 | fi 31 | 32 | alert_name=$1 33 | cluster_id=$2 34 | time_current=$(date -u +"%Y-%m-%dT%H:%M:%SZ") 35 | 36 | # Check if the alert name is in the mapping 37 | if [ -z "${alert_mapping[$alert_name]}" ]; then 38 | echo "Error: Unknown alert name '$alert_name'" 39 | print_help 40 | exit 1 41 | fi 42 | 43 | alert_title="${alert_mapping[$alert_name]}" 44 | 45 | # Load testing routing key and test service url from vault 46 | export VAULT_ADDR="https://vault.devshift.net" 47 | export VAULT_TOKEN="$(vault login -method=oidc -token-only)" 48 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/cad-testing | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 49 | unset VAULT_ADDR VAULT_TOKEN 50 | echo 51 | 52 | dedup_key=$(uuidgen) 53 | 54 | echo "Creating incident for $alert_name" 55 | response=$(curl --silent --request POST \ 56 | --url https://events.pagerduty.com/v2/enqueue \ 57 | --header 'Accept: 
application/json' \ 58 | --header 'Content-Type: application/json' \ 59 | --data '{ 60 | "payload": { 61 | "summary": "'"${alert_title}"'", 62 | "timestamp": "'"${time_current}"'", 63 | "severity": "critical", 64 | "source": "cad-integration-testing", 65 | "custom_details": { 66 | "alertname": "'"${alert_name}"'", 67 | "cluster_id": "'"${cluster_id}"'" 68 | } 69 | }, 70 | "routing_key": "'"${pd_test_routing_key}"'", 71 | "event_action": "trigger", 72 | "dedup_key": "'"${dedup_key}"'" 73 | }') 74 | 75 | if [[ $response != *"Event processed"* ]]; then 76 | echo "Error: Couldn't create the incident" 77 | exit 1 78 | fi 79 | echo 80 | 81 | # Pagerduty seems to need a short while to create the incident 82 | # Added this as we intermittently fail to get the incident id otherwise 83 | sleep 2 84 | 85 | INCIDENT_ID=$(curl --silent --request GET \ 86 | --url "https://api.pagerduty.com/incidents?incident_key=${dedup_key}" \ 87 | --header 'Accept: application/json' \ 88 | --header "Authorization: Token token=${pd_test_token}" \ 89 | --header 'Content-Type: application/json' | jq -r '.incidents[0].id') 90 | echo $INCIDENT_ID 91 | echo '{"__pd_metadata":{"incident":{"id":"'$INCIDENT_ID'"}}}' > ./payload 92 | echo "Created ./payload" 93 | -------------------------------------------------------------------------------- /test/launch_local_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | #assuming we're launched from inside the configuration-anomaly-detection repository 5 | CAD_REPO_PATH=$(git rev-parse --show-toplevel) 6 | echo "Assuming CAD repository root is ${CAD_REPO_PATH}" 7 | 8 | #check presence of binary, assume the dnf package name is the same 9 | check_presence () { 10 | # $1 - name of the binary 11 | echo -n "Checking presence of $1..." 12 | if ! 
which "$1" 2>/dev/null >/dev/null; then
        echo "Not Found"
        echo "Try 'dnf install $1' on Fedora"
        exit 1 # was 'exit -1': POSIX exit status must be 0-255
    else
        echo "Found"
    fi
}

# Clean up all child processes (tinyproxy, proxytunnel, haproxy, backplane-api)
# whenever this script exits for any reason, including Ctrl+C.
# (The trap is on EXIT, not only SIGINT.)
trap "kill -- -$$" EXIT

check_presence "jq"
check_presence "openssl"
check_presence "tinyproxy"
check_presence "haproxy"
check_presence "proxytunnel"

#loading env vars
. ${CAD_REPO_PATH}/test/set_stage_env.sh

#checking env vars
set +u
if [[ -z "${OCM_BACKPLANE_REPO_PATH}" ]]; then
    echo "Please set OCM_BACKPLANE_REPO_PATH variable to the path of the OCM Backplane code repository"
    exit 1
fi
set -u

if ! [ $(cat ${OCM_BACKPLANE_REPO_PATH}/configs/ocm.json | jq -r .client_id) = "ocm-backplane-staging" ]; then
    echo "OCM Backplane ocm.json (${OCM_BACKPLANE_REPO_PATH}/configs/ocm.json) isn't the ocm-backplane-staging config."
    echo "Please get the config from a backplane pod on a staging backplanes0* cluster (in /ocm inside the pod)"
    echo "and place it in the configs subdirectory of the backplane-api repo."
    exit 1
fi

#checking certificate validity
if ! openssl verify ${OCM_BACKPLANE_REPO_PATH}/localhost.crt; then
    echo "Certificate ${OCM_BACKPLANE_REPO_PATH}/localhost.crt not valid, please run make dev-certs in the OCM Backplane directory as root to generate and trust the localhost certificates"
    exit 1
fi

#creating certificate file for the HAProxy
cat ${OCM_BACKPLANE_REPO_PATH}/localhost.crt ${OCM_BACKPLANE_REPO_PATH}/localhost.key > ${CAD_REPO_PATH}/test/testinfra/localhost.pem

#checking BACKPLANE_PROXY reachability
echo "Checking Proxy reachability"
if ! curl ${BACKPLANE_PROXY} -o /dev/null; then
    echo "Proxy ${BACKPLANE_PROXY} not reachable, check VPN connection"
    exit 1
fi

#run the env
echo "Starting tinyproxy on port 8888"
tinyproxy -d -c ${CAD_REPO_PATH}/test/testinfra/tinyproxy.conf > ${CAD_REPO_PATH}/test/testinfra/tinyproxy.log 2> ${CAD_REPO_PATH}/test/testinfra/tinyproxy.error.log&

echo "Starting proxytunnel on port 8091"
proxytunnel -v -p squid.corp.redhat.com:3128 -d api.stage.backplane.openshift.com:443 -a 8091 > ${CAD_REPO_PATH}/test/testinfra/proxytunnel.log 2> ${CAD_REPO_PATH}/test/testinfra/proxytunnel.error.log &

echo "Starting haproxy on port 8443"
pushd ${CAD_REPO_PATH}/test/testinfra/
haproxy -f haproxy.cfg > ${CAD_REPO_PATH}/test/testinfra/haproxy.log 2> ${CAD_REPO_PATH}/test/testinfra/haproxy.error.log &
popd

echo "Starting backplane-api on port 8001"
pushd $OCM_BACKPLANE_REPO_PATH
GIT_REPO=${CAD_REPO_PATH} make run-local-with-testremediation > ${CAD_REPO_PATH}/test/testinfra/backplan-api.log 2> ${CAD_REPO_PATH}/test/testinfra/backplan-api.error.log &
popd

echo "Environment started.
Check ${CAD_REPO_PATH}/test/testinfra/ directory for logs" 82 | echo "Run cadctl with the following command to test against the local backplane-api for remediations" 83 | echo "" 84 | echo "BACKPLANE_URL=https://localhost:8443 HTTP_PROXY=http://127.0.0.1:8888 HTTPS_PROXY=http://127.0.0.1:8888 BACKPLANE_PROXY=http://127.0.0.1:8888 ./bin/cadctl investigate --payload-path ./payload --log-level debug" 85 | echo "" 86 | echo "Send SIGINT (Ctrl+C) to terminate the local infrastructure" 87 | #keep the script alive until all child processes are cleaned up 88 | wait 89 | -------------------------------------------------------------------------------- /test/set_stage_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export VAULT_ADDR="https://vault.devshift.net" 5 | export VAULT_TOKEN="$(vault login -method=oidc -token-only)" 6 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/backplane/stg | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 7 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/ocm/ocm-cad-staging | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 8 | for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/pd/stg | jq -r ".data.data|to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]"); do export $v; done 9 | unset VAULT_ADDR VAULT_TOKEN 10 | 11 | 12 | PROXY_URL="http://squid.corp.redhat.com:3128" 13 | 14 | export CAD_EXPERIMENTAL_ENABLED=true 15 | export BACKPLANE_PROXY=${PROXY_URL} 16 | export AWS_PROXY=${PROXY_URL} 17 | 18 | set +euo pipefail 19 | -------------------------------------------------------------------------------- /test/testinfra/haproxy.cfg: -------------------------------------------------------------------------------- 1 | global 2 | log stderr format iso local7 3 | defaults 4 | log global 5 | mode 
http 6 | timeout connect 5000ms 7 | timeout client 50000ms 8 | timeout server 50000ms 9 | 10 | frontend https-in 11 | option httplog 12 | bind *:8443 ssl crt ./localhost.pem 13 | redirect scheme https code 301 if !{ ssl_fc } 14 | use_backend local-ocmb if { path_beg /backplane/remediat } 15 | default_backend upstream-ocmb 16 | 17 | backend upstream-ocmb 18 | http-request set-header Host api.stage.backplane.openshift.com 19 | server upstream 127.0.0.1:8091 ssl verify none 20 | 21 | backend local-ocmb 22 | server local 127.0.0.1:8001 ssl verify none 23 | -------------------------------------------------------------------------------- /test/testinfra/tinyproxy.conf: -------------------------------------------------------------------------------- 1 | Port 8888 2 | Listen 127.0.0.1 3 | Timeout 600 4 | DefaultErrorFile "/usr/share/tinyproxy/default.html" 5 | StatFile "/usr/share/tinyproxy/stats.html" 6 | LogLevel Info 7 | upstream http squid.corp.redhat.com:3128 ".com" 8 | upstream none "localhost" 9 | MaxClients 100 10 | Allow 127.0.0.1 11 | Allow ::1 12 | ViaProxyName "tinyproxy" 13 | 14 | --------------------------------------------------------------------------------